This is the 4th version of this notebook. All prior versions are kept for debugging and for their historical value; however, this is the only version of the notebook that should be used.

In [20]:
from pathlib import Path
import pandas as pd
import enchant
from tqdm import tqdm

def print_step(step_number: int, message: str) -> None:
    """Write one numbered progress line.

    Args:
        step_number: Number shown inside the ``[Step N]`` prefix.
        message: Text printed after the prefix.

    The line goes through ``tqdm.write`` instead of ``print`` -- presumably so
    it does not garble any progress bar that is active at the time.
    """
    line = f"[Step {step_number}] {message}"
    tqdm.write(line)


print_step(1, "Cleaning DataFrame")

[Step 1] Cleaning DataFrame


In [21]:
# Spreadsheet that holds the word lists to validate.
# Raw string literal, so each backslash is written exactly once; doubling them
# inside r"..." would embed repeated separators in the path text.
INPUT_PATH = Path(r"C:\Users\joshu\OneDrive - Georgia State University\Dojo\Projects\Work\Ruminations\project1001\data\ocr_dicts.xlsx")
# Number of leading spreadsheet columns that are treated as word columns.
NUM_WORD_COLS = 3
# Language tag handed to enchant.Dict when the dictionary is created.
DICT_LOCALE = "en_US"

print_step(2, "Config Loaded")

[Step 2] Config Loaded


In [22]:
print_step(3, f"Loading spreadsheet from {INPUT_PATH}...")

# Excel extensions are read with read_excel; every other suffix is parsed as CSV.
if INPUT_PATH.suffix.lower() in {".xlsx", ".xls"}:
    df = pd.read_excel(INPUT_PATH)
else:
    df = pd.read_csv(INPUT_PATH)

# Normalize whitespace on object columns.
# Missing values are blanked BEFORE the string cast: casting first turns them
# into text such as "nan"/"None", and mapping "nan" back to "" afterwards would
# also wipe out a cell that genuinely contains the string "nan".
for c in df.columns:
    if pd.api.types.is_object_dtype(df[c]):
        df[c] = df[c].fillna("").astype(str).str.strip()

# Pick the first N columns as word columns
word_cols = list(df.columns[:NUM_WORD_COLS])

# Slicing silently yields fewer columns when the sheet is too narrow, so fail
# here with a clear message instead of further down the notebook.
if len(word_cols) < NUM_WORD_COLS:
    raise ValueError(
        f"Expected at least {NUM_WORD_COLS} columns in {INPUT_PATH}, found {len(df.columns)}"
    )

# Friendly info: list every selected column, however many NUM_WORD_COLS asks for.
column_list = ", ".join(str(name) for name in word_cols)
print_step(4, f"Analyzing columns: {column_list} + Total rows: {len(df)} ")


[Step 3] Loading spreadsheet from C:\Users\joshu\OneDrive - Georgia State University\Dojo\Projects\Work\Ruminations\project1001\data\ocr_dicts.xlsx...
[Step 4] Analyzing columns: Gemini, ChatGPT, Claude + Total rows: 1011 


In [23]:
# Module-level Enchant dictionary for DICT_LOCALE; is_dict_entry() below reads
# this global by name, so it must exist before that function is called.
d = enchant.Dict(DICT_LOCALE)

def is_dict_entry(text: str) -> bool:
    """Report whether a cell value is accepted by the module-level dictionary ``d``.

    The whole trimmed value is checked as one unit, so a multi-word entry
    (e.g. 'Avant garde') passes only if the dictionary accepts it as-is.

    Args:
        text: Cell content. Non-string values are rejected outright.

    Returns:
        True when the trimmed text, its lowercase form, or its capitalized
        form passes ``d.check``; False for non-strings and blank strings.
    """
    candidate = text.strip() if isinstance(text, str) else ""
    if candidate == "":
        return False

    # Casing is treated leniently: try the text as written, then lowercased,
    # then capitalized, stopping at the first form the dictionary accepts.
    spellings = (candidate, candidate.lower(), candidate.capitalize())
    return any(d.check(form) for form in spellings)

# Progress marker for this cell; reports the locale the dictionary was built for.
print_step(5, f"Dictionary initialized for locale: {DICT_LOCALE}")


[Step 5] Dictionary initialized for locale: en_US


In [24]:
print_step(6, "Validating & deduplicating words...")

# Accumulators filled by the scan below:
#   seen_lower   - lowercased forms already kept (case-insensitive dedupe)
#   unique_words - kept entries, in first-seen order and original casing
#   invalid_rows - audit records (RowIndex, Column, Value) for rejected cells
seen_lower = set()
unique_words = []
invalid_rows = []

for col in tqdm(word_cols, desc="Columns", unit="col"):
    cells = df[col].fillna("").astype(str)
    cell_bar = tqdm(cells.items(), total=len(cells), desc=f"{col}", unit="cells", leave=False)
    for row_label, cell_text in cell_bar:
        word = cell_text.strip()
        if not word:
            # Blank cells are neither kept nor flagged.
            continue

        if not is_dict_entry(word):
            invalid_rows.append({"RowIndex": row_label, "Column": col, "Value": word})
            continue

        folded = word.lower()
        if folded in seen_lower:
            continue
        seen_lower.add(folded)
        unique_words.append(word)

print_step(7, f"Kept {len(unique_words)} unique dictionary entries. Flagged {len(invalid_rows)} invalid entries.")

[Step 6] Validating & deduplicating words...


Columns: 100%|██████████| 3/3 [00:00<00:00,  3.29col/s]

[Step 7] Kept 982 unique dictionary entries. Flagged 94 invalid entries.





In [25]:
print_step(8, "Saving outputs next to the input file...")

# All three artifacts are written into the directory that holds the input file.
out_dir = INPUT_PATH.parent
one_per_row_csv = out_dir / "unique_words.csv"
one_row_csv = out_dir / "unique_dictionary_words.csv"
invalid_xlsx = out_dir / "invalid_words.xlsx"

# 1) Primary artifact: one word per row under a "Word" header
unique_words_df = pd.DataFrame({"Word": unique_words})
unique_words_df.to_csv(one_per_row_csv, index=False)

# 2) Bonus format: every word on a single header-less CSV row
pd.DataFrame([unique_words]).to_csv(one_row_csv, index=False, header=False)

# 3) Invalids audit sheet. Passing the column names explicitly means an empty
#    list still produces a header-only workbook, so it's obvious the script ran.
pd.DataFrame(invalid_rows, columns=["RowIndex", "Column", "Value"]).to_excel(invalid_xlsx, index=False)

print_step(8, "Saved:")
for saved_path in (one_per_row_csv, one_row_csv, invalid_xlsx):
    tqdm.write(f"       • {saved_path}")
print_step(9, "Processing complete.")


[Step 8] Saving outputs next to the input file...
[Step 8] Saved:
       • C:\Users\joshu\OneDrive - Georgia State University\Dojo\Projects\Work\Ruminations\project1001\data\unique_words.csv
       • C:\Users\joshu\OneDrive - Georgia State University\Dojo\Projects\Work\Ruminations\project1001\data\unique_dictionary_words.csv
       • C:\Users\joshu\OneDrive - Georgia State University\Dojo\Projects\Work\Ruminations\project1001\data\invalid_words.xlsx
[Step 9] Processing complete.
