# In [None]:
# save as dedupe_latest.py
from pathlib import Path
import pandas as pd
from fuzzywuzzy import fuzz

# Set your folders here
# DATA_DIR: folder searched for input CSV files; the most recently modified
# one is the file that gets processed.
DATA_DIR = "/Users/laksh/Desktop/applications_data"
# OUTPUT_DIR: folder the duplicate reports are written to (created if absent).
OUTPUT_DIR = "/Users/laksh/Desktop/applications_data/dupe_reports"
# THRESHOLD: fuzzy name-match score a pair must strictly exceed to be reported.
THRESHOLD = 90

def latest_csv(dir_path: str) -> Path:
    """Return the most recently modified ``*.csv`` file directly in *dir_path*.

    Only the folder itself is searched (no recursion); recency is judged by
    each file's modification time.

    Raises:
        FileNotFoundError: if the folder contains no CSV files.
    """
    folder = Path(dir_path)
    newest = max(
        folder.glob("*.csv"),
        key=lambda candidate: candidate.stat().st_mtime,
        default=None,
    )
    if newest is None:
        raise FileNotFoundError(f"No CSV files found in {folder.resolve()}")
    return newest

def find_name_duplicates(df, threshold=90):
    """Find pairs of rows whose ``Name`` values are fuzzy matches.

    Every pair of rows is scored with ``fuzz.token_sort_ratio``; each pair
    scoring strictly above *threshold* is printed (names, then score) and
    collected.

    Rows whose name is missing or blank are skipped: two blank names compare
    as identical, so without this guard every pair of nameless rows would be
    reported as a perfect match.

    Args:
        df: DataFrame containing a ``"Name"`` column.
        threshold: Score a pair must exceed (exclusive) to count as a match.

    Returns:
        List of ``(i, j)`` tuples of positional row indices, with ``i < j``.

    Raises:
        KeyError: if *df* has no ``"Name"`` column.
    """
    raw_names = df["Name"]
    # Coerce to text so numeric-looking cells can still be scored, and trim
    # so whitespace-only cells are recognised as blank.
    names = [str(value).strip() for value in raw_names.fillna("")]
    total = len(names)
    matches = []
    for i, name in enumerate(names):
        if not name:
            continue
        # Index the tail directly instead of copying a slice per outer row.
        for j in range(i + 1, total):
            other = names[j]
            if not other:
                continue
            score = fuzz.token_sort_ratio(name, other)
            if score > threshold:
                print((raw_names.iat[i], raw_names.iat[j]))
                print(score)
                matches.append((i, j))
    return matches

def main():
    """Report exact and fuzzy-name duplicates in the newest CSV of DATA_DIR.

    Steps:
      1. Pick the most recently modified CSV in ``DATA_DIR`` and load it.
      2. Standardize ``Email`` (lower-cased, trimmed) and ``Phone Number``
         (digits only) when those columns exist.
      3. Flag rows that are identical on Email, Phone Number, Age and Gender.
      4. Flag pairs of rows whose ``Name`` values fuzzy-match above
         ``THRESHOLD``.
      5. Write both reports as CSV files into ``OUTPUT_DIR``.

    A check whose required columns are missing is skipped with a message
    rather than aborting the run.
    """
    src = latest_csv(DATA_DIR)
    print(f"using file: {src}")

    # Load the contact columns as text. Left to type inference, a phone column
    # with any blank cell is parsed as float, and its text form ("9876543210.0")
    # would gain a spurious trailing zero once non-digits are stripped.
    df2 = pd.read_csv(src, dtype={"Email": str, "Phone Number": str})

    # standardize each contact column on its own; missing values stay missing
    # instead of turning into the literal text "nan"
    if "Email" in df2.columns:
        df2["Email"] = df2["Email"].str.lower().str.strip()
    if "Phone Number" in df2.columns:
        df2["Phone Number"] = df2["Phone Number"].str.replace(r"\D", "", regex=True)

    needed = ["Email", "Phone Number", "Age", "Gender"]
    missing = [c for c in needed if c not in df2.columns]
    if missing:
        print(f"missing columns: {missing}  skipping exact duplicate check.")
        exact_dupes = pd.DataFrame()
    else:
        # keep=False marks every member of a duplicate group, not just repeats
        exact_dupes = df2[df2.duplicated(subset=needed, keep=False)]

    if "Name" in df2.columns:
        name_dupes = find_name_duplicates(df2, threshold=THRESHOLD)
    else:
        print("missing columns: ['Name']  skipping name duplicate check.")
        name_dupes = []

    print(f"Exact duplicates: {len(exact_dupes)}")
    print(f"Potential name duplicates: {len(name_dupes)}")

    # make sure output folder exists
    out_dir = Path(OUTPUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)

    # write exact dupes report to the new location
    out_exact = out_dir / f"{Path(src).stem}_dupes_report.csv"
    exact_dupes.to_csv(out_exact, index=False)
    print(f"saved exact dupes to: {out_exact}")

    # write fuzzy name matches as a simple table too
    if name_dupes:
        rows = []
        for i, j in name_dupes:
            n1 = str(df2["Name"].iat[i])
            n2 = str(df2["Name"].iat[j])
            # find_name_duplicates returns only index pairs, so re-score here
            score = fuzz.token_sort_ratio(n1, n2)
            rows.append({"index_1": i, "index_2": j, "name_1": n1, "name_2": n2, "score": score})
        out_names = out_dir / f"{Path(src).stem}_name_dupes.csv"
        pd.DataFrame(rows).to_csv(out_names, index=False)
        print(f"saved name matches to: {out_names}")

# Run the duplicate report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
