# Module 3 — Thème 2\n\nDoublons, clés, unicité, déduplication contrôlée.\n\nFichiers attendus en sortie :\n- m3t2_key_audit.csv\n- m3t2_duplicates_report.csv\n- m3t2_dataset_dedup.csv\n- m3t2_dedup_audit.csv\n- m3t2_quality_report.json\n- m3t2_dedup_rules.md\n

In [None]:
# Module 3, theme 2: candidate-key audit, duplicate report and controlled
# deduplication of the users identity file.
#
# Running this cell/script writes:
#   m3t2_key_audit.csv, m3t2_duplicates_report.csv, m3t2_dataset_dedup.csv,
#   m3t2_dedup_audit.csv, m3t2_quality_report.json, m3t2_dedup_rules.md
import json
from datetime import datetime, timezone

import pandas as pd

INPUT_CSV = "users_identity_messy.csv"

# Columns that hold free text / identifiers and must never be parsed as numbers.
TEXT_COLS = ["user_id", "email", "phone", "country", "channel"]

# Fields counted by the completeness score used to choose the row to keep.
useful_cols = ["country", "channel", "signup_date", "last_active", "revenue", "email", "phone"]


def clean_str(s: pd.Series) -> pd.Series:
    """Return *s* as pandas ``string`` dtype, trimmed, with whitespace runs collapsed.

    Missing values stay missing; empty strings are left as-is (callers decide
    whether an empty string means "missing").
    """
    s = s.astype("string")
    return s.str.strip().str.replace(r"\s+", " ", regex=True)


def load_users(path: str = INPUT_CSV) -> pd.DataFrame:
    """Read the raw users CSV and apply the minimal pre-audit cleaning.

    - text columns: trimmed, inner whitespace collapsed, blank -> NA
    - email / channel lower-cased, country title-cased
    - signup_date / last_active parsed day-first to UTC (unparseable -> NaT)
    - revenue parsed to numeric (unparseable or negative -> NaN)
    """
    # Read identifiers as text: numeric inference would drop leading zeros in
    # phone numbers and render ids as floats as soon as one cell is missing.
    df = pd.read_csv(path, dtype={col: "string" for col in TEXT_COLS})

    for col in TEXT_COLS:
        # Blank -> NA for every text column (user_id included); otherwise a
        # whitespace-only id would survive as "" and act as a shared key.
        df[col] = clean_str(df[col]).replace({"": pd.NA})

    df["email"] = df["email"].str.lower()
    df["country"] = df["country"].str.title()
    df["channel"] = df["channel"].str.lower()

    for col in ("signup_date", "last_active"):
        df[col] = pd.to_datetime(df[col], errors="coerce", dayfirst=True, utc=True)

    revenue = pd.to_numeric(df["revenue"], errors="coerce")
    df["revenue"] = revenue.mask(revenue < 0)  # negative revenue is treated as missing
    return df


def key_stats(frame: pd.DataFrame, cols: list) -> tuple:
    """Return ``(n_unique, duplicate_rows, missing_rows)`` for a candidate key.

    A row whose key has any missing part is counted in ``missing_rows`` and is
    excluded from the two other figures: pandas treats NA as equal to NA in
    ``duplicated``, which would otherwise report every key-less row as a
    duplicate of the others.
    """
    has_key = frame[cols].notna().all(axis=1)
    keyed = frame.loc[has_key, cols]
    n_unique = int(keyed.drop_duplicates().shape[0])
    duplicate_rows = int(keyed.duplicated(keep=False).sum())
    missing_rows = int((~has_key).sum())
    return n_unique, duplicate_rows, missing_rows


def mark_kept_rows(frame: pd.DataFrame, useful_cols: list) -> pd.DataFrame:
    """Return a sorted copy of *frame* with entity_id, completeness_score and is_kept.

    entity_id is the email if present, else the phone, else the user_id.
    Within one entity_id the kept row is the most complete one, ties broken by
    the most recent last_active. Rows with no identifier at all (entity_id is
    NA) are always kept: nothing shows that they describe the same entity.
    """
    work = frame.copy()
    work["entity_id"] = work["email"].fillna(work["phone"]).fillna(work["user_id"])
    work["completeness_score"] = work[useful_cols].notna().sum(axis=1)

    # Best row first inside each entity: completeness desc, then last_active desc.
    work = work.sort_values(
        by=["entity_id", "completeness_score", "last_active"],
        ascending=[True, False, False],
    )

    first_of_entity = ~work.duplicated(subset=["entity_id"], keep="first")
    work["is_kept"] = work["entity_id"].isna() | first_of_entity
    return work


def build_rules(useful_cols: list) -> str:
    """Return the markdown text describing the deduplication rules."""
    return f"""# Dedup Rules — Module 3 Theme 2

## Entity key
entity_id = email if present else phone if present else user_id
Rows with no email, phone or user_id have no entity_id and are kept as-is (never merged).

## Pre-cleaning
- user_id, phone: read as text + strip + empty -> NA
- email: strip + lowercase + empty -> NA
- country: strip + Title Case + empty -> NA
- channel: strip + lowercase + empty -> NA
- dates: parsed to UTC where possible

## Selection rule (per entity_id)
1) Keep row with highest completeness_score (non-missing among: {useful_cols})
2) If tie, keep most recent last_active
3) Else stable first after sort

## Evidence exports
- m3t2_key_audit.csv
- m3t2_duplicates_report.csv
- m3t2_dedup_audit.csv (dropped rows)
- m3t2_dataset_dedup.csv (final)
"""


# The pipeline runs when the file is executed as a notebook cell or a script
# (``__name__`` is "__main__" in both cases). All names below stay global.
if __name__ == "__main__":
    # -----------------------------
    # 1) Minimal cleaning (before the audit)
    # -----------------------------
    df = load_users(INPUT_CSV)

    # -----------------------------
    # 2) Candidate key audit
    # -----------------------------
    tmp = df.copy()
    tmp["email_or_phone"] = tmp["email"].fillna(tmp["phone"])

    candidates = {
        "user_id": ["user_id"],
        "email": ["email"],
        "phone": ["phone"],
        "email_or_phone": ["email_or_phone"],
    }

    n_rows = int(len(tmp))
    audit_rows = []
    for name, cols in candidates.items():
        n_unique, dup_rows, missing_rows = key_stats(tmp, cols)
        audit_rows.append({
            "candidate_key": name,
            "n_rows": n_rows,
            "n_unique": n_unique,
            "duplicate_rows": dup_rows,
            # Guard against an empty input file.
            "duplicate_rate_pct": round((dup_rows / n_rows) * 100, 2) if n_rows else 0.0,
            "missing_rows": missing_rows,
        })

    audit_df = pd.DataFrame(audit_rows).sort_values("duplicate_rate_pct", ascending=True)
    audit_df.to_csv("m3t2_key_audit.csv", index=False)

    # -----------------------------
    # 3) Duplicate report (summary)
    # -----------------------------
    report = []

    # Exact duplicates (fully identical rows; NA == NA is intended here).
    exact_dupes = int(tmp.duplicated(keep=False).sum())
    report.append({"type": "exact_row_duplicates", "key": "ALL_COLUMNS", "duplicate_rows": exact_dupes})

    # Duplicates per single-column key (missing keys are not duplicates).
    for k in ("user_id", "email", "phone"):
        dup_rows = key_stats(tmp, [k])[1]
        report.append({"type": "key_duplicates", "key": k, "duplicate_rows": dup_rows})

    # Identity duplicates (email_or_phone).
    dup_rows_entity = key_stats(tmp, ["email_or_phone"])[1]
    report.append({"type": "entity_duplicates", "key": "email_or_phone", "duplicate_rows": dup_rows_entity})

    pd.DataFrame(report).to_csv("m3t2_duplicates_report.csv", index=False)

    # -----------------------------
    # 4) Controlled deduplication
    #    - entity key = email if present, else phone, else user_id
    #    - keep: most complete, then most recent
    # -----------------------------
    work = mark_kept_rows(tmp, useful_cols)

    kept = work[work["is_kept"]].drop(columns=["completeness_score", "is_kept"])
    dropped = work[~work["is_kept"]].copy()

    # Audit trail of the removed rows.
    audit = dropped[
        ["entity_id", "user_id", "email", "phone", "last_active", "country", "channel", "revenue"]
    ].copy()
    audit["reason"] = "duplicate_entity_id_keep_best(completeness, last_active)"
    audit.to_csv("m3t2_dedup_audit.csv", index=False)

    # Final deduplicated dataset.
    kept.to_csv("m3t2_dataset_dedup.csv", index=False)

    # -----------------------------
    # 5) Quality report
    # -----------------------------
    # Aware UTC clock (datetime.utcnow() is deprecated); tzinfo is stripped
    # only to keep the historical "...Z" suffix instead of "+00:00".
    created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    n_dropped = int(len(tmp) - len(kept))

    quality = {
        "created_at": created_at,
        "rows_before": int(len(tmp)),
        "rows_after": int(len(kept)),
        "dropped_rows": n_dropped,
        "unique_entity_id": int(kept["entity_id"].nunique(dropna=True)),
        "rows_without_entity_id": int(kept["entity_id"].isna().sum()),
        "checks": {
            # Rows without any identifier are kept un-merged, so uniqueness
            # is only meaningful among non-missing entity ids.
            "entity_id_unique": bool(kept["entity_id"].dropna().is_unique),
            "rows_ge_200": bool(len(kept) >= 200),
            "dropped_at_least_one": bool(n_dropped >= 1),
        },
    }
    with open("m3t2_quality_report.json", "w", encoding="utf-8") as f:
        json.dump(quality, f, ensure_ascii=False, indent=2)

    rules = build_rules(useful_cols)
    with open("m3t2_dedup_rules.md", "w", encoding="utf-8") as f:
        f.write(rules)

    print("✅ Exports generated (audit/report/dedup/quality/rules).")