# Module 2 — Theme 1 — Source inventory + data mapping + plan de collecte\n\nCe notebook génère les preuves obligatoires :\n- `m2t1_inventory_filled.csv`\n- `m2t1_data_mapping.md`\n- `m2t1_collection_plan.md`\n- `m2t1_quality_checks.json`\n\nFichiers d'entrée attendus (dans le même dossier que le notebook) :\n- `source_inventory_template.csv`\n- `data_requirements_template.csv`\n

In [None]:
# Module 2 / Theme 1 export cell.
#
# Reads the source inventory and data-requirements CSV templates, validates
# them, then writes four artefacts next to the notebook:
#   - m2t1_inventory_filled.csv
#   - m2t1_data_mapping.md
#   - m2t1_collection_plan.md
#   - m2t1_quality_checks.json
#
# The pure helpers below do no I/O so they can be imported and tested; the
# file reads/writes live in the ``__main__`` section at the bottom (which is
# what runs when this is executed as a notebook cell or as a script).
import json
from datetime import datetime, timezone

import pandas as pd

# Columns that the inventory CSV must contain.
REQUIRED_INVENTORY_COLUMNS = frozenset({
    "source_name", "type", "owner", "access", "refresh", "grain",
    "key_fields", "coverage", "known_issues", "privacy", "linked_kpis",
})

# Minimum row counts enforced by the validation and reported in the QC file.
MIN_SOURCES = 8
MIN_REQUIREMENTS = 10


def _has_duplicate_sources(inv: pd.DataFrame) -> bool:
    """Return True when at least one ``source_name`` value is repeated."""
    return bool(inv["source_name"].duplicated().any())


def _has_blank_key_fields(inv: pd.DataFrame) -> bool:
    """Return True when any row has a missing or blank ``key_fields`` value.

    Values are cast to ``str`` before the ``.str`` accessor is used so a
    numeric column does not raise, and they are stripped so a
    whitespace-only cell counts as empty.
    """
    normalized = inv["key_fields"].fillna("").astype(str).str.strip()
    return bool(normalized.eq("").any())


def validate_inputs(inv: pd.DataFrame, req: pd.DataFrame) -> None:
    """Check the minimal rules on the inventory and requirements tables.

    Raises:
        ValueError: if a required inventory column is missing, if the
            inventory has fewer than ``MIN_SOURCES`` rows, if the requirements
            table has fewer than ``MIN_REQUIREMENTS`` rows, if a
            ``source_name`` is duplicated, or if a ``key_fields`` value is
            empty.
    """
    missing = set(REQUIRED_INVENTORY_COLUMNS) - set(inv.columns)
    if missing:
        raise ValueError(f"Missing inventory columns: {missing}")

    if len(inv) < MIN_SOURCES:
        raise ValueError(
            f"Inventory must have >= {MIN_SOURCES} sources. Current: {len(inv)}"
        )

    if len(req) < MIN_REQUIREMENTS:
        raise ValueError(
            f"Data requirements must have >= {MIN_REQUIREMENTS} rows. "
            f"Current: {len(req)}"
        )

    if _has_duplicate_sources(inv):
        raise ValueError("Duplicate source_name in inventory.")

    if _has_blank_key_fields(inv):
        raise ValueError("Some sources have empty key_fields.")


def build_mapping_lines(inv: pd.DataFrame) -> list:
    """Return the lines of the data-mapping markdown document.

    One bullet is produced per inventory row, followed by the fixed list of
    proposed joins and the join risks to check. The caller joins the lines
    with a newline.
    """
    lines = [
        "# Data mapping — Module 2 / Theme 1\n",
        "\n## Sources inventory\n",
    ]
    rows = zip(inv["source_name"], inv["type"], inv["grain"], inv["key_fields"])
    lines.extend(
        f"- **{name}** ({kind}) — grain: {grain} — key_fields: `{keys}`"
        for name, kind, grain, keys in rows
    )
    lines.extend([
        "\n## Proposed joins (minimal)\n",
        "- `platform_events.user_id` -> `validations.user_id`",
        "- `platform_events.user_id` -> `profile.user_id`",
        "- `platform_events.user_id` -> `marketing.user_id`",
        "- `platform_events.user_id` -> `support_tickets.user_id`",
        "\n## Join risks to check\n",
        "- Verify `user_id` type consistency across sources (string/int).",
        "- Check missing `user_id` rates and orphan records after joins.",
        "- Confirm grain before aggregations (event vs user vs ticket).",
    ])
    return lines


def build_plan_lines() -> list:
    """Return the lines of the collection-plan markdown document.

    Each entry already ends with a newline; the caller joins them with a
    newline as well, which leaves a blank line between entries.
    """
    return [
        "# Data collection plan — Module 2 / Theme 1\n",
        "\n## Goal\n",
        "- Build a reproducible source inventory + mapping + collection plan.\n",
        "\n## Steps (recommended order)\n",
        "1) Confirm access owners + credentials (SQL/API/exports)\n",
        "2) Extract priority sources (platform_events, validations)\n",
        "3) Run express quality checks (missing, duplicates, date ranges)\n",
        "4) Extract segmentation sources (profile, marketing)\n",
        "5) Extract guardrail sources (support_tickets)\n",
        "6) Store extracts in a date-stamped folder (versioning)\n",
        "7) Document assumptions + limitations\n",
        "\n## Storage convention\n",
        "- `extracts/YYYY-MM-DD/<source_name>.<ext>`\n",
        "- `extracts/YYYY-MM-DD/README_extraction.md`\n",
        "\n## Validation\n",
        f"- Inventory rows >= {MIN_SOURCES}\n",
        f"- Requirements rows >= {MIN_REQUIREMENTS}\n",
        "- Mapping includes 4+ joins\n",
    ]


def build_quality_checks(inv: pd.DataFrame, req: pd.DataFrame) -> dict:
    """Return the JSON-serialisable quality-check summary.

    Every ``ok`` flag is a plain Python ``bool`` so ``json.dump`` accepts it.
    """
    # Aware "now" in UTC, rendered without the "+00:00" offset so the
    # timestamp keeps the "<naive ISO 8601>Z" shape.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    return {
        "created_at": now_utc.isoformat() + "Z",
        "inventory_rows": int(len(inv)),
        "requirements_rows": int(len(req)),
        "checks": [
            {
                "name": f"inventory_min_{MIN_SOURCES}_sources",
                "ok": len(inv) >= MIN_SOURCES,
            },
            {
                "name": f"requirements_min_{MIN_REQUIREMENTS}_fields",
                "ok": len(req) >= MIN_REQUIREMENTS,
            },
            # Logical ``not`` rather than bitwise ``~``: ``~True`` is -2,
            # which is truthy, so ``~`` is only safe on NumPy booleans.
            {"name": "unique_source_name", "ok": not _has_duplicate_sources(inv)},
            {"name": "has_key_fields", "ok": not _has_blank_key_fields(inv)},
        ],
    }


if __name__ == "__main__":
    inv = pd.read_csv("source_inventory_template.csv")
    req = pd.read_csv("data_requirements_template.csv")

    # --- Minimal validation rules ---
    validate_inputs(inv, req)

    # --- Export inventory ---
    inv.to_csv("m2t1_inventory_filled.csv", index=False)

    # --- Data mapping markdown ---
    mapping_lines = build_mapping_lines(inv)
    with open("m2t1_data_mapping.md", "w", encoding="utf-8") as f:
        f.write("\n".join(mapping_lines))

    # --- Collection plan markdown ---
    plan = build_plan_lines()
    with open("m2t1_collection_plan.md", "w", encoding="utf-8") as f:
        f.write("\n".join(plan))

    # --- Quick quality checks summary ---
    qc = build_quality_checks(inv, req)
    with open("m2t1_quality_checks.json", "w", encoding="utf-8") as f:
        json.dump(qc, f, ensure_ascii=False, indent=2)

    print("✅ Exports generated:",
          "m2t1_inventory_filled.csv, m2t1_data_mapping.md, m2t1_collection_plan.md, m2t1_quality_checks.json")