# Module 2 — Thème 2 (CSV/Excel + Power Query)

## Objectif
À partir de fichiers bruts volontairement "sales" :
- produire un dataset propre contenant **exactement** ces 6 colonnes : `user_id,event_time,event_type,theme,country,channel`
- générer un rapport qualité JSON
- générer un data dictionary (MD)

## Fichiers attendus (inputs)
- `raw_events_messy.csv`
- `raw_profiles_messy.xlsx`

## Fichiers à produire (outputs)
- `m2t2_clean_learning_dataset.csv`
- `m2t2_quality_report.json`
- `m2t2_data_dictionary.md`

## Important (Power Query)
- Crée aussi un fichier `m2t2_powerquery.m` contenant ton script M (copié depuis l'Advanced Editor de Power Query).
- Crée `m2t2_refresh_notes.md` (5–10 lignes) : procédure de refresh, risques identifiés et locale choisie.


In [None]:
import os
import json
import pandas as pd
from datetime import datetime

# Reaching this line means every import above succeeded.
print("✅ Imports OK")

In [None]:
# ---------- Robust CSV reader (separator + encoding) ----------
def read_csv_robust(path: str) -> pd.DataFrame:
    """Read a CSV file whose separator and encoding are not known in advance.

    Tries ``;`` then ``,`` as separator, decoding first as UTF-8 and then as
    Latin-1, and returns the first parse that yields at least three columns.
    A parse with fewer columns is treated as a wrong separator guess.

    Args:
        path: Location of the CSV file.

    Returns:
        The DataFrame produced by the first acceptable attempt.

    Raises:
        ValueError: Every attempt parsed the file but none produced three or
            more columns.
        Exception: The last read error, when at least one attempt failed and
            no attempt was accepted.
    """
    attempts = [
        {"sep": ";", "encoding": "utf-8"},
        {"sep": ",", "encoding": "utf-8"},
        {"sep": ";", "encoding": "latin1"},
        {"sep": ",", "encoding": "latin1"},
    ]
    last_err = None
    for a in attempts:
        try:
            df = pd.read_csv(path, **a)
        except Exception as e:  # broad on purpose: any failure just means "try the next guess"
            last_err = e
            continue
        if df.shape[1] >= 3:
            return df

    if last_err is not None:
        raise last_err
    # No attempt failed, yet none was accepted: report that explicitly instead
    # of executing `raise None`, which would surface as an unrelated TypeError.
    raise ValueError(
        f"Could not read {path!r} with at least 3 columns using separators ';' or ','"
    )

# Load the two raw inputs: the events CSV (separator/encoding auto-detected)
# and the first sheet of the profiles workbook.
events = read_csv_robust("raw_events_messy.csv")
profiles = pd.read_excel("raw_profiles_messy.xlsx", sheet_name=0)
# Show both (rows, columns) shapes as a quick sanity check.
print("Loaded:", *(frame.shape for frame in (events, profiles)))

In [None]:
# ---------- Standardize column names ----------
# Lower-case and trim every header so later lookups do not depend on how the
# raw files happened to capitalize or pad their column names.
events.columns = [str(c).strip().lower() for c in events.columns]
profiles.columns = [str(c).strip().lower() for c in profiles.columns]

# tolerant aliasing if needed
aliases = {
    "userid": "user_id",
    "user": "user_id",
    "time": "event_time",
    "timestamp": "event_time",
    "event": "event_type",
}
# Apply the aliases to both frames: the enrichment step joins on "user_id",
# so a profiles sheet that spells it "userid"/"user" must be renamed as well,
# otherwise the join is silently skipped.
for frame in (events, profiles):
    for k, v in aliases.items():
        # Only rename when the canonical name is absent, to avoid duplicate columns.
        if k in frame.columns and v not in frame.columns:
            frame.rename(columns={k: v}, inplace=True)

print("Events columns:", list(events.columns))
print("Profiles columns:", list(profiles.columns))

In [None]:
# ---------- Cleaning helpers ----------
def clean_str(s: pd.Series) -> pd.Series:
    """Return *s* as trimmed, whitespace-normalized text.

    Non-breaking spaces become regular spaces, surrounding whitespace is
    removed and inner runs of whitespace collapse to a single space. Missing
    values (``None``, ``NaN``, ``pd.NA``, ``NaT``) and the literal text
    ``"nan"`` become the empty string.

    Args:
        s: Series of arbitrary values.

    Returns:
        A Series with the same index holding the cleaned text.
    """
    # Record missing cells before the string cast: astype(str) would otherwise
    # turn them into the literal texts "None", "nan", "<NA>" or "NaT".
    missing = s.isna()
    out = s.astype(str)
    out = out.str.replace("\u00A0", " ", regex=False)  # non-breaking space
    out = out.str.strip()
    out = out.str.replace(r"\s+", " ", regex=True)
    out = out.mask(missing, "")
    # Keep treating a textual "nan" placeholder as empty.
    out = out.replace("nan", "")
    return out

def normalize_country(s: pd.Series) -> pd.Series:
    """Clean country labels, put them in Title Case and restore the accent in "Bénin"."""
    titled = clean_str(s).str.title()
    # Whole-value replacement: only cells equal to "Benin" are changed.
    return titled.replace({"Benin": "Bénin"})

def normalize_channel(s: pd.Series) -> pd.Series:
    """Clean channel labels, lower-case them and map known synonyms to "mobile"."""
    synonyms = {"cellulaire": "mobile", "smartphone": "mobile"}
    lowered = clean_str(s)
    lowered = lowered.str.lower()
    # Whole-value replacement: only cells equal to a synonym are changed.
    return lowered.replace(synonyms)

def normalize_event_type(s: pd.Series) -> pd.Series:
    """Clean event-type labels into lower-case snake_case (spaces and hyphens become "_")."""
    out = clean_str(s).str.lower()
    for separator in (" ", "-"):
        out = out.str.replace(separator, "_", regex=False)
    return out

In [None]:
# ---------- Ensure expected columns exist ----------
# Any column of the target schema that the raw file lacks is created empty,
# so the cleaning steps below can run unconditionally.
required = ["user_id", "event_time", "event_type", "theme", "country", "channel"]
absent = [name for name in required if name not in events.columns]
for col in absent:
    events[col] = ""

# Apply cleaning: one dedicated cleaner per text column.
text_cleaners = {
    "user_id": clean_str,
    "event_type": normalize_event_type,
    "country": normalize_country,
    "channel": normalize_channel,
}
for name, cleaner in text_cleaners.items():
    events[name] = cleaner(events[name])

# theme: coerce numeric (unparseable values become <NA> in the nullable Int64 column)
theme_numeric = pd.to_numeric(events["theme"], errors="coerce")
events["theme"] = theme_numeric.astype("Int64")

# event_time: robust parse (dayfirst common in FR); unparseable values become NaT
events["event_time"] = pd.to_datetime(
    events["event_time"],
    errors="coerce",
    dayfirst=True,
    utc=True,
)

events.head(10)

In [None]:
# ---------- Enrich from profiles (optional) ----------
# Enrichment only runs when the profiles sheet exposes the join key.
if "user_id" in profiles.columns:
    profiles["user_id"] = clean_str(profiles["user_id"])
    # Normalize the profile attributes with the same rules as the events.
    for name, normalizer in (("country", normalize_country), ("channel", normalize_channel)):
        if name in profiles.columns:
            profiles[name] = normalizer(profiles[name])

    # One row per user (first occurrence wins), restricted to the useful columns.
    keep_cols = [c for c in ["user_id", "country", "channel"] if c in profiles.columns]
    prof_u = profiles[keep_cols].drop_duplicates("user_id")
    events = events.merge(prof_u, on="user_id", how="left", suffixes=("", "_profile"))

    # Profile values only fill gaps: an event keeps its own non-empty value.
    for target, fallback in (("country", "country_profile"), ("channel", "channel_profile")):
        if fallback in events.columns:
            blank = events[target].isna() | events[target].eq("")
            events[target] = events[target].mask(blank, events[fallback])

    # Remove the helper columns brought in by the merge.
    leftovers = [c for c in ["country_profile", "channel_profile"] if c in events.columns]
    events.drop(columns=leftovers, inplace=True)

print("After enrich:", events.shape)

In [None]:
# ---------- Quality checks ----------
from datetime import timezone  # cell-local import, only needed for the timestamp below

expected_cols = ["user_id", "event_time", "event_type", "theme", "country", "channel"]

# Timezone-aware clock (datetime.utcnow() is deprecated since Python 3.12);
# tzinfo is dropped before formatting so the value keeps its "...Z" shape.
report = {
    "created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "checks": {},
}
report["checks"]["rows_after_load"] = int(len(events))
report["checks"]["has_expected_columns"] = bool(all(c in events.columns for c in expected_cols))

# Missing values per column. The cleaning helpers store missing text as "",
# so for text columns an empty string counts as missing as well as NaN/NA.
for c in expected_cols:
    if c not in events.columns:
        report["checks"][f"missing_{c}"] = None
        continue
    is_missing = events[c].isna()
    if pd.api.types.is_object_dtype(events[c]) or pd.api.types.is_string_dtype(events[c]):
        is_missing = is_missing | events[c].eq("")
    report["checks"][f"missing_{c}"] = int(is_missing.sum())

# Rows that repeat an earlier row on all six schema columns.
report["checks"]["duplicate_rows"] = int(events[expected_cols].duplicated().sum())

# Time coverage; None when no timestamp could be parsed.
dt_min = events["event_time"].min()
dt_max = events["event_time"].max()
report["checks"]["event_time_min"] = None if pd.isna(dt_min) else dt_min.isoformat()
report["checks"]["event_time_max"] = None if pd.isna(dt_max) else dt_max.isoformat()

# Plain str keys / int values so the report serializes to JSON whatever
# scalar types pandas returns.
top_event_types = events["event_type"].value_counts(dropna=False).head(10)
report["checks"]["event_type_top10"] = {str(k): int(v) for k, v in top_event_types.items()}

report

In [None]:
# ---------- Build clean dataset (exact schema) ----------
# Keep exactly the schema columns, in schema order.
clean = events[expected_cols].copy()
clean = clean.dropna(subset=["user_id", "event_time", "event_type"])  # minimal integrity
# Missing text was normalized to "", which dropna cannot see: filter empty
# identifiers and empty event types explicitly.
for key in ("user_id", "event_type"):
    clean = clean[clean[key].astype(str).str.len() > 0]
clean = clean.copy()  # detach from the filtered view before assigning columns

# Cast text columns to str; remaining NaN (e.g. a country with no profile match)
# is blanked first so the CSV never contains the literal text "nan".
for col in ("user_id", "event_type", "country", "channel"):
    clean[col] = clean[col].fillna("").astype(str)

clean.to_csv("m2t2_clean_learning_dataset.csv", index=False)
print("✅ Exported m2t2_clean_learning_dataset.csv", clean.shape)

In [None]:
# ---------- Export quality report JSON ----------
# Write the quality report as indented UTF-8 JSON; ensure_ascii=False keeps
# accented characters readable instead of escaping them.
with open("m2t2_quality_report.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(report, ensure_ascii=False, indent=2))

print("✅ Exported m2t2_quality_report.json")

In [None]:
# ---------- Data dictionary markdown ----------
# Markdown document: the fixed schema description followed by a few figures
# taken from the quality report.
dict_lines = [
    "# Data Dictionary — Module 2 / Theme 2\n\n",
    "## Schéma final (m2t2_clean_learning_dataset.csv)\n",
    "- **user_id**: identifiant apprenant (string)\n",
    "- **event_time**: date/heure événement (UTC, ISO)\n",
    "- **event_type**: type normalisé (ex: enrolled, opened_theme, opened_notebook, submitted, validated)\n",
    "- **theme**: numéro thème (int)\n",
    "- **country**: pays standardisé (Title Case)\n",
    "- **channel**: canal standardisé (lowercase)\n\n",
    "## Notes qualité (extrait)\n",
    f"- rows_after_load: {report['checks'].get('rows_after_load')}\n",
    f"- duplicate_rows: {report['checks'].get('duplicate_rows')}\n",
    f"- event_time_min: {report['checks'].get('event_time_min')}\n",
    f"- event_time_max: {report['checks'].get('event_time_max')}\n",
]

# Each entry already carries its own line ending, so a plain join is enough.
with open("m2t2_data_dictionary.md", "w", encoding="utf-8") as f:
    f.write("".join(dict_lines))

print("✅ Exported m2t2_data_dictionary.md")

In [None]:
# ---------- Power Query M script presence check ----------
# Verify that the Power Query script was exported next to the notebook and is
# not just an empty placeholder (fewer than 30 characters once trimmed).
m_path = "m2t2_powerquery.m"
if not os.path.exists(m_path):
    print("⚠️ m2t2_powerquery.m absent. Crée le fichier et colle ton script M depuis Power Query (Advanced Editor).")
else:
    # Context manager so the file handle is closed deterministically;
    # errors="ignore" tolerates a script saved in a non-UTF-8 encoding.
    with open(m_path, "r", encoding="utf-8", errors="ignore") as fh:
        content = fh.read().strip()
    if len(content) < 30:
        print("⚠️ m2t2_powerquery.m présent mais trop court/vide. Colle le script M complet.")
    else:
        print("✅ m2t2_powerquery.m OK (non vide).")