# Thème 3 — KPI Dictionary + Calcul KPI (Notebook obligatoire)

## Objectif
- Construire un dictionnaire KPI (6 KPI minimum)
- Valider la cohérence (colonnes requises, pas de doublons)
- Calculer au moins 2 KPI sur le dataset (KPI principal + guardrail)
- Exporter 3 fichiers obligatoires :
  - `theme3_kpi_dictionary.csv`
  - `theme3_kpi_values.json`
  - `theme3_guardrail_analysis.md`

## Dataset
- Par défaut : `public/datasets/data-analyst/module-1/koryxa_learning_events.csv`
- Si tu exécutes le notebook en local : assure-toi d’avoir le fichier `koryxa_learning_events.csv` dans le dossier courant (ou ajuste le chemin)


In [None]:
import pandas as pd
import json
from pathlib import Path

# --- Dataset location ---
# Notebook: public/notebooks/data-analyst/module-1/
# Dataset : public/datasets/data-analyst/module-1/
DEFAULT_DATA_PATH = Path("../../../datasets/data-analyst/module-1/koryxa_learning_events.csv")

# Try the default relative path first, then a copy in the current folder.
for _candidate in (DEFAULT_DATA_PATH, Path("koryxa_learning_events.csv")):
    if _candidate.exists():
        data_path = _candidate
        break
else:
    # Neither candidate exists: fail early and name both expected locations.
    raise FileNotFoundError(
        "Dataset not found. Expected at '../../../datasets/data-analyst/module-1/koryxa_learning_events.csv' "
        "or in the current folder as 'koryxa_learning_events.csv'."
    )

# event_time is parsed as a datetime column while loading.
df = pd.read_csv(data_path, parse_dates=["event_time"])
print("✅ Loaded:", data_path)
df.head()

## 1) Contrôle minimum dataset
Le dataset doit contenir au minimum :
- `user_id`
- `event_type`
- `event_time`


In [None]:
# Columns the rest of the notebook reads from the dataset.
required = {"user_id", "event_type", "event_time"}
missing = required - set(df.columns)
if len(missing) > 0:
    # Stop here: the KPI computations below cannot run without these columns.
    raise ValueError(f"Missing required columns in dataset: {missing}")

print("✅ Dataset columns OK")
print("Rows:", df.shape[0])

# Show the 10 most frequent event types with their row counts.
_top_event_types = df["event_type"].value_counts().head(10)
print("Event types:", _top_event_types.to_dict())

## 2) KPI Dictionary template (6 KPI minimum)
Champs requis :
- kpi_name, definition, objective, formula, unit, granularity, segment,
  source, owner, refresh, guardrail, gaming_risk, controls


In [None]:
# KPI dictionary: one record per KPI, every record carrying the same 13 fields.
# An empty "guardrail" means no guardrail KPI is attached to that entry.
_kpi_records = [
    dict(
        kpi_name="completion_rate_m1",
        definition="% des inscrits ayant validé le module 1",
        objective="Augmenter la complétion du Module 1",
        formula="validated_users / enrolled_users",
        unit="ratio",
        granularity="cohort_week",
        segment="new_enrollments",
        source="koryxa_learning_events.csv",
        owner="KORYXA",
        refresh="daily",
        guardrail="support_ticket_rate",
        gaming_risk="validation trop facile / clics rapides",
        controls="surveiller tickets + exiger preuve notebook",
    ),
    dict(
        kpi_name="support_ticket_rate",
        definition="Tickets support par inscrit",
        objective="Garde-fou qualité/support",
        formula="support_tickets / enrolled_users",
        unit="tickets_per_user",
        granularity="week",
        segment="new_enrollments",
        source="koryxa_learning_events.csv",
        owner="Support",
        refresh="daily",
        guardrail="",
        gaming_risk="tickets fermés sans résolution",
        controls="audit résolution + taux réouverture",
    ),
    dict(
        kpi_name="notebook_opened_48h_rate",
        definition="% des inscrits qui ouvrent le notebook sous 48h",
        objective="Activer tôt les apprenants",
        formula="users_opened_notebook_48h / enrolled_users",
        unit="ratio",
        granularity="cohort_week",
        segment="new_enrollments",
        source="koryxa_learning_events.csv",
        owner="Produit",
        refresh="daily",
        guardrail="support_ticket_rate",
        gaming_risk="ouverture sans exécution",
        controls="ajouter event notebook_executed + vérifier progression",
    ),
    dict(
        kpi_name="theme2_completion_rate",
        definition="% des inscrits qui terminent le thème 2",
        objective="Identifier la chute principale",
        formula="users_completed_theme2 / enrolled_users",
        unit="ratio",
        granularity="cohort_week",
        segment="new_enrollments",
        source="koryxa_learning_events.csv",
        owner="KORYXA",
        refresh="daily",
        guardrail="support_ticket_rate",
        gaming_risk="marquer terminé sans lecture réelle",
        controls="valider avec quiz + temps minimum",
    ),
    dict(
        kpi_name="median_time_to_theme2_hours",
        definition="Temps médian (h) entre inscription et 1ère vue du thème 2",
        objective="Réduire la friction de démarrage",
        formula="median(event_time_first_theme2 - event_time_enrolled)",
        unit="hours",
        granularity="cohort_week",
        segment="new_enrollments",
        source="koryxa_learning_events.csv",
        owner="Produit",
        refresh="daily",
        guardrail="",
        gaming_risk="événements mal horodatés",
        controls="contrôle timezone + nettoyage doublons",
    ),
    dict(
        kpi_name="project_submission_rate_m1",
        definition="% des inscrits qui soumettent le mini-projet M1",
        objective="Confirmer la qualité par l’action",
        formula="users_submitted_project_m1 / enrolled_users",
        unit="ratio",
        granularity="cohort_week",
        segment="new_enrollments",
        source="koryxa_learning_events.csv",
        owner="KORYXA",
        refresh="daily",
        guardrail="support_ticket_rate",
        gaming_risk="soumissions vides",
        controls="exiger template + review",
    ),
]

# Column order follows the keyword order of the records above.
kpi_dict = pd.DataFrame(_kpi_records)
kpi_dict

## 3) Validation dictionnaire (schéma minimum)


In [None]:
# Minimum schema: every field a KPI dictionary entry must provide.
required_cols = {
    "kpi_name",
    "definition",
    "objective",
    "formula",
    "unit",
    "granularity",
    "segment",
    "source",
    "owner",
    "refresh",
    "guardrail",
    "gaming_risk",
    "controls",
}
missing = required_cols - set(kpi_dict.columns)
if missing:
    raise ValueError(f"Missing KPI dictionary columns: {missing}")

# duplicated() flags every occurrence of a name after its first one.
_repeated = kpi_dict["kpi_name"].duplicated()
if _repeated.any():
    dupes = kpi_dict.loc[_repeated, "kpi_name"].tolist()
    raise ValueError(f"Duplicate kpi_name found: {dupes}")

# The exercise requires at least 6 KPI entries.
if kpi_dict.shape[0] < 6:
    raise ValueError("KPI dictionary must contain at least 6 rows")

# Export only once all checks have passed.
kpi_dict.to_csv("theme3_kpi_dictionary.csv", index=False)
print("✅ Exported theme3_kpi_dictionary.csv")

## 4) Calcul KPI (principal + guardrail)
Hypothèses sur les valeurs de `event_type` :
- `enrolled` = inscription de l’utilisateur
- `validated` = validation du module
- `support_ticket` = ticket support


In [None]:
def safe_nunique(mask):
    """Count the distinct ``user_id`` values among the rows of ``df`` selected by ``mask``.

    Args:
        mask: Boolean Series aligned with ``df``; True marks the rows to keep.

    Returns:
        The number of distinct users in the selected rows, 0 when no row is selected.
    """
    if mask.sum() == 0:
        return 0
    return df.loc[mask, "user_id"].nunique()


is_enrolled = df["event_type"] == "enrolled"

enrolled_users = safe_nunique(is_enrolled)
# NOTE(review): validated users are counted over the whole dataset, not only among
# enrolled users, so the ratio could exceed 1 on inconsistent data — confirm intent.
validated_users = safe_nunique(df["event_type"] == "validated")
# Tickets are counted per event (rows), not per distinct user.
support_tickets = int((df["event_type"] == "support_ticket").sum())

# Both rates fall back to 0.0 when nobody is enrolled (avoids a division by zero).
completion_rate = (validated_users / enrolled_users) if enrolled_users else 0.0
ticket_rate = (support_tickets / enrolled_users) if enrolled_users else 0.0

# Optional: notebook opened within 48h of enrollment.
# Stays at 0 / 0.0 when the dataset has no "notebook_opened" event at all.
users_opened_48h = 0
notebook_rate_48h = 0.0
if "notebook_opened" in set(df["event_type"].unique()):
    # Earliest enrollment timestamp per user.
    enroll_times = (
        df.loc[is_enrolled]
        .groupby("user_id")["event_time"]
        .min()
        .rename("enrolled_time")
    )
    # Every notebook open of an enrolled user, paired with that user's enrollment time.
    notebook_opens = df.loc[df["event_type"] == "notebook_opened", ["user_id", "event_time"]]
    joined = notebook_opens.join(enroll_times, on="user_id", how="inner")
    delay = joined["event_time"] - joined["enrolled_time"]
    # An open only counts when it happens at or after enrollment AND within 48 hours.
    # Without the lower bound, an open recorded before enrollment (negative delay,
    # e.g. a mis-timestamped event) would be counted as "within 48h".
    within_48h = (delay >= pd.Timedelta(0)) & (delay <= pd.Timedelta(hours=48))
    # A user is counted once, as soon as any of their opens falls inside the window.
    users_opened_48h = int(joined.loc[within_48h, "user_id"].nunique())
    notebook_rate_48h = (users_opened_48h / enrolled_users) if enrolled_users else 0.0

# Rates are rounded to 4 decimals; counts are cast to plain int for JSON serialization.
kpi_values = {
    "completion_rate_m1": round(float(completion_rate), 4),
    "support_ticket_rate": round(float(ticket_rate), 4),
    "enrolled_users": int(enrolled_users),
    "validated_users": int(validated_users),
    "users_opened_notebook_48h": int(users_opened_48h),
    "notebook_opened_48h_rate": round(float(notebook_rate_48h), 4),
}

with open("theme3_kpi_values.json", "w", encoding="utf-8") as f:
    json.dump(kpi_values, f, ensure_ascii=False, indent=2)

print("✅ Exported theme3_kpi_values.json")
kpi_values

## 5) Guardrail analysis (Markdown)
Interprétation courte :
- complétion ↑ mais tickets ↑↑ = risque qualité/support
- complétion ↑ et tickets stables = amélioration plus saine


In [None]:
# Write the guardrail report: the three computed rates, then a short interpretation.
with open("theme3_guardrail_analysis.md", "w", encoding="utf-8") as f:
    f.writelines(
        [
            "# Theme 3 — Guardrail analysis\n\n",
            f"- completion_rate_m1: **{kpi_values['completion_rate_m1']}**\n",
            f"- support_ticket_rate: **{kpi_values['support_ticket_rate']}** tickets/user\n",
            f"- notebook_opened_48h_rate: **{kpi_values['notebook_opened_48h_rate']}**\n\n",
            "## Interpretation\n",
            "- Le guardrail empêche d'améliorer un KPI en cassant le système.\n",
            "- Vérifie aussi segmentation (pays/device/canal) si disponible.\n",
        ]
    )

print("✅ Exported theme3_guardrail_analysis.md")
print("✅ All required exports generated.")