# Thème 3 — KPI Dictionary + KPI values (preuves)

Objectifs :
- Construire un dictionnaire KPI (6 lignes minimum)
- Calculer au moins 2 KPI sur un dataset (principal + guardrail)
- Exporter les preuves :
  - `theme3_kpi_dictionary.csv`
  - `theme3_kpi_values.json`
  - `theme3_guardrail_analysis.md`

Dataset sample : `koryxa_school_events_sample.csv`


In [None]:
import json
import pandas as pd
from pathlib import Path

# Sample events file, resolved relative to the notebook's working directory.
DATASET_PATH = Path("koryxa_school_events_sample.csv")

# Load the raw events; the trailing expression previews the first rows when
# this is executed as a notebook cell.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)
df.head()

In [None]:
# --- Validate columns ---
# Columns the notebook expects in the dataset; stop early if any is absent.
required = {
    "user_id", "cohort_id", "country", "module_id",
    "event_type", "timestamp", "support_ticket_count", "validated_m1",
}
missing = required.difference(set(df.columns))
if missing:
    raise ValueError(f"Missing columns: {missing}")

# Parse timestamps as UTC; values that cannot be parsed become NaT.
df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")

# Coerce the two numeric columns to integers; values that cannot be parsed
# as numbers (and missing values) are replaced by 0.
for count_col in ("support_ticket_count", "validated_m1"):
    as_number = pd.to_numeric(df[count_col], errors="coerce")
    df[count_col] = as_number.fillna(0).astype(int)

df.shape

## 1) Dictionnaire KPI (template)

Champs recommandés (minimum pro) :
- kpi_name, definition, objective, numerator, denominator, formula, unit, granularity, segment, source, owner, refresh, guardrail, gaming_risk, controls


In [None]:
# Fields whose value is the same for every KPI row below.
_shared_fields = {
    "granularity": "cohorte",
    "segment": "nouveaux inscrits module m1",
    "source": "events (sample)",
    "owner": "KORYXA",
    "refresh": "hebdo",
}


def _kpi_row(
    *,
    kpi_name,
    definition,
    objective,
    numerator,
    denominator,
    formula,
    unit,
    guardrail="",
    gaming_risk,
    controls,
):
    """Build one dictionary row, inserting the shared fields after `unit`.

    Key insertion order determines the DataFrame column order, so the
    shared fields are spliced in between `unit` and `guardrail`.
    """
    return {
        "kpi_name": kpi_name,
        "definition": definition,
        "objective": objective,
        "numerator": numerator,
        "denominator": denominator,
        "formula": formula,
        "unit": unit,
        **_shared_fields,
        "guardrail": guardrail,
        "gaming_risk": gaming_risk,
        "controls": controls,
    }


# KPI dictionary: one row per KPI, one column per documented field.
kpi_dict = pd.DataFrame(
    [
        # Primary KPI, paired with the support_ticket_rate guardrail.
        _kpi_row(
            kpi_name="completion_rate_m1",
            definition="% des inscrits ayant validé le module 1",
            objective="Augmenter la complétion du Module 1",
            numerator="validated_users",
            denominator="enrolled_users",
            formula="validated_users / enrolled_users",
            unit="%",
            guardrail="support_ticket_rate",
            gaming_risk="complétion artificielle sans apprentissage",
            controls="suivre quiz/projet + tickets support",
        ),
        # Guardrail KPI.
        _kpi_row(
            kpi_name="support_ticket_rate",
            definition="tickets support par inscrit",
            objective="Garde-fou qualité (support)",
            numerator="support_tickets",
            denominator="enrolled_users",
            formula="support_tickets / enrolled_users",
            unit="ratio",
            gaming_risk="fermeture tickets sans résolution",
            controls="suivre réouverture / satisfaction",
        ),
        _kpi_row(
            kpi_name="notebook_opened_48h_rate",
            definition="% des inscrits ouvrant le notebook sous 48h",
            objective="Leading KPI (engagement précoce)",
            numerator="users_notebook_48h",
            denominator="enrolled_users",
            formula="users_notebook_48h / enrolled_users",
            unit="%",
            gaming_risk="ouverture sans exécution réelle",
            controls="suivre exécution notebook + outputs",
        ),
        _kpi_row(
            kpi_name="theme2_completion_rate",
            definition="% des inscrits qui terminent le thème 2",
            objective="Localiser une chute majeure",
            numerator="users_completed_theme2",
            denominator="enrolled_users",
            formula="users_completed_theme2 / enrolled_users",
            unit="%",
            gaming_risk="complétion partielle (un seul step)",
            controls="définir 'terminer theme2' précisément",
        ),
        # Not a ratio: the denominator is intentionally left empty.
        _kpi_row(
            kpi_name="median_time_to_theme2_hours",
            definition="temps médian (heures) entre inscription et 1ère vue thème 2",
            objective="Diagnostic friction / démarrage lent",
            numerator="median_hours",
            denominator="",
            formula="median(t_theme2_first_view - t_signup)",
            unit="heures",
            gaming_risk="timestamps incomplets",
            controls="qualité tracking + timezone",
        ),
        _kpi_row(
            kpi_name="project_submission_rate_m1",
            definition="% des inscrits qui soumettent un mini-projet",
            objective="Qualité / preuve d'exécution",
            numerator="users_project_submitted",
            denominator="enrolled_users",
            formula="users_project_submitted / enrolled_users",
            unit="%",
            gaming_risk="soumission vide",
            controls="validation contenu + critères",
        ),
    ]
)

kpi_dict.head()

In [None]:
# Every field the KPI dictionary must expose as a column.
required_cols = {
    "kpi_name", "definition", "objective",
    "numerator", "denominator", "formula",
    "unit", "granularity", "segment",
    "source", "owner", "refresh",
    "guardrail", "gaming_risk", "controls",
}
missing_cols = required_cols.difference(set(kpi_dict.columns))
if missing_cols:
    raise ValueError(f"Missing dictionary columns: {missing_cols}")

# The dictionary must document at least six KPIs.
if kpi_dict.shape[0] < 6:
    raise ValueError("KPI Dictionary must contain at least 6 rows")

print("✅ KPI Dictionary checks passed")

## 2) KPI calculations (principal + guardrail)

Hypothèse sample :
- `enrolled_users` = nombre d'utilisateurs uniques apparaissant dans le dataset (module m1)
- `validated_users` = utilisateurs avec `validated_m1 == 1`
- `support_tickets` = somme, sur l'ensemble des utilisateurs, de la valeur maximale de `support_ticket_count` observée pour chaque utilisateur (sur période)


In [None]:
# Keep only the events that belong to Module 1.
m1 = df.loc[df["module_id"] == "m1"].copy()

# One row per user (first occurrence kept); enrolled users = distinct user ids.
users = m1.drop_duplicates(subset=["user_id"])[["user_id", "cohort_id", "country"]].copy()
enrolled_users = users["user_id"].nunique()

# Reduce both per-event columns to one value per user using the maximum.
# validated_m1 is per-event in the sample, so the max is 1 as soon as any of
# the user's rows carries the flag.
# NOTE(review): support_ticket_count is also reduced with max (not sum) per
# user — presumably the column repeats a per-user count on every event row;
# confirm against the dataset.
per_user_max = m1.groupby("user_id")[["validated_m1", "support_ticket_count"]].max()
validated_by_user = per_user_max["validated_m1"]
tickets_by_user = per_user_max["support_ticket_count"]

validated_users = int(validated_by_user.eq(1).sum())
support_tickets = int(tickets_by_user.sum())

# Both KPIs share the same denominator; fall back to 0 when nobody is enrolled.
if enrolled_users:
    completion_rate_m1 = validated_users / enrolled_users
    support_ticket_rate = support_tickets / enrolled_users
else:
    completion_rate_m1 = 0
    support_ticket_rate = 0

completion_rate_m1, support_ticket_rate, enrolled_users, validated_users, support_tickets

## 3) Exports preuves

- `theme3_kpi_dictionary.csv`
- `theme3_kpi_values.json`
- `theme3_guardrail_analysis.md`


In [None]:
# 1) KPI dictionary as CSV, without the DataFrame index column.
kpi_dict.to_csv("theme3_kpi_dictionary.csv", index=False)

# 2) KPI values as JSON; both rates are rounded to 4 decimals.
values = dict(
    dataset=str(DATASET_PATH),
    enrolled_users=enrolled_users,
    validated_users=validated_users,
    completion_rate_m1=round(float(completion_rate_m1), 4),
    support_tickets=support_tickets,
    support_ticket_rate=round(float(support_ticket_rate), 4),
)
Path("theme3_kpi_values.json").write_text(
    json.dumps(values, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

# 3) Guardrail analysis as Markdown: headline figures, then interpretation.
analysis = [
    "# Guardrail analysis (Theme 3)\n\n",
    f"- enrolled_users: {enrolled_users}\n",
    f"- completion_rate_m1: {values['completion_rate_m1']}\n",
    f"- support_ticket_rate: {values['support_ticket_rate']}\n\n",
    "## Interpretation\n",
    "- Le guardrail (tickets support) sert à éviter d'améliorer la complétion en créant plus de friction ou de confusion.\n",
    "- Si la complétion monte mais que le support explose, le gain est probablement artificiel (ou non scalable).\n",
    "- On doit suivre segmentation (pays/device/canal) et stabilité des définitions KPI.\n",
]
Path("theme3_guardrail_analysis.md").write_text("".join(analysis), encoding="utf-8")

print("✅ Exports generated: theme3_kpi_dictionary.csv, theme3_kpi_values.json, theme3_guardrail_analysis.md")