# 02 — Modeling 3 Engines (CatBoost / LightGBM / XGBoost)

Ce notebook exécute :
- le pipeline en deux parties fréquence + gravité,
- les variantes de gravité (`classic`, `weighted_tail`),
- la calibration de la fréquence (`none`, `isotonic`, `platt`),
- les évaluations sur les splits primaire et secondaire,
- l'enregistrement des artefacts (`run_registry.csv`, `oof_predictions.parquet`, `test_predictions.parquet`, `selected_configs.json`).


In [None]:
import sys
from pathlib import Path
import itertools
import json
import numpy as np
import pandas as pd

# Locate the project root: use the current working directory when it contains
# a "src" entry, otherwise fall back to its parent directory.
ROOT = Path.cwd()
ROOT = ROOT if ROOT.joinpath("src").exists() else ROOT.parent

# Put the root at the front of sys.path (once) so that `src.*` is importable.
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))

from src.v1_pipeline import (
    COARSE_CONFIGS,
    INDEX_COL,
    ensure_dir,
    load_train_test,
    prepare_datasets,
    run_cv_experiment,
    pick_top_configs,
    save_json,
)

# Input data lives under <ROOT>/data; outputs are written under <ROOT>/artifacts.
# `ensure_dir` is a project helper; its return value is used as the artifact
# directory (presumably it creates the directory if missing — TODO confirm).
DATA_DIR = ROOT.joinpath("data")
ARTIFACT_DIR = ensure_dir(ROOT.joinpath("artifacts"))

# Runtime controls
RUN_FULL = False  # True => full robust run of the 3 engines (2-4h+)
QUICK_TOPK_CONFIG = 1  # in quick mode, keep only the first N config(s) per engine

# Experiment grid: the full run uses more seeds and one extra calibration
# method than the quick run.
if RUN_FULL:
    SEEDS = [42, 2026]
    SEVERITY_MODES = ["classic", "weighted_tail"]
    CALIBRATION_METHODS = ["none", "isotonic", "platt"]
else:
    SEEDS = [42]
    # NOTE: quick mode currently uses the same severity modes as the full run.
    SEVERITY_MODES = ["classic", "weighted_tail"]
    CALIBRATION_METHODS = ["none", "isotonic"]


In [None]:
def frame_to_folds(df: pd.DataFrame):
    """Convert a long-format fold table into a ``{fold_id: (train, valid)}`` mapping.

    Args:
        df: Table with the columns ``fold_id``, ``role`` and ``row_idx``.
            Rows whose ``role`` is ``"train"`` or ``"valid"`` are used; any
            other role value is ignored.

    Returns:
        A dict keyed by the fold id cast to ``int``. Each value is a tuple
        ``(train_idx, valid_idx)`` of integer NumPy arrays holding the
        ``row_idx`` values of that fold, in their original row order.
    """

    def rows_for(group, role):
        # Integer row indices of one fold that carry the requested role.
        return group.loc[group["role"] == role, "row_idx"].to_numpy(dtype=int)

    return {
        int(fold_id): (rows_for(group, "train"), rows_for(group, "valid"))
        for fold_id, group in df.groupby("fold_id")
    }

# Load the raw train/test tables and build the modeling bundle
# (identifier columns are dropped via drop_identifiers=True).
train_raw, test_raw = load_train_test(DATA_DIR)
bundle = prepare_datasets(train_raw, test_raw, drop_identifiers=True)

# Primary split: read the stored fold table and turn it into index arrays.
folds_primary_df = pd.read_parquet(ARTIFACT_DIR / "folds_primary.parquet")
folds_primary = frame_to_folds(folds_primary_df)

# Secondary split: same treatment.
folds_secondary_df = pd.read_parquet(ARTIFACT_DIR / "folds_secondary.parquet")
folds_secondary = frame_to_folds(folds_secondary_df)

# Split name -> {fold_id: (train_idx, valid_idx)}
splits = dict(
    primary_time=folds_primary,
    secondary_group=folds_secondary,
)

print("Splits loaded:", {name: len(fold_map) for name, fold_map in splits.items()})


In [None]:
# Execution loop: one cross-validated experiment per
# (split, engine, config, severity mode, seed) combination.
all_fold_metrics, all_run_metrics, all_pred_frames = [], [], []

for split_name, folds in splits.items():
    for engine, cfgs in COARSE_CONFIGS.items():
        # Quick mode only keeps the first QUICK_TOPK_CONFIG config(s) of each engine.
        engine_cfgs = cfgs if RUN_FULL else cfgs[:QUICK_TOPK_CONFIG]

        # product() varies the last iterable fastest, i.e. configs are the
        # outermost level, then severity modes, then seeds.
        for cfg, severity_mode, seed in itertools.product(
            engine_cfgs, SEVERITY_MODES, SEEDS
        ):
            print(
                f"[RUN] split={split_name} engine={engine} cfg={cfg['config_id']} "
                f"sev={severity_mode} seed={seed}"
            )

            fold_df, run_df, pred_df = run_cv_experiment(
                split_name=split_name,
                engine=engine,
                config_id=cfg["config_id"],
                freq_params=cfg["freq_params"],
                sev_params=cfg["sev_params"],
                severity_mode=severity_mode,
                calibration_methods=CALIBRATION_METHODS,
                seed=seed,
                X=bundle.X_train,
                y_freq=bundle.y_freq,
                y_sev=bundle.y_sev,
                X_test=bundle.X_test,
                cat_cols=bundle.cat_cols,
                folds=folds,
            )

            # Keep the three result frames of every run for later concatenation.
            all_fold_metrics.append(fold_df)
            all_run_metrics.append(run_df)
            all_pred_frames.append(pred_df)


In [None]:
# Stack the per-run frames collected by the execution loop. When a list is
# empty (no run was executed) fall back to an empty, column-less DataFrame.
fold_metrics = pd.concat(all_fold_metrics, ignore_index=True) if all_fold_metrics else pd.DataFrame()
run_metrics = pd.concat(all_run_metrics, ignore_index=True) if all_run_metrics else pd.DataFrame()
preds = pd.concat(all_pred_frames, ignore_index=True) if all_pred_frames else pd.DataFrame()

# The registry holds fold-level and run-level metrics in a single table.
registry = pd.concat([fold_metrics, run_metrics], ignore_index=True)
registry.to_csv(ARTIFACT_DIR / "run_registry.csv", index=False)
preds.to_parquet(ARTIFACT_DIR / "oof_predictions.parquet", index=False)

# Rows flagged is_test == 1 are additionally written to their own file.
# The empty fallback frame above has no "is_test" column, so filtering it
# would raise KeyError; an empty `preds` is therefore passed through as-is.
# A non-empty `preds` without the column still raises, exposing schema issues.
if preds.empty:
    test_preds = preds.copy()
else:
    test_preds = preds[preds["is_test"] == 1].copy()
test_preds.to_parquet(ARTIFACT_DIR / "test_predictions.parquet", index=False)

print("Saved:")
print("-", ARTIFACT_DIR / "run_registry.csv")
print("-", ARTIFACT_DIR / "oof_predictions.parquet")
print("-", ARTIFACT_DIR / "test_predictions.parquet")
print("Registry rows:", len(registry), "| Pred rows:", len(preds))


In [None]:
# Select the top configs per engine, evaluated on the primary split:
# 2 per engine for a full run, 1 per engine in quick mode.
selected_configs = pick_top_configs(
    top_k_per_engine=(2 if RUN_FULL else 1),
    split_name="primary_time",
    run_registry=run_metrics,
)

# Persist the selection, then leave it as the cell's displayed value.
save_json(selected_configs, ARTIFACT_DIR / "selected_configs.json")
selected_configs


In [None]:
# Main summary: for each split, the 20 runs with the lowest rmse_prime
# (ties broken by brier_freq, both sorted ascending).
summary_primary = (
    run_metrics.loc[run_metrics["split"].eq("primary_time")]
    .sort_values(by=["rmse_prime", "brier_freq"])
    .iloc[:20]
)
summary_secondary = (
    run_metrics.loc[run_metrics["split"].eq("secondary_group")]
    .sort_values(by=["rmse_prime", "brier_freq"])
    .iloc[:20]
)

print("Top primary:")
display(summary_primary)

print("Top secondary:")
display(summary_secondary)


## Artefacts produits
- `artifacts/run_registry.csv`
- `artifacts/oof_predictions.parquet`
- `artifacts/test_predictions.parquet`
- `artifacts/selected_configs.json`
