# 02 — Modeling 3 Engines (CatBoost / LightGBM / XGBoost)

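Ce notebook exécute :
- le pipeline en deux parties (fréquence + gravité) pour les trois moteurs,
- les variantes de gravité (`classic`, `weighted_tail`),
- la calibration de la fréquence (`none`, `isotonic`, `platt` — `platt` uniquement lorsque `RUN_FULL = True`),
- les évaluations sur les splits primaire (`primary_time`) et secondaire (`secondary_group`),
- l'enregistrement des artefacts (`run_registry.csv`, `oof_predictions.parquet`, `test_predictions.parquet`, `selected_configs.json`).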


In [1]:
import sys
from pathlib import Path
import itertools
import json
import numpy as np
import pandas as pd

# Resolve the project root: the current working directory when it contains
# `src/`, otherwise its parent directory.
_cwd = Path.cwd()
ROOT = _cwd if (_cwd / "src").exists() else _cwd.parent

# Put the root at the front of the module search path (once) so that the
# `src` package can be imported below.
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from src.v1_pipeline import (
    COARSE_CONFIGS,
    INDEX_COL,
    ensure_dir,
    load_train_test,
    prepare_datasets,
    run_cv_experiment,
    pick_top_configs,
    save_json,
)

# Input data is read from <ROOT>/data.
DATA_DIR = ROOT.joinpath("data")
# Run outputs go to <ROOT>/artifacts; the path used is whatever the project
# helper `ensure_dir` returns (presumably it creates the directory when
# missing — confirm in src.v1_pipeline).
ARTIFACT_DIR = ensure_dir(ROOT.joinpath("artifacts"))

# Runtime controls
RUN_FULL = False       # True => full robust run of all 3 engines (2-4h+)
QUICK_TOPK_CONFIG = 1  # quick mode: keep only the first N configs per engine

# Quick mode trims the seeds and the calibration methods only.
SEEDS = [42, 2026] if RUN_FULL else [42]
# Both severity variants run in either mode; the previous conditional had two
# identical branches, which wrongly suggested quick mode reduced this list.
SEVERITY_MODES = ["classic", "weighted_tail"]
CALIBRATION_METHODS = ["none", "isotonic", "platt"] if RUN_FULL else ["none", "isotonic"]


In [2]:
def frame_to_folds(df: pd.DataFrame):
    """Convert a long-format fold table into a per-fold index mapping.

    Parameters
    ----------
    df : pd.DataFrame
        Table with the columns ``fold_id``, ``role`` and ``row_idx``. Only
        rows whose ``role`` is ``"train"`` or ``"valid"`` are used.

    Returns
    -------
    dict
        ``{int(fold_id): (train_row_idx, valid_row_idx)}`` where both members
        are integer NumPy arrays, in the row order of ``df``.
    """

    def _row_indices(fold_rows: pd.DataFrame, role: str):
        # Row indices of one fold restricted to a single role.
        return fold_rows.loc[fold_rows["role"] == role, "row_idx"].to_numpy(dtype=int)

    return {
        int(fold_key): (_row_indices(fold_rows, "train"), _row_indices(fold_rows, "valid"))
        for fold_key, fold_rows in df.groupby("fold_id")
    }

# Load the raw train/test tables and build the modeling bundle.
train_raw, test_raw = load_train_test(DATA_DIR)
bundle = prepare_datasets(train_raw, test_raw, drop_identifiers=True)

# Fold assignments are read from parquet files in ARTIFACT_DIR
# (primary first, then secondary).
folds_primary_df, folds_secondary_df = (
    pd.read_parquet(ARTIFACT_DIR / parquet_name)
    for parquet_name in ("folds_primary.parquet", "folds_secondary.parquet")
)

# Turn each long-format table into {fold_id: (train_idx, valid_idx)}.
folds_primary = frame_to_folds(folds_primary_df)
folds_secondary = frame_to_folds(folds_secondary_df)

# Split name -> fold mapping, iterated by the execution loop.
splits = {"primary_time": folds_primary, "secondary_group": folds_secondary}

fold_counts = {split_key: len(fold_map) for split_key, fold_map in splits.items()}
print("Splits loaded:", fold_counts)


Splits loaded: {'primary_time': 4, 'secondary_group': 5}


In [3]:
# Execution loop: one CV experiment per
# (split, engine, config, severity mode, seed) combination.
all_fold_metrics = []
all_run_metrics = []
all_pred_frames = []

# Inputs that are identical for every experiment.
shared_inputs = dict(
    X=bundle.X_train,
    y_freq=bundle.y_freq,
    y_sev=bundle.y_sev,
    X_test=bundle.X_test,
    cat_cols=bundle.cat_cols,
    calibration_methods=CALIBRATION_METHODS,
)

for split_name, folds in splits.items():
    for engine, cfgs in COARSE_CONFIGS.items():
        # Quick mode restricts each engine to its first QUICK_TOPK_CONFIG configs.
        engine_cfgs = cfgs if RUN_FULL else cfgs[:QUICK_TOPK_CONFIG]

        # product() varies the last factor fastest, i.e. the same order as
        # nested loops over config -> severity mode -> seed.
        for cfg, severity_mode, seed in itertools.product(engine_cfgs, SEVERITY_MODES, SEEDS):
            print(
                f"[RUN] split={split_name} engine={engine} cfg={cfg['config_id']} "
                f"sev={severity_mode} seed={seed}"
            )

            fold_df, run_df, pred_df = run_cv_experiment(
                split_name=split_name,
                engine=engine,
                config_id=cfg["config_id"],
                folds=folds,
                seed=seed,
                severity_mode=severity_mode,
                freq_params=cfg["freq_params"],
                sev_params=cfg["sev_params"],
                **shared_inputs,
            )

            # Collect the three result frames; they are concatenated after the loop.
            all_fold_metrics.append(fold_df)
            all_run_metrics.append(run_df)
            all_pred_frames.append(pred_df)


[RUN] split=primary_time engine=catboost cfg=cb_c1 sev=classic seed=42
[RUN] split=primary_time engine=catboost cfg=cb_c1 sev=weighted_tail seed=42
[RUN] split=primary_time engine=lightgbm cfg=lgb_c1 sev=classic seed=42
[LightGBM] [Info] Number of positive: 583, number of negative: 9417
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003402 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2994
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058300 -> initscore=-2.782085
[LightGBM] [Info] Start training from score -2.782085
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1748
[LightGBM] [Info] Number of data points in the train set: 583, number of used features: 37
[L

In [4]:
# Fail fast when the execution loop produced nothing. Without this guard the
# cell would overwrite the existing artefacts with empty frames and only then
# fail with a KeyError on the missing "is_test" column.
if not (all_fold_metrics and all_run_metrics and all_pred_frames):
    raise RuntimeError(
        "No experiment results to save: the execution loop produced no runs. "
        "Check `splits`, COARSE_CONFIGS, SEVERITY_MODES and SEEDS."
    )

# Stack the per-experiment frames into one table per kind.
fold_metrics = pd.concat(all_fold_metrics, ignore_index=True)
run_metrics = pd.concat(all_run_metrics, ignore_index=True)
preds = pd.concat(all_pred_frames, ignore_index=True)

registry_path = ARTIFACT_DIR / "run_registry.csv"
oof_path = ARTIFACT_DIR / "oof_predictions.parquet"
test_path = ARTIFACT_DIR / "test_predictions.parquet"

# The registry holds the fold metrics followed by the run metrics.
registry = pd.concat([fold_metrics, run_metrics], ignore_index=True)
registry.to_csv(registry_path, index=False)

# NOTE: this file receives every prediction row, including those flagged
# is_test == 1; the test rows are additionally written to their own file below.
preds.to_parquet(oof_path, index=False)

test_preds = preds[preds["is_test"] == 1].copy()
test_preds.to_parquet(test_path, index=False)

print("Saved:")
print("-", registry_path)
print("-", oof_path)
print("-", test_path)
print("Registry rows:", len(registry), "| Pred rows:", len(preds))


Saved:
- c:\Users\icemo\Downloads\Calcul-prime-d-assurance\artifacts\run_registry.csv
- c:\Users\icemo\Downloads\Calcul-prime-d-assurance\artifacts\oof_predictions.parquet
- c:\Users\icemo\Downloads\Calcul-prime-d-assurance\artifacts\test_predictions.parquet
Registry rows: 78 | Pred rows: 2400000


In [5]:
# Select the top configurations per engine, ranked on the primary split only.
# The full run keeps two configs per engine, the quick run keeps one.
top_k_per_engine = 2 if RUN_FULL else 1

selected_configs = pick_top_configs(
    run_registry=run_metrics,
    split_name="primary_time",
    top_k_per_engine=top_k_per_engine,
)

# Persist the selection, then display it as the cell output.
save_json(selected_configs, ARTIFACT_DIR / "selected_configs.json")
selected_configs


{'catboost': ['cb_c1'], 'lightgbm': ['lgb_c1'], 'xgboost': ['xgb_c1']}

In [6]:
# Main summary: the 20 best rows of `run_metrics` for each split.
def _top_runs(split_label, limit=20):
    """Return the first `limit` rows of `run_metrics` for one split.

    Rows are sorted in ascending order of `rmse_prime`, with ties broken by
    `brier_freq`.
    """
    in_split = run_metrics["split"] == split_label
    return run_metrics.loc[in_split].sort_values(["rmse_prime", "brier_freq"]).head(limit)


summary_primary = _top_runs("primary_time")
summary_secondary = _top_runs("secondary_group")

for heading, table in (
    ("Top primary:", summary_primary),
    ("Top secondary:", summary_secondary),
):
    print(heading)
    display(table)


Top primary:


Unnamed: 0,level,split,engine,config_id,seed,severity_mode,calibration,fold_id,n_valid,rmse_prime,auc_freq,brier_freq,rmse_sev_pos,q99_ratio_pos
0,run,primary_time,catboost,cb_c1,42,classic,none,-1,40000,542.672558,0.650693,0.0541,1476.722719,0.309563
1,run,primary_time,catboost,cb_c1,42,classic,isotonic,-1,40000,542.767388,0.646389,0.054155,1476.722719,0.309563
2,run,primary_time,catboost,cb_c1,42,weighted_tail,none,-1,40000,542.84716,0.650693,0.0541,1497.912776,0.417464
3,run,primary_time,catboost,cb_c1,42,weighted_tail,isotonic,-1,40000,543.14748,0.646389,0.054155,1497.912776,0.417464
9,run,primary_time,xgboost,xgb_c1,42,classic,isotonic,-1,40000,544.766971,0.58479,0.054695,1519.866915,0.405162
11,run,primary_time,xgboost,xgb_c1,42,weighted_tail,isotonic,-1,40000,544.79465,0.58479,0.054695,1522.715918,0.38757
5,run,primary_time,lightgbm,lgb_c1,42,classic,isotonic,-1,40000,545.393075,0.559066,0.055,1531.682148,0.43862
7,run,primary_time,lightgbm,lgb_c1,42,weighted_tail,isotonic,-1,40000,545.562968,0.559066,0.055,1542.243647,0.442006
8,run,primary_time,xgboost,xgb_c1,42,classic,none,-1,40000,551.03533,0.593412,0.057041,1519.866915,0.405162
10,run,primary_time,xgboost,xgb_c1,42,weighted_tail,none,-1,40000,551.267308,0.593412,0.057041,1522.715918,0.38757


Top secondary:


Unnamed: 0,level,split,engine,config_id,seed,severity_mode,calibration,fold_id,n_valid,rmse_prime,auc_freq,brier_freq,rmse_sev_pos,q99_ratio_pos
12,run,secondary_group,catboost,cb_c1,42,classic,none,-1,50000,542.01752,0.655801,0.054028,1471.56785,0.332069
13,run,secondary_group,catboost,cb_c1,42,classic,isotonic,-1,50000,542.029124,0.652661,0.054051,1471.56785,0.332069
14,run,secondary_group,catboost,cb_c1,42,weighted_tail,none,-1,50000,542.319002,0.655801,0.054028,1499.233794,0.430018
15,run,secondary_group,catboost,cb_c1,42,weighted_tail,isotonic,-1,50000,542.381941,0.652661,0.054051,1499.233794,0.430018
23,run,secondary_group,xgboost,xgb_c1,42,weighted_tail,isotonic,-1,50000,543.902329,0.606564,0.054487,1511.203985,0.385313
21,run,secondary_group,xgboost,xgb_c1,42,classic,isotonic,-1,50000,543.905458,0.606564,0.054487,1504.884285,0.376033
17,run,secondary_group,lightgbm,lgb_c1,42,classic,isotonic,-1,50000,544.285919,0.600994,0.054562,1517.57899,0.407671
19,run,secondary_group,lightgbm,lgb_c1,42,weighted_tail,isotonic,-1,50000,544.29244,0.600994,0.054562,1515.514474,0.407818
20,run,secondary_group,xgboost,xgb_c1,42,classic,none,-1,50000,547.920143,0.610147,0.056042,1504.884285,0.376033
22,run,secondary_group,xgboost,xgb_c1,42,weighted_tail,none,-1,50000,547.932015,0.610147,0.056042,1511.203985,0.385313


## Artefacts produits
- `artifacts/run_registry.csv`
- `artifacts/oof_predictions.parquet`
- `artifacts/test_predictions.parquet`
- `artifacts/selected_configs.json`
