# 04 - Modeling 3 Engines V2

Benchmark principal V2 :

- Phase B : configurations « coarse » (6 par moteur en mode full, `QUICK_CFG_PER_ENGINE` par moteur en mode rapide) ;
- Phase C : robustesse multi-seeds sur les meilleures configurations.


In [1]:
import sys
from pathlib import Path
import pandas as pd

# Resolve the project root: the current working directory when it contains
# a `src` entry, otherwise its parent directory.
_cwd = Path.cwd()
ROOT = _cwd if (_cwd / "src").exists() else _cwd.parent

# Put the root at the front of sys.path (once) so that `src` can be imported.
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from src.v2_pipeline import (
    DEFAULT_V2_DIR,
    ensure_dir,
    load_train_test,
    prepare_feature_sets,
    build_split_registry,
    run_benchmark,
    V2_COARSE_CONFIGS,
)

# Input data folder, resolved relative to the project root.
DATA_DIR = ROOT / "data"
# V2 artifact folder; ensure_dir's return value is used as the output path
# (presumably it creates the directory if missing -- confirm in src.v2_pipeline).
ARTIFACT_V2 = ensure_dir(ROOT / DEFAULT_V2_DIR)

# Run-mode switch: set to True for the full overnight run (8-12h).
RUN_FULL = False      # True for the overnight run (8-12h)
# Number of coarse configs evaluated per engine when RUN_FULL is False.
QUICK_CFG_PER_ENGINE = 1
# NOTE(review): not referenced in the visible cells; presumably the number of
# top configs kept per engine in quick mode for a later phase -- confirm.
QUICK_TOP2_PER_ENGINE = 1


In [2]:
# Load the raw train/test data, build the feature sets and keep "base_v2".
train_raw, test_raw = load_train_test(DATA_DIR)
feature_sets = prepare_feature_sets(
    train_raw,
    test_raw,
    rare_min_count=30,
    drop_identifiers=True,
)
bundle = feature_sets["base_v2"]

# Validation splits are built from the training data only, grouped on id_client.
splits = build_split_registry(
    train_raw,
    n_blocks_time=5,
    n_splits_group=5,
    group_col="id_client",
)

# Settings shared by both run modes.
families = ["two_part_classic", "two_part_tweedie", "direct_tweedie"]
calibs = ["none", "isotonic"]

# Quick-mode grid is the default ...
sev_modes = ["classic", "weighted_tail"]
tweedie_powers = [1.5]
seeds = [42]
cfg_per_engine = QUICK_CFG_PER_ENGINE

# ... and the full run widens every axis of the grid.
if RUN_FULL:
    sev_modes = ["classic", "weighted_tail", "winsorized"]
    tweedie_powers = [1.3, 1.5, 1.7]
    seeds = [42, 2026]
    cfg_per_engine = 6


In [3]:
# Phase B benchmark: run every grid combination and collect the three
# frames returned by run_benchmark (fold-level, run-level, predictions).
all_f, all_r, all_p = [], [], []


def _iter_run_grid():
    """Yield (engine, cfg, fam, sev_mode, tw_power, seed) in benchmark order."""
    for engine_name in ("catboost", "lightgbm", "xgboost"):
        # Only the first `cfg_per_engine` coarse configs of each engine are run.
        for engine_cfg in V2_COARSE_CONFIGS[engine_name][:cfg_per_engine]:
            for family in families:
                # Only two_part_tweedie sweeps the power grid; the other
                # families are run with a single power of 1.5.
                powers = tweedie_powers if family == "two_part_tweedie" else [1.5]
                for mode in sev_modes:
                    # direct_tweedie is only run with the "classic" severity mode.
                    if family == "direct_tweedie" and mode != "classic":
                        continue
                    for power in powers:
                        for run_seed in seeds:
                            yield engine_name, engine_cfg, family, mode, power, run_seed


for engine, cfg, fam, sev_mode, tw_power, seed in _iter_run_grid():
    spec = {
        "feature_set": "base_v2",
        "engine": engine,
        "family": fam,
        "severity_mode": sev_mode,
        "tweedie_power": tw_power,
        "config_id": cfg["config_id"],
        "calibration_methods": calibs,
        # The tail mapper is switched off for the direct_tweedie family.
        "use_tail_mapper": fam != "direct_tweedie",
        "use_target_encoding": True,
        "target_encode_cols": ["code_postal", "cp3", "modele_vehicule", "marque_modele"],
        "target_encoding_smoothing": 20.0,
        "freq_params": cfg["freq_params"],
        "sev_params": cfg["sev_params"],
        "direct_params": cfg["direct_params"],
    }
    print("[run]", engine, cfg["config_id"], fam, sev_mode, tw_power, "seed", seed)
    fold_part, run_part, pred_part = run_benchmark(spec, bundle=bundle, splits=splits, seed=seed)
    all_f.append(fold_part)
    all_r.append(run_part)
    all_p.append(pred_part)

# Stack the per-run frames into one table per level.
fold_df, run_df, pred_df = (
    pd.concat(frames, ignore_index=True) for frames in (all_f, all_r, all_p)
)

# Persist the run registry and the predictions under the V2 artifact folder.
registry_path = ARTIFACT_V2 / "run_registry_v2.csv"
oof_path = ARTIFACT_V2 / "oof_predictions_v2.parquet"
run_df.to_csv(registry_path, index=False)
pred_df.to_parquet(oof_path, index=False)
print("saved:", registry_path)
print("saved:", oof_path)

# Cell output: first 50 rows ordered by split, then ascending rmse_prime.
run_df.sort_values(["split", "rmse_prime"]).head(50)


[run] catboost cb_v2_c1 two_part_classic classic 1.5 seed 42
[run] catboost cb_v2_c1 two_part_classic weighted_tail 1.5 seed 42
[run] catboost cb_v2_c1 two_part_tweedie classic 1.5 seed 42
[run] catboost cb_v2_c1 two_part_tweedie weighted_tail 1.5 seed 42
[run] catboost cb_v2_c1 direct_tweedie classic 1.5 seed 42
[run] lightgbm lgb_v2_c1 two_part_classic classic 1.5 seed 42
[run] lightgbm lgb_v2_c1 two_part_classic weighted_tail 1.5 seed 42
[run] lightgbm lgb_v2_c1 two_part_tweedie classic 1.5 seed 42
[run] lightgbm lgb_v2_c1 two_part_tweedie weighted_tail 1.5 seed 42
[run] lightgbm lgb_v2_c1 direct_tweedie classic 1.5 seed 42
[run] xgboost xgb_v2_c1 two_part_classic classic 1.5 seed 42
[run] xgboost xgb_v2_c1 two_part_classic weighted_tail 1.5 seed 42
[run] xgboost xgb_v2_c1 two_part_tweedie classic 1.5 seed 42
[run] xgboost xgb_v2_c1 two_part_tweedie weighted_tail 1.5 seed 42
[run] xgboost xgb_v2_c1 direct_tweedie classic 1.5 seed 42
saved: c:\Users\icemo\Downloads\Calcul-prime-d-ass

Unnamed: 0,level,split,feature_set,engine,family,config_id,seed,severity_mode,calibration,tail_mapper,fold_id,n_valid,rmse_prime,auc_freq,brier_freq,rmse_sev_pos,q99_ratio_pos,run_id
17,run,aux_blocked5,base_v2,catboost,two_part_tweedie,cb_v2_c1,42,classic,isotonic,isotonic,-1,50000,542.571435,0.635468,0.054255,1461.456772,0.283293,base_v2|catboost|two_part_tweedie|cb_v2_c1|42|...
16,run,aux_blocked5,base_v2,catboost,two_part_tweedie,cb_v2_c1,42,classic,none,isotonic,-1,50000,542.575544,0.638932,0.05426,1461.456772,0.283293,base_v2|catboost|two_part_tweedie|cb_v2_c1|42|...
11,run,aux_blocked5,base_v2,catboost,two_part_classic,cb_v2_c1,42,weighted_tail,isotonic,isotonic,-1,50000,542.577648,0.635468,0.054255,1462.643524,0.297873,base_v2|catboost|two_part_classic|cb_v2_c1|42|...
10,run,aux_blocked5,base_v2,catboost,two_part_classic,cb_v2_c1,42,weighted_tail,none,isotonic,-1,50000,542.578502,0.638932,0.05426,1462.643524,0.297873,base_v2|catboost|two_part_classic|cb_v2_c1|42|...
23,run,aux_blocked5,base_v2,catboost,two_part_tweedie,cb_v2_c1,42,weighted_tail,isotonic,isotonic,-1,50000,542.595859,0.635468,0.054255,1461.475757,0.298799,base_v2|catboost|two_part_tweedie|cb_v2_c1|42|...
22,run,aux_blocked5,base_v2,catboost,two_part_tweedie,cb_v2_c1,42,weighted_tail,none,isotonic,-1,50000,542.611299,0.638932,0.05426,1461.475757,0.298799,base_v2|catboost|two_part_tweedie|cb_v2_c1|42|...
5,run,aux_blocked5,base_v2,catboost,two_part_classic,cb_v2_c1,42,classic,isotonic,isotonic,-1,50000,542.670668,0.635468,0.054255,1464.672223,0.294504,base_v2|catboost|two_part_classic|cb_v2_c1|42|...
4,run,aux_blocked5,base_v2,catboost,two_part_classic,cb_v2_c1,42,classic,none,isotonic,-1,50000,542.681086,0.638932,0.05426,1464.672223,0.294504,base_v2|catboost|two_part_classic|cb_v2_c1|42|...
77,run,aux_blocked5,base_v2,xgboost,two_part_tweedie,xgb_v2_c1,42,weighted_tail,isotonic,isotonic,-1,50000,542.841891,0.630252,0.054308,1461.418314,0.303062,base_v2|xgboost|two_part_tweedie|xgb_v2_c1|42|...
65,run,aux_blocked5,base_v2,xgboost,two_part_classic,xgb_v2_c1,42,weighted_tail,isotonic,isotonic,-1,50000,542.842109,0.630252,0.054308,1462.489815,0.283711,base_v2|xgboost|two_part_classic|xgb_v2_c1|42|...
