# 04 - Modeling 3 engines V2.1

Phases:
- A: screening déjà fait dans le notebook 03,
- B: coarse search multi-configs,
- C: finalists robustesse multi-seeds.


In [1]:
import sys
from pathlib import Path
import pandas as pd

# Resolve the project root: the working directory when it contains `src`,
# otherwise its parent directory.
_cwd = Path.cwd()
ROOT = _cwd if _cwd.joinpath("src").exists() else _cwd.parent

# Put the root at the front of sys.path (once) so the `src` package imports.
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from src.v2_pipeline import (
    DEFAULT_V2_DIR,
    ensure_dir,
    load_train_test,
    prepare_feature_sets,
    build_split_registry,
    run_benchmark,
    build_prediction_distribution_table,
    V2_COARSE_CONFIGS,
)

# Input data and V2 artifact locations, both resolved under the project root.
DATA_DIR = ROOT.joinpath("data")
ARTIFACT_V2 = ensure_dir(ROOT.joinpath(DEFAULT_V2_DIR))

# Search-budget switch: True selects the full grid (original note: overnight,
# 10-14h); set it to False to run the reduced grid instead.
RUN_FULL = True
# Number of coarse configs kept per engine when RUN_FULL is False.
QUICK_CFG_PER_ENGINE = 3


In [2]:
# Load the raw train/test tables from the data directory.
train_raw, test_raw = load_train_test(DATA_DIR)

# Feature-set bundle handed to run_benchmark as `bundle`.
# NOTE(review): rare_min_count presumably sets the rare-category threshold;
# confirm against src.v2_pipeline.
feature_sets = prepare_feature_sets(
    train_raw,
    test_raw,
    rare_min_count=30,
    drop_identifiers=True,
)

# Split registry handed to run_benchmark as `splits`; grouping uses the
# "id_client" column.
splits = build_split_registry(
    train_raw,
    n_blocks_time=5,
    n_splits_group=5,
    group_col="id_client",
)

# Settings shared by the full and the quick search.
feature_set_list = ["base_v2", "robust_v2", "compact_v2"]
families = ["two_part_classic", "two_part_tweedie", "direct_tweedie"]
calibrations = ["none", "isotonic"]

# Settings that depend on the search budget.
if RUN_FULL:
    # Full grid: more configs per engine, two seeds, every severity mode and
    # three Tweedie powers.
    cfg_per_engine = 6
    seeds = [42, 2026]
    sev_modes = ["classic", "weighted_tail", "winsorized"]
    tweedie_powers = [1.3, 1.5, 1.7]
else:
    # Quick grid: a single seed, no winsorized mode, one Tweedie power.
    cfg_per_engine = QUICK_CFG_PER_ENGINE
    seeds = [42]
    sev_modes = ["classic", "weighted_tail"]
    tweedie_powers = [1.5]


In [3]:
def _iter_model_grid(families, sev_modes, tweedie_powers, seeds):
    """Yield each (family, severity_mode, tweedie_power, seed) combination to run.

    Rules encoded here:
    - "direct_tweedie" is only paired with the "classic" severity mode.
    - Only "two_part_tweedie" sweeps ``tweedie_powers``; every other family
      gets the single power 1.5.
    """
    for fam in families:
        for sev_mode in sev_modes:
            if fam == "direct_tweedie" and sev_mode != "classic":
                continue
            powers = tweedie_powers if fam == "two_part_tweedie" else [1.5]
            for tw_power in powers:
                for seed in seeds:
                    yield fam, sev_mode, tw_power, seed


def _persist_results(results, suffix=""):
    """Concatenate benchmark outputs, write the artifacts, and return the frames.

    Args:
        results: non-empty list of 3-tuples as returned by ``run_benchmark``.
        suffix: text inserted before each file extension; empty for the
            canonical artifact names.

    Returns:
        Tuple ``(fold_df, run_df, pred_df, dist_df)``.

    Raises:
        ValueError: if ``results`` is empty (nothing to concatenate).
    """
    if not results:
        raise ValueError("No benchmark runs completed; nothing to aggregate or save.")

    fold_frames, run_frames, pred_frames = (list(frames) for frames in zip(*results))
    fold_df = pd.concat(fold_frames, ignore_index=True)
    run_df = pd.concat(run_frames, ignore_index=True)
    pred_df = pd.concat(pred_frames, ignore_index=True)
    dist_df = build_prediction_distribution_table(pred_df)

    run_path = ARTIFACT_V2 / f"run_registry_v2{suffix}.csv"
    pred_path = ARTIFACT_V2 / f"oof_predictions_v2{suffix}.parquet"
    dist_path = ARTIFACT_V2 / f"pred_distribution_audit_v2{suffix}.csv"

    run_df.to_csv(run_path, index=False)
    pred_df.to_parquet(pred_path, index=False)
    dist_df.to_csv(dist_path, index=False)

    for path in (run_path, pred_path, dist_path):
        print("saved:", path)

    return fold_df, run_df, pred_df, dist_df


# One entry per finished run. Each entry is appended as a single tuple so an
# interrupt can never leave the fold/run/prediction outputs misaligned.
completed = []

try:
    for engine in ["catboost", "lightgbm", "xgboost"]:
        for cfg in V2_COARSE_CONFIGS[engine][:cfg_per_engine]:
            for fam, sev_mode, tw_power, seed in _iter_model_grid(
                families, sev_modes, tweedie_powers, seeds
            ):
                spec = {
                    "feature_sets": feature_set_list,
                    "engine": engine,
                    "family": fam,
                    "severity_mode": sev_mode,
                    "tweedie_power": tw_power,
                    "config_id": cfg["config_id"],
                    "calibration_methods": calibrations,
                    "use_tail_mapper": fam != "direct_tweedie",
                    "use_target_encoding": True,
                    "target_encode_cols": ["code_postal", "cp3", "modele_vehicule", "marque_modele"],
                    "target_encoding_smoothing": 20.0,
                    "freq_params": cfg["freq_params"],
                    "sev_params": cfg["sev_params"],
                    "direct_params": cfg["direct_params"],
                }
                print("[run]", engine, cfg["config_id"], fam, sev_mode, tw_power, "seed", seed)
                completed.append(
                    run_benchmark(spec, bundle=feature_sets, splits=splits, seed=seed)
                )
except KeyboardInterrupt:
    # A full search runs for hours; keep what already finished instead of
    # losing it. Partial output uses distinct filenames so a previous complete
    # run is never overwritten, and the interrupt is re-raised so the cell
    # still stops as before.
    if completed:
        print(f"[interrupted] saving {len(completed)} completed run(s) with a '_partial' suffix")
        _persist_results(completed, suffix="_partial")
    raise

fold_df, run_df, pred_df, dist_df = _persist_results(completed)

# Per-run lists kept under their original names for any later cell using them.
all_f, all_r, all_p = (list(frames) for frames in zip(*completed))

# Show the 40 best run-level rows, sorted by whichever ranking columns exist.
base_view = run_df[run_df["level"] == "run"].copy()
sort_cols = [c for c in ["split", "rmse_prime", "selection_score"] if c in base_view.columns]
display(base_view.sort_values(sort_cols).head(40))


[run] catboost cb_v2_c1 two_part_classic classic 1.5 seed 42
[run] catboost cb_v2_c1 two_part_classic classic 1.5 seed 2026
[run] catboost cb_v2_c1 two_part_classic weighted_tail 1.5 seed 42
[run] catboost cb_v2_c1 two_part_classic weighted_tail 1.5 seed 2026
[run] catboost cb_v2_c1 two_part_classic winsorized 1.5 seed 42
[run] catboost cb_v2_c1 two_part_classic winsorized 1.5 seed 2026
[run] catboost cb_v2_c1 two_part_tweedie classic 1.3 seed 42
[run] catboost cb_v2_c1 two_part_tweedie classic 1.3 seed 2026
[run] catboost cb_v2_c1 two_part_tweedie classic 1.5 seed 42
[run] catboost cb_v2_c1 two_part_tweedie classic 1.5 seed 2026
[run] catboost cb_v2_c1 two_part_tweedie classic 1.7 seed 42
[run] catboost cb_v2_c1 two_part_tweedie classic 1.7 seed 2026
[run] catboost cb_v2_c1 two_part_tweedie weighted_tail 1.3 seed 42
[run] catboost cb_v2_c1 two_part_tweedie weighted_tail 1.3 seed 2026
[run] catboost cb_v2_c1 two_part_tweedie weighted_tail 1.5 seed 42
[run] catboost cb_v2_c1 two_part_tw

KeyboardInterrupt: 