# 04 — Modeling (3 engines) — v2 **Optimized**

This notebook is a *safer* version of `04_modeling_3_engines_v2.ipynb`.

Key changes:
- **Avoids storing predictions for every run** during search (massive memory/time sink).
- Uses a **2-stage search**: coarse → shortlist → full evaluation.
- Writes results **incrementally to disk** so you can stop/restart without losing progress.

> Tip: if you also patch `src/v2_pipeline.py` to support `collect_predictions=False`, you’ll get the biggest speedup.


In [1]:
import sys, inspect
from pathlib import Path
import pandas as pd

ROOT = Path.cwd()
if not (ROOT / "src").exists():
    ROOT = ROOT.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from src.v2_pipeline import (
    DEFAULT_V2_DIR,
    ensure_dir,
    load_train_test,
    prepare_feature_sets,
    build_split_registry,
    run_benchmark,
    select_final_models,
    V2_COARSE_CONFIGS,
)

DATA_DIR = ROOT / "data"
ARTIFACT_V2 = ensure_dir(ROOT / DEFAULT_V2_DIR)

# --- helper: backward compatible call (works even if pipeline isn't patched yet) ---
_SIG = inspect.signature(run_benchmark)
_HAS_COLLECT = "collect_predictions" in _SIG.parameters

def run_benchmark_safe(spec, *, bundle, splits, seed, collect_predictions=True):
    if _HAS_COLLECT:
        return run_benchmark(spec, bundle=bundle, splits=splits, seed=seed, collect_predictions=collect_predictions)
    # fallback: pipeline not patched -> still returns pred_df (can be huge)
    f, r, p = run_benchmark(spec, bundle=bundle, splits=splits, seed=seed)
    if not collect_predictions:
        p = p.iloc[0:0].copy()
    return f, r, p

print("collect_predictions supported by pipeline:", _HAS_COLLECT)


collect_predictions supported by pipeline: False


In [2]:
# Load data + build feature sets + split registry (same as original)
train_raw, test_raw = load_train_test(DATA_DIR)

feature_sets = prepare_feature_sets(
    train_raw,
    test_raw,
    rare_min_count=30,
    drop_identifiers=True,
)

splits = build_split_registry(train_raw, n_blocks_time=5, n_splits_group=5, group_col="id_client")

# For speed in stage 1 we only use primary_time
splits_stage1 = {"primary_time": splits["primary_time"]}

print("feature sets:", list(feature_sets.keys()))
print("splits:", list(splits.keys()))


feature sets: ['base_v2', 'robust_v2', 'compact_v2']
splits: ['primary_time', 'secondary_group', 'aux_blocked5']


In [3]:
# ----------------------------
# Stage 1 — COARSE SEARCH (fast)
# ----------------------------
# Goal: rank configs quickly.
# - 1 split only (primary_time)
# - 1 seed
# - 1 feature set (base_v2)
# - no isotonic calibration
# - no prediction collection

OUT_STAGE1 = ARTIFACT_V2 / "run_registry_stage1.csv"
DONE = set()
if OUT_STAGE1.exists():
    tmp = pd.read_csv(OUT_STAGE1)
    if "run_id" in tmp.columns:
        DONE = set(tmp["run_id"].astype(str).unique())
print("stage1 already done run_ids:", len(DONE))

cfg_per_engine = 3      # keep small
seeds = [42]
feature_set_list = ["base_v2"]
families = ["two_part_classic", "two_part_tweedie"]   # skip direct for coarse
sev_modes = ["classic", "weighted_tail"]
tweedie_powers = [1.3, 1.5]
calibrations = ["none"]

n_new = 0
for engine in ["catboost", "lightgbm", "xgboost"]:
    cfgs = V2_COARSE_CONFIGS[engine][:cfg_per_engine]
    for cfg in cfgs:
        for fam in families:
            for sev_mode in sev_modes:
                powers = tweedie_powers if fam == "two_part_tweedie" else [1.5]
                for tw_power in powers:
                    for seed in seeds:
                        spec = {
                            "feature_sets": feature_set_list,
                            "engine": engine,
                            "family": fam,
                            "severity_mode": sev_mode,
                            "tweedie_power": tw_power,
                            "config_id": cfg["config_id"],
                            "calibration_methods": calibrations,
                            "use_tail_mapper": True,
                            "use_target_encoding": False,  # turn on only later
                            "freq_params": cfg["freq_params"],
                            "sev_params": cfg["sev_params"],
                            "direct_params": cfg["direct_params"],
                            "split_names": ["primary_time"],
                        }

                        f_df, r_df, _ = run_benchmark_safe(
                            spec, bundle=feature_sets, splits=splits_stage1, seed=seed, collect_predictions=False
                        )
                        if r_df.empty:
                            continue
                        # dedupe / resume
                        new_ids = [rid for rid in r_df["run_id"].astype(str).unique() if rid not in DONE]
                        if not new_ids:
                            continue

                        # append to disk
                        header = not OUT_STAGE1.exists()
                        r_df.to_csv(OUT_STAGE1, mode="a", header=header, index=False)
                        DONE.update(new_ids)
                        n_new += 1
                        print("[stage1 saved]", engine, cfg["config_id"], fam, sev_mode, tw_power, "seed", seed)

print("stage1 newly saved batches:", n_new)
print("stage1 output:", OUT_STAGE1)


stage1 already done run_ids: 0
[stage1 saved] catboost cb_v2_c1 two_part_classic classic 1.5 seed 42
[stage1 saved] catboost cb_v2_c1 two_part_classic weighted_tail 1.5 seed 42
[stage1 saved] catboost cb_v2_c1 two_part_tweedie classic 1.3 seed 42
[stage1 saved] catboost cb_v2_c1 two_part_tweedie classic 1.5 seed 42
[stage1 saved] catboost cb_v2_c1 two_part_tweedie weighted_tail 1.3 seed 42
[stage1 saved] catboost cb_v2_c1 two_part_tweedie weighted_tail 1.5 seed 42
[stage1 saved] catboost cb_v2_c2 two_part_classic classic 1.5 seed 42
[stage1 saved] catboost cb_v2_c2 two_part_classic weighted_tail 1.5 seed 42
[stage1 saved] catboost cb_v2_c2 two_part_tweedie classic 1.3 seed 42
[stage1 saved] catboost cb_v2_c2 two_part_tweedie classic 1.5 seed 42
[stage1 saved] catboost cb_v2_c2 two_part_tweedie weighted_tail 1.3 seed 42
[stage1 saved] catboost cb_v2_c2 two_part_tweedie weighted_tail 1.5 seed 42
[stage1 saved] catboost cb_v2_c3 two_part_classic classic 1.5 seed 42
[stage1 saved] catboost

In [4]:
# Stage 1 — quick view / shortlist
stage1 = pd.read_csv(OUT_STAGE1)
view = stage1[(stage1["level"]=="run") & (stage1["split"]=="primary_time")].copy()
view = view.sort_values("rmse_prime", ascending=True)

# shortlist: top K per engine
K_PER_ENGINE = 6
short = view.groupby("engine", as_index=False).head(K_PER_ENGINE)
cols = ["engine","config_id","family","severity_mode","tweedie_power","rmse_prime"]
display(short[cols].reset_index(drop=True))


Unnamed: 0,engine,config_id,family,severity_mode,tweedie_power,rmse_prime
0,lightgbm,lgb_v2_c2,two_part_tweedie,weighted_tail,1.3,552.671646
1,lightgbm,lgb_v2_c3,two_part_tweedie,weighted_tail,1.5,552.81864
2,lightgbm,lgb_v2_c2,two_part_classic,weighted_tail,1.5,553.011233
3,lightgbm,lgb_v2_c3,two_part_tweedie,weighted_tail,1.3,553.112066
4,lightgbm,lgb_v2_c2,two_part_tweedie,classic,1.5,554.063784
5,lightgbm,lgb_v2_c2,two_part_tweedie,classic,1.3,554.075892
6,catboost,cb_v2_c2,two_part_classic,weighted_tail,1.5,555.02376
7,catboost,cb_v2_c3,two_part_tweedie,classic,1.3,555.049988
8,catboost,cb_v2_c1,two_part_tweedie,classic,1.3,555.621613
9,xgboost,xgb_v2_c1,two_part_classic,weighted_tail,1.5,555.919263


In [6]:
# ----------------------------
# Stage 2 — FULL EVAL (shortlist)
# ----------------------------
# Evaluate shortlisted candidates across:
# - all 3 splits
# - all feature sets
# - 2 seeds
# - isotonic calibration enabled
# Still: NO prediction collection (keeps it light).

OUT_STAGE2 = ARTIFACT_V2 / "run_registry_stage2.csv"
DONE2 = set()
if OUT_STAGE2.exists():
    tmp = pd.read_csv(OUT_STAGE2)
    if "run_id" in tmp.columns:
        DONE2 = set(tmp["run_id"].astype(str).unique())
print("stage2 already done run_ids:", len(DONE2))

seeds2 = [42, 2026]
feature_set_list2 = ["base_v2", "robust_v2", "compact_v2"]
calibrations2 = ["none", "isotonic"]

# candidates from stage1 shortlist
cands = short[["engine","config_id","family","severity_mode","tweedie_power"]].drop_duplicates()

n_new = 0
for row in cands.itertuples(index=False):
    engine, config_id, fam, sev_mode, tw_power = row
    # find params for this config
    cfg = next(c for c in V2_COARSE_CONFIGS[engine] if c["config_id"] == config_id)
    for seed in seeds2:
        spec = {
            "feature_sets": feature_set_list2,
            "engine": engine,
            "family": fam,
            "severity_mode": sev_mode,
            "tweedie_power": float(tw_power),
            "config_id": config_id,
            "calibration_methods": calibrations2,
            "use_tail_mapper": fam != "direct_tweedie",
            "use_target_encoding": True,  # turn on here if you want the true final behaviour
            "target_encode_cols": ["code_postal", "cp3", "modele_vehicule", "marque_modele"],
            "target_encoding_smoothing": 20.0,
            "freq_params": cfg["freq_params"],
            "sev_params": cfg["sev_params"],
            "direct_params": cfg["direct_params"],
            "split_names": list(splits.keys()),
        }

        f_df, r_df, _ = run_benchmark_safe(spec, bundle=feature_sets, splits=splits, seed=seed, collect_predictions=False)
        if r_df.empty:
            continue

        new_ids = [rid for rid in r_df["run_id"].astype(str).unique() if rid not in DONE2]
        if not new_ids:
            continue
        header = not OUT_STAGE2.exists()
        r_df.to_csv(OUT_STAGE2, mode="a", header=header, index=False)
        DONE2.update(new_ids)
        n_new += 1
        print("[stage2 saved]", engine, config_id, fam, sev_mode, tw_power, "seed", seed)

print("stage2 newly saved batches:", n_new)
print("stage2 output:", OUT_STAGE2)


stage2 already done run_ids: 90
[stage2 saved] catboost cb_v2_c3 two_part_tweedie classic 1.3 seed 2026
[stage2 saved] catboost cb_v2_c1 two_part_tweedie classic 1.3 seed 42
[stage2 saved] catboost cb_v2_c1 two_part_tweedie classic 1.3 seed 2026
[stage2 saved] xgboost xgb_v2_c1 two_part_classic weighted_tail 1.5 seed 42
[stage2 saved] xgboost xgb_v2_c1 two_part_classic weighted_tail 1.5 seed 2026
[stage2 saved] catboost cb_v2_c3 two_part_classic weighted_tail 1.5 seed 42
[stage2 saved] catboost cb_v2_c3 two_part_classic weighted_tail 1.5 seed 2026
[stage2 saved] catboost cb_v2_c2 two_part_classic classic 1.5 seed 42
[stage2 saved] catboost cb_v2_c2 two_part_classic classic 1.5 seed 2026
[stage2 saved] xgboost xgb_v2_c2 two_part_tweedie weighted_tail 1.3 seed 42
[stage2 saved] xgboost xgb_v2_c2 two_part_tweedie weighted_tail 1.3 seed 2026
[stage2 saved] catboost cb_v2_c1 two_part_classic classic 1.5 seed 42
[stage2 saved] catboost cb_v2_c1 two_part_classic classic 1.5 seed 2026
[stage2 

In [7]:
# Stage 2 — select final models (stability policy)
stage2 = pd.read_csv(OUT_STAGE2)
selected = select_final_models(stage2, risk_policy="stability_private")

cols = ["rank","accepted","selection_score","engine","config_id","family","severity_mode","tweedie_power","seed"]
display(selected[cols].head(12))


Unnamed: 0,rank,accepted,selection_score,engine,config_id,family,severity_mode,tweedie_power,seed
0,1,True,552.785281,xgboost,xgb_v2_c2,two_part_tweedie,weighted_tail,1.3,2026
1,2,True,553.036368,xgboost,xgb_v2_c2,two_part_tweedie,weighted_tail,1.3,42
2,3,True,553.062312,xgboost,xgb_v2_c2,two_part_classic,weighted_tail,1.5,2026
3,4,True,553.070789,xgboost,xgb_v2_c2,two_part_tweedie,weighted_tail,1.3,2026
4,5,True,553.122664,xgboost,xgb_v2_c2,two_part_tweedie,weighted_tail,1.3,42
5,6,True,553.316018,xgboost,xgb_v2_c2,two_part_classic,weighted_tail,1.5,2026


In [8]:
# (Optional) Stage 3 — re-run ONLY the selected models WITH predictions (for submission / audits)
# Warning: collect_predictions=True can produce large files, but now it's only for a few models.

OUT_PRED = ARTIFACT_V2 / "oof_predictions_selected_v2.parquet"

TOP_N = 6
top = selected.head(TOP_N).copy()
all_preds = []

for row in top.itertuples(index=False):
    engine = row.engine
    config_id = row.config_id
    fam = row.family
    sev_mode = row.severity_mode
    tw_power = float(row.tweedie_power)
    seed = int(row.seed)
    cfg = next(c for c in V2_COARSE_CONFIGS[engine] if c["config_id"] == config_id)

    spec = {
        "feature_sets": ["base_v2","robust_v2","compact_v2"],
        "engine": engine,
        "family": fam,
        "severity_mode": sev_mode,
        "tweedie_power": tw_power,
        "config_id": config_id,
        "calibration_methods": ["none","isotonic"],
        "use_tail_mapper": fam != "direct_tweedie",
        "use_target_encoding": True,
        "target_encode_cols": ["code_postal", "cp3", "modele_vehicule", "marque_modele"],
        "target_encoding_smoothing": 20.0,
        "freq_params": cfg["freq_params"],
        "sev_params": cfg["sev_params"],
        "direct_params": cfg["direct_params"],
        "split_names": list(splits.keys()),
    }

    _, r_df, p_df = run_benchmark_safe(spec, bundle=feature_sets, splits=splits, seed=seed, collect_predictions=True)
    all_preds.append(p_df)
    print("[pred saved]", engine, config_id, fam, sev_mode, tw_power, "seed", seed)

pred_df = pd.concat(all_preds, ignore_index=True) if all_preds else pd.DataFrame()
pred_df.to_parquet(OUT_PRED, index=False)
print("saved:", OUT_PRED, "rows:", len(pred_df))


[pred saved] xgboost xgb_v2_c2 two_part_tweedie weighted_tail 1.3 seed 2026
[pred saved] xgboost xgb_v2_c2 two_part_tweedie weighted_tail 1.3 seed 42
[pred saved] xgboost xgb_v2_c2 two_part_classic weighted_tail 1.5 seed 2026
[pred saved] xgboost xgb_v2_c2 two_part_tweedie weighted_tail 1.3 seed 2026
[pred saved] xgboost xgb_v2_c2 two_part_tweedie weighted_tail 1.3 seed 42
[pred saved] xgboost xgb_v2_c2 two_part_classic weighted_tail 1.5 seed 2026
saved: c:\Users\icemo\Downloads\Calcul-prime-d-assurance\artifacts\v2\oof_predictions_selected_v2.parquet rows: 10800000
