# 02 - Feature Engineering V2

Notebook de comparaison des feature sets (`base_v2`, `robust_v2`, `compact_v2`)
avec un benchmark rapide cohérent anti-overfitting.


In [1]:
import sys
from pathlib import Path
import pandas as pd

# Resolve the project root: use the current working directory when it
# contains a "src" entry, otherwise assume the notebook was started one
# level below the root and use the parent directory instead.
ROOT = Path.cwd()
ROOT = ROOT if ROOT.joinpath("src").exists() else ROOT.parent

# Put the root first on sys.path (only once) so that `src` is importable.
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))

from src.v2_pipeline import (
    DEFAULT_V2_DIR,
    ensure_dir,
    load_train_test,
    prepare_feature_sets,
    build_split_registry,
    run_benchmark,
    V2_COARSE_CONFIGS,
)
# Input data lives under <ROOT>/data.
DATA_DIR = ROOT.joinpath("data")
# Artifact folder for this notebook; ensure_dir's return value is kept as the
# path (presumably it also creates the directory - confirm in src.v2_pipeline).
ARTIFACT_V2 = ensure_dir(ROOT.joinpath(DEFAULT_V2_DIR))


In [2]:
# Load the raw train/test data from DATA_DIR.
train_raw, test_raw = load_train_test(DATA_DIR)

# Build the feature-set bundles that are compared below.
# NOTE(review): rare_min_count / drop_identifiers semantics are defined in
# src.v2_pipeline - confirm there before tuning these values.
feature_sets = prepare_feature_sets(
    train_raw,
    test_raw,
    rare_min_count=30,
    drop_identifiers=True,
)

# Validation splits shared by every feature set, built from the raw train data
# with "id_client" as the grouping column.
splits = build_split_registry(
    train_raw,
    n_blocks_time=5,
    n_splits_group=5,
    group_col="id_client",
)

# Benchmark specification reused for every feature set; only "feature_set"
# is added per run in the benchmark loop.
spec_template = dict(
    engine="catboost",
    family="two_part_classic",
    config_id="cb_v2_c1",
    severity_mode="classic",
    calibration_methods=["none", "isotonic"],
    use_tail_mapper=True,
    use_target_encoding=True,
    target_encode_cols=["code_postal", "cp3", "modele_vehicule", "marque_modele"],
    target_encoding_smoothing=20.0,
)
# Model hyper-parameters come from the first coarse CatBoost configuration.
# The values are referenced, not copied, exactly as in a plain dict literal.
spec_template.update(
    {
        param_key: V2_COARSE_CONFIGS["catboost"][0][param_key]
        for param_key in ("freq_params", "sev_params", "direct_params")
    }
)


In [3]:
# Run the same benchmark spec on each feature set and collect the run-level
# result frames.
rows = []
for fs_name, bundle in feature_sets.items():
    # Shallow copy of the template with the feature-set name added.
    spec = {**spec_template, "feature_set": fs_name}
    fold_df, run_df, pred_df = run_benchmark(
        spec,
        bundle=bundle,
        splits=splits,
        seed=42,
    )
    # Tag the results so rows stay attributable after concatenation.
    run_df["feature_set"] = fs_name
    rows.append(run_df)
    print(f"{fs_name} rows {run_df.shape[0]}")

# Stack all results and persist them next to the other V2 artifacts.
comp = pd.concat(rows, ignore_index=True)
comp.to_csv(ARTIFACT_V2 / "feature_set_comparison_v2.csv", index=False)

# Show the run-level rows, ordered by split and then by increasing rmse_prime.
display(
    comp.loc[comp["level"].eq("run")]
    .sort_values(by=["split", "rmse_prime"])
    .head(30)
)


base_v2 rows 6
robust_v2 rows 6
compact_v2 rows 6


Unnamed: 0,level,split,feature_set,engine,family,config_id,seed,severity_mode,calibration,tail_mapper,fold_id,n_valid,rmse_prime,auc_freq,brier_freq,rmse_sev_pos,q99_ratio_pos,run_id
16,run,aux_blocked5,compact_v2,catboost,two_part_classic,cb_v2_c1,42,classic,none,isotonic,-1,50000,542.253302,0.645036,0.054159,1459.179174,0.300804,compact_v2|catboost|two_part_classic|cb_v2_c1|...
10,run,aux_blocked5,robust_v2,catboost,two_part_classic,cb_v2_c1,42,classic,none,isotonic,-1,50000,542.413466,0.644453,0.054169,1462.087731,0.301773,robust_v2|catboost|two_part_classic|cb_v2_c1|4...
11,run,aux_blocked5,robust_v2,catboost,two_part_classic,cb_v2_c1,42,classic,isotonic,isotonic,-1,50000,542.434331,0.641505,0.054174,1462.087731,0.301773,robust_v2|catboost|two_part_classic|cb_v2_c1|4...
17,run,aux_blocked5,compact_v2,catboost,two_part_classic,cb_v2_c1,42,classic,isotonic,isotonic,-1,50000,542.583132,0.641581,0.05426,1459.179174,0.300804,compact_v2|catboost|two_part_classic|cb_v2_c1|...
5,run,aux_blocked5,base_v2,catboost,two_part_classic,cb_v2_c1,42,classic,isotonic,isotonic,-1,50000,542.670668,0.635468,0.054255,1464.672223,0.294504,base_v2|catboost|two_part_classic|cb_v2_c1|42|...
4,run,aux_blocked5,base_v2,catboost,two_part_classic,cb_v2_c1,42,classic,none,isotonic,-1,50000,542.681086,0.638932,0.05426,1464.672223,0.294504,base_v2|catboost|two_part_classic|cb_v2_c1|42|...
7,run,primary_time,robust_v2,catboost,two_part_classic,cb_v2_c1,42,classic,isotonic,isotonic,-1,40000,543.708272,0.617448,0.054455,1464.488487,0.350881,robust_v2|catboost|two_part_classic|cb_v2_c1|4...
6,run,primary_time,robust_v2,catboost,two_part_classic,cb_v2_c1,42,classic,none,isotonic,-1,40000,543.748216,0.621996,0.054481,1464.488487,0.350881,robust_v2|catboost|two_part_classic|cb_v2_c1|4...
13,run,primary_time,compact_v2,catboost,two_part_classic,cb_v2_c1,42,classic,isotonic,isotonic,-1,40000,543.843378,0.617796,0.054461,1469.868856,0.314633,compact_v2|catboost|two_part_classic|cb_v2_c1|...
12,run,primary_time,compact_v2,catboost,two_part_classic,cb_v2_c1,42,classic,none,isotonic,-1,40000,543.891459,0.621358,0.054496,1469.868856,0.314633,compact_v2|catboost|two_part_classic|cb_v2_c1|...
