# 02 - Feature engineering V2

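Objectifs :
- réaliser des ablations par blocs de features (notamment avec / sans target encoding),
- comparer les jeux de features `base_v2`, `robust_v2` et `compact_v2`,
- mesurer l'impact sur la RMSE et sur la couverture de la queue de distribution.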


In [None]:
import sys
from pathlib import Path
import pandas as pd

# Locate the project root: keep the current working directory when it
# contains a "src" entry, otherwise fall back to its parent directory
# (presumably because the notebook is run from a sub-folder of the project).
ROOT = Path.cwd()
ROOT = ROOT if (ROOT / "src").exists() else ROOT.parent

# Put the root at the front of sys.path, unless it is already listed, so the
# `src` package can be imported by the cells below.
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))

from src.v2_pipeline import (
    DEFAULT_V2_DIR,
    ensure_dir,
    load_train_test,
    prepare_feature_sets,
    build_split_registry,
    run_benchmark,
    build_prediction_distribution_table,
    V2_COARSE_CONFIGS,
)

# Folder holding the input data, resolved from the project root.
DATA_DIR = ROOT.joinpath("data")
# Folder receiving the V2 artifacts written by this notebook.
# NOTE(review): ensure_dir presumably creates the folder when missing and
# returns its path — confirm in src.v2_pipeline.
ARTIFACT_V2 = ensure_dir(ROOT.joinpath(DEFAULT_V2_DIR))


In [None]:
# Load the raw train/test tables and derive the candidate feature sets from
# them.
train_raw, test_raw = load_train_test(DATA_DIR)
feature_sets = prepare_feature_sets(
    train_raw,
    test_raw,
    rare_min_count=30,
    drop_identifiers=True,
)

# Build the full split registry, then keep only the "primary_time" entry so
# every experiment below is evaluated on that single split.
splits = build_split_registry(
    train_raw,
    n_blocks_time=5,
    n_splits_group=5,
    group_col="id_client",
)
primary_only = dict(primary_time=splits["primary_time"])

# One entry per ablation: (experiment name, feature set key, target encoding
# switch). "base_v2" is run with and without target encoding; the other two
# feature sets are run with it only.
experiments = [
    {"name": name, "feature_set": feature_set, "use_te": use_te}
    for name, feature_set, use_te in (
        ("base_with_te", "base_v2", True),
        ("base_no_te", "base_v2", False),
        ("robust_with_te", "robust_v2", True),
        ("compact_with_te", "compact_v2", True),
    )
]


In [None]:
# Accumulators: one run-metrics table and one prediction-distribution table
# per experiment.
rows_run = []
rows_dist = []
# Every experiment shares the first coarse CatBoost configuration.
cfg = V2_COARSE_CONFIGS["catboost"][0]


def _make_spec(experiment):
    """Return a fresh benchmark spec for one entry of `experiments`.

    Only the feature set and the target-encoding switch vary between
    experiments; every other setting is fixed and taken from `cfg` or from
    the constants below.
    """
    return {
        "feature_set": experiment["feature_set"],
        "engine": "catboost",
        "family": "two_part_classic",
        "severity_mode": "weighted_tail",
        "tweedie_power": 1.5,
        "config_id": cfg["config_id"],
        "calibration_methods": ["none", "isotonic"],
        "use_tail_mapper": True,
        "use_target_encoding": experiment["use_te"],
        "target_encode_cols": ["code_postal", "cp3", "modele_vehicule", "marque_modele"],
        "target_encoding_smoothing": 20.0,
        "freq_params": cfg["freq_params"],
        "sev_params": cfg["sev_params"],
        "direct_params": cfg["direct_params"],
        "split_names": ["primary_time"],
    }


for experiment in experiments:
    label = experiment["name"]

    # The first element returned by run_benchmark is not used here.
    _, run_table, predictions = run_benchmark(
        _make_spec(experiment),
        bundle=feature_sets[experiment["feature_set"]],
        splits=primary_only,
        seed=42,
    )
    # Tag the rows so they stay identifiable after concatenation.
    run_table["experiment"] = label
    rows_run.append(run_table)

    dist_table = build_prediction_distribution_table(predictions)
    dist_table["experiment"] = label
    rows_dist.append(dist_table)

    print("done", label, "rows", len(run_table))

# Stack the per-experiment tables into two comparison tables.
run_cmp = pd.concat(rows_run, ignore_index=True)
dist_cmp = pd.concat(rows_dist, ignore_index=True)

# Persist the run metrics alongside the other V2 artifacts.
run_cmp.to_csv(ARTIFACT_V2 / "feature_set_comparison_v2.csv", index=False)

# Show the run-level rows, lowest rmse_prime first and, on ties, highest
# q99_ratio_pos first.
run_level = run_cmp.loc[run_cmp["level"] == "run"]
ranked = run_level.sort_values(
    by=["rmse_prime", "q99_ratio_pos"],
    ascending=[True, False],
)
display(ranked.head(20))


In [None]:
# Distribution rows restricted to sample == "oof" on the "primary_time" split,
# reduced to the quantile columns and the collapse flag.
is_oof_primary = (dist_cmp["sample"] == "oof") & (dist_cmp["split"] == "primary_time")
dist_columns = [
    "run_id",
    "experiment",
    "pred_q90",
    "pred_q99",
    "pred_q99_q90_ratio",
    "distribution_collapse_flag",
]
q = dist_cmp.loc[is_oof_primary, dist_columns]

# Run-level metrics, left-joined with the distribution columns so every run
# row is kept even when it has no matching distribution row.
metric_columns = ["run_id", "experiment", "rmse_prime", "q99_ratio_pos"]
run_metrics = run_cmp.loc[run_cmp["level"] == "run", metric_columns]
view = run_metrics.merge(q, on=["run_id", "experiment"], how="left")

# Rank by lowest rmse_prime, then by highest pred_q99_q90_ratio.
view = view.sort_values(
    by=["rmse_prime", "pred_q99_q90_ratio"],
    ascending=[True, False],
)
display(view.head(20))
