# 03 - Objective Screening V2

Screening des familles d'objectifs:
- `two_part_classic`,
- `two_part_tweedie`,
- `direct_tweedie`.


In [1]:
import sys
from pathlib import Path
import itertools
import pandas as pd

ROOT = Path.cwd()
if not (ROOT / "src").exists():
    ROOT = ROOT.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from src.v2_pipeline import (
    DEFAULT_V2_DIR,
    ensure_dir,
    load_train_test,
    prepare_feature_sets,
    build_split_registry,
    run_benchmark,
    V2_COARSE_CONFIGS,
    V2_SCREENING_FAMILIES,
)
DATA_DIR = ROOT / "data"
ARTIFACT_V2 = ensure_dir(ROOT / DEFAULT_V2_DIR)


In [2]:
train_raw, test_raw = load_train_test(DATA_DIR)
feature_sets = prepare_feature_sets(train_raw, test_raw, rare_min_count=30, drop_identifiers=True)
bundle = feature_sets["robust_v2"]
splits = build_split_registry(train_raw, n_blocks_time=5, n_splits_group=5, group_col="id_client")
primary_only = {"primary_time": splits["primary_time"]}

rows_f = []
rows_r = []
rows_p = []
for engine in ["catboost", "lightgbm", "xgboost"]:
    cfg = V2_COARSE_CONFIGS[engine][0]
    for fam in V2_SCREENING_FAMILIES:
        spec = {
            "feature_set": "robust_v2",
            "engine": engine,
            "family": fam["family"],
            "severity_mode": fam["severity_mode"],
            "tweedie_power": fam["tweedie_power"],
            "config_id": cfg["config_id"],
            "calibration_methods": ["none"],
            "use_tail_mapper": fam["family"] != "direct_tweedie",
            "use_target_encoding": True,
            "target_encode_cols": ["code_postal", "cp3", "modele_vehicule", "marque_modele"],
            "target_encoding_smoothing": 20.0,
            "freq_params": cfg["freq_params"],
            "sev_params": cfg["sev_params"],
            "direct_params": cfg["direct_params"],
            "split_names": ["primary_time"],
        }
        print("[screen]", engine, fam)
        f, r, p = run_benchmark(spec, bundle=bundle, splits=primary_only, seed=42)
        rows_f.append(f)
        rows_r.append(r)
        rows_p.append(p)

fold_df = pd.concat(rows_f, ignore_index=True)
run_df = pd.concat(rows_r, ignore_index=True)
pred_df = pd.concat(rows_p, ignore_index=True)
run_df.to_csv(ARTIFACT_V2 / "objective_screening_v2.csv", index=False)
run_df.sort_values("rmse_prime").head(30)


[screen] catboost {'family': 'two_part_classic', 'severity_mode': 'classic', 'tweedie_power': 1.5}
[screen] catboost {'family': 'two_part_classic', 'severity_mode': 'weighted_tail', 'tweedie_power': 1.5}
[screen] catboost {'family': 'two_part_classic', 'severity_mode': 'winsorized', 'tweedie_power': 1.5}
[screen] catboost {'family': 'two_part_tweedie', 'severity_mode': 'classic', 'tweedie_power': 1.3}
[screen] catboost {'family': 'two_part_tweedie', 'severity_mode': 'classic', 'tweedie_power': 1.5}
[screen] catboost {'family': 'two_part_tweedie', 'severity_mode': 'classic', 'tweedie_power': 1.7}
[screen] catboost {'family': 'direct_tweedie', 'severity_mode': 'classic', 'tweedie_power': 1.5}
[screen] lightgbm {'family': 'two_part_classic', 'severity_mode': 'classic', 'tweedie_power': 1.5}
[screen] lightgbm {'family': 'two_part_classic', 'severity_mode': 'weighted_tail', 'tweedie_power': 1.5}
[screen] lightgbm {'family': 'two_part_classic', 'severity_mode': 'winsorized', 'tweedie_power':

Unnamed: 0,level,split,feature_set,engine,family,config_id,seed,severity_mode,calibration,tail_mapper,fold_id,n_valid,rmse_prime,auc_freq,brier_freq,rmse_sev_pos,q99_ratio_pos,run_id
0,run,primary_time,robust_v2,catboost,two_part_classic,cb_v2_c1,42,classic,none,isotonic,-1,40000,543.748216,0.621996,0.054481,1464.488487,0.350881,robust_v2|catboost|two_part_classic|cb_v2_c1|4...
4,run,primary_time,robust_v2,catboost,two_part_tweedie,cb_v2_c1,42,classic,none,isotonic,-1,40000,543.796235,0.621996,0.054481,1466.614598,0.347035,robust_v2|catboost|two_part_tweedie|cb_v2_c1|4...
1,run,primary_time,robust_v2,catboost,two_part_classic,cb_v2_c1,42,weighted_tail,none,isotonic,-1,40000,543.810672,0.621996,0.054481,1471.22476,0.287007,robust_v2|catboost|two_part_classic|cb_v2_c1|4...
5,run,primary_time,robust_v2,catboost,two_part_tweedie,cb_v2_c1,42,classic,none,isotonic,-1,40000,543.824047,0.621996,0.054481,1468.747688,0.318084,robust_v2|catboost|two_part_tweedie|cb_v2_c1|4...
3,run,primary_time,robust_v2,catboost,two_part_tweedie,cb_v2_c1,42,classic,none,isotonic,-1,40000,543.864618,0.621996,0.054481,1465.967806,0.337991,robust_v2|catboost|two_part_tweedie|cb_v2_c1|4...
14,run,primary_time,robust_v2,xgboost,two_part_classic,xgb_v2_c1,42,classic,none,isotonic,-1,40000,543.878743,0.613938,0.054472,1472.157163,0.380448,robust_v2|xgboost|two_part_classic|xgb_v2_c1|4...
18,run,primary_time,robust_v2,xgboost,two_part_tweedie,xgb_v2_c1,42,classic,none,isotonic,-1,40000,543.891308,0.613938,0.054472,1471.031395,0.438936,robust_v2|xgboost|two_part_tweedie|xgb_v2_c1|4...
19,run,primary_time,robust_v2,xgboost,two_part_tweedie,xgb_v2_c1,42,classic,none,isotonic,-1,40000,543.94088,0.613938,0.054472,1470.69974,0.404865,robust_v2|xgboost|two_part_tweedie|xgb_v2_c1|4...
2,run,primary_time,robust_v2,catboost,two_part_classic,cb_v2_c1,42,winsorized,none,isotonic,-1,40000,543.941648,0.621996,0.054481,1479.545025,0.432342,robust_v2|catboost|two_part_classic|cb_v2_c1|4...
15,run,primary_time,robust_v2,xgboost,two_part_classic,xgb_v2_c1,42,weighted_tail,none,isotonic,-1,40000,543.946911,0.613938,0.054472,1472.539802,0.330629,robust_v2|xgboost|two_part_classic|xgb_v2_c1|4...
