# 01 - EDA + CV Design (V2)

Objectifs :
- audit des données et du drift train/test,
- création des feature sets V2,
- création des 3 splits anti-overfitting,
- diagnostic OOD et biais par segment sur les prédictions OOF v1,
- export des artefacts dans `artifacts/v2/*`.


In [8]:
import sys
from pathlib import Path
import json
import numpy as np
import pandas as pd

# Resolve the project root: use the launch directory when it contains `src`,
# otherwise fall back to its parent (no check is made that the parent has `src`).
launch_dir = Path.cwd()
ROOT = launch_dir if (launch_dir / "src").exists() else launch_dir.parent

# Put the root at the front of sys.path (once) so `src.*` can be imported.
root_entry = str(ROOT)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

from src.v2_pipeline import (
    DEFAULT_V2_DIR,
    INDEX_COL,
    compute_ood_diagnostics,
    compute_segment_bias_from_oof,
    ensure_dir,
    export_split_artifacts_v2,
    load_train_test,
    prepare_feature_sets,
    build_split_registry,
    validate_folds_disjoint,
    validate_group_disjoint,
)

# Directory handed to load_train_test as the location of the input data.
DATA_DIR = ROOT.joinpath("data")
# Destination for every V2 artifact written by this notebook.
# NOTE(review): ensure_dir presumably creates the directory and returns its
# path - confirm in src.v2_pipeline.
ARTIFACT_V2 = ensure_dir(ROOT.joinpath(DEFAULT_V2_DIR))


In [9]:
# Load the train/test tables and derive every V2 feature set from them.
train_raw, test_raw = load_train_test(DATA_DIR)
feature_sets = prepare_feature_sets(
    train_raw,
    test_raw,
    rare_min_count=30,
    drop_identifiers=True,
)

# Summarise each feature set: matrix shapes and number of categorical columns.
feature_set_names = list(feature_sets.keys())
print("feature sets:", feature_set_names)
for set_name, set_bundle in feature_sets.items():
    print(
        set_name,
        "X_train", set_bundle.X_train.shape,
        "X_test", set_bundle.X_test.shape,
        "n_cat", len(set_bundle.cat_cols),
    )

# Keep the "base_v2" bundle at hand under a short name.
bundle = feature_sets["base_v2"]


feature sets: ['base_v2', 'robust_v2', 'compact_v2']
base_v2 X_train (50000, 47) X_test (50000, 47) n_cat 18
robust_v2 X_train (50000, 44) X_test (50000, 44) n_cat 15
compact_v2 X_train (50000, 43) X_test (50000, 43) n_cat 14


In [10]:
# Column identifying a client; used both to build the group split and to
# verify afterwards that the resulting folds do not share any group.
GROUP_COL = "id_client"
# Split schemes for which the full-coverage check is requested.
# NOTE(review): presumably the schemes whose validation folds are expected to
# cover every training row - confirm in src.v2_pipeline.
FULL_COVERAGE_SPLITS = {"secondary_group", "aux_blocked5"}

splits = build_split_registry(
    train_raw,
    n_blocks_time=5,
    n_splits_group=5,
    group_col=GROUP_COL,
)

for name, folds in splits.items():
    # Disjointness is validated for every scheme; full coverage only where flagged.
    wants_full_coverage = name in FULL_COVERAGE_SPLITS
    validate_folds_disjoint(
        folds,
        check_full_coverage=wants_full_coverage,
        n_rows=len(train_raw),
    )
    # Only the group-based scheme gets the extra group-disjointness check.
    if name == "secondary_group":
        validate_group_disjoint(folds, train_raw[GROUP_COL])
    # Report (len(first element), len(second element)) for each fold id.
    fold_sizes = {k: (len(v[0]), len(v[1])) for k, v in folds.items()}
    print(name, fold_sizes)

# Hand the validated fold definitions to the V2 artifact export.
export_split_artifacts_v2(train=train_raw, splits=splits, output_dir=ARTIFACT_V2)


primary_time {1: (10000, 10000), 2: (20000, 10000), 3: (30000, 10000), 4: (40000, 10000)}
secondary_group {1: (40000, 10000), 2: (40000, 10000), 3: (40000, 10000), 4: (40000, 10000), 5: (40000, 10000)}
aux_blocked5 {1: (40000, 10000), 2: (40000, 10000), 3: (40000, 10000), 4: (40000, 10000), 5: (40000, 10000)}


In [11]:
# Drift / out-of-distribution diagnostics between train and test.
# INDEX_COL is joined back from the raw tables (aligned on the row index)
# before the feature matrices are handed to the diagnostic helper.
train_with_index = bundle.X_train.join(train_raw[[INDEX_COL]], how="left")
test_with_index = bundle.X_test.join(test_raw[[INDEX_COL]], how="left")

# Rows with the most test-only levels come first.
ood = compute_ood_diagnostics(train_with_index, test_with_index).sort_values(
    "unseen_test_levels", ascending=False
)
display(ood.head(20))


Unnamed: 0,diagnostic_type,feature,train_unique,test_unique,unseen_test_levels,unseen_ratio_on_levels
4,ood,code_postal,210,5214,5004,0.959724
14,ood,marque_modele,200,348,148,0.425287
10,ood,modele_vehicule,201,338,137,0.405325
13,ood,cp3,376,389,13,0.033419
9,ood,marque_vehicule,42,54,12,0.222222
0,ood,type_contrat,4,4,0,0.0
1,ood,freq_paiement,4,4,0,0.0
5,ood,conducteur2,2,2,0,0.0
3,ood,utilisation,4,4,0,0.0
2,ood,paiement,2,2,0,0.0


In [12]:
# Segment-level bias diagnostics on the v1 out-of-fold predictions (if available).
# `seg` defaults to an empty frame so it is always defined, even when the
# v1 OOF file is missing.
seg = pd.DataFrame()
v1_artifact_dir = ROOT / "artifacts"
oof_v1_path = v1_artifact_dir / "oof_predictions.parquet"
ens_v1_path = v1_artifact_dir / "ensemble_weights_v1.json"

if not oof_v1_path.exists():
    print("OOF v1 absent -> skip segment bias diagnostics.")
else:
    oof_v1 = pd.read_parquet(oof_v1_path)
    if ens_v1_path.exists():
        # Use the run recorded under "best_single_run" in the v1 ensemble file.
        # NOTE(review): run_id is None when that key is missing - confirm that
        # compute_segment_bias_from_oof handles it.
        ensemble_meta = json.loads(ens_v1_path.read_text(encoding="utf-8"))
        run_id = ensemble_meta.get("best_single_run")
    else:
        # No ensemble file: build a "|"-separated run key from the run's
        # descriptive columns and take the first one found in the OOF table.
        key_parts = [
            oof_v1["engine"].astype(str),
            oof_v1["config_id"].astype(str),
            oof_v1["seed"].astype(int).astype(str),
            oof_v1["severity_mode"].astype(str),
            oof_v1["calibration"].astype(str),
        ]
        run_key = key_parts[0]
        for part in key_parts[1:]:
            run_key = run_key + "|" + part
        oof_v1["run_id"] = run_key
        run_id = oof_v1["run_id"].dropna().iloc[0]
    seg = compute_segment_bias_from_oof(
        train_raw,
        oof_v1,
        run_id=run_id,
        split_name="primary_time",
    )
    print("segment rows:", len(seg))
    display(seg.head(20))


segment rows: 90


Unnamed: 0,diagnostic_type,feature,segment,n,y_mean,p_mean,bias
0,segment_bias,utilisation,Professional,2902,152.040713,116.314863,-35.725851
1,segment_bias,utilisation,Retired,10686,99.697226,92.014331,-7.682895
2,segment_bias,utilisation,WorkPrivate,26373,99.505343,90.985751,-8.519592
3,segment_bias,type_contrat,Maxi,25852,138.048148,121.363003,-16.685145
4,segment_bias,type_contrat,Median1,3734,46.784588,39.248341,-7.536247
5,segment_bias,type_contrat,Median2,7009,43.165533,47.356498,4.190965
6,segment_bias,type_contrat,Mini,3405,27.300291,32.261015,4.960725
7,segment_bias,marque_vehicule,ALFA ROMEO,178,100.75191,121.22965,20.47774
8,segment_bias,marque_vehicule,AUDI,633,124.141501,128.99632,4.854819
9,segment_bias,marque_vehicule,BMW,607,150.374646,118.364039,-32.010607


In [13]:
# Output locations inside the V2 artifact directory.
diag_path = ARTIFACT_V2 / "segment_diagnostics_v2.parquet"
meta_path = ARTIFACT_V2 / "dataset_meta_v2.json"

# Stack the OOD table and the segment-bias table into one diagnostics frame;
# sort=False keeps the columns in their original order.
diag = pd.concat([ood, seg], ignore_index=True, sort=False)
diag.to_parquet(diag_path, index=False)

# Per feature set: number of feature columns and of categorical columns.
feature_set_summary = {}
for fs_name, fs_bundle in feature_sets.items():
    feature_set_summary[fs_name] = {
        "n_features": len(fs_bundle.feature_cols),
        "n_cat": len(fs_bundle.cat_cols),
    }

# Per split scheme: the sorted fold ids, cast to plain ints for JSON.
split_fold_ids = {}
for split_name, fold_map in splits.items():
    split_fold_ids[split_name] = sorted(int(fold_id) for fold_id in fold_map.keys())

meta = {
    "n_train": int(len(train_raw)),
    "n_test": int(len(test_raw)),
    "feature_sets": feature_set_summary,
    "splits": split_fold_ids,
}
meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

print("saved:", diag_path)
print("saved:", meta_path)


saved: c:\Users\icemo\Downloads\Calcul-prime-d-assurance\artifacts\v2\segment_diagnostics_v2.parquet
saved: c:\Users\icemo\Downloads\Calcul-prime-d-assurance\artifacts\v2\dataset_meta_v2.json
