# 01 - EDA + CV design (V2.1)

Objectifs:
- vérifier le data contract train/test,
- auditer NA, zéros techniques, distribution cible, extrêmes,
- diagnostiquer le drift/OOD (catégoriel + numérique),
- valider les 3 splits anti-overfitting,
- exporter les diagnostics versionnés.


In [None]:
import sys
import json
from pathlib import Path
import numpy as np
import pandas as pd

# Resolve the project root: keep the current working directory when it
# contains a `src` entry, otherwise step up one level.
ROOT = Path.cwd()
ROOT = ROOT if (ROOT / "src").exists() else ROOT.parent

# Put the root first on the import path (once) so the `src` package resolves.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from src.v2_pipeline import (
    DEFAULT_V2_DIR,
    INDEX_COL,
    TARGET_SEV_COL,
    ensure_dir,
    load_train_test,
    prepare_feature_sets,
    build_split_registry,
    validate_folds_disjoint,
    validate_group_disjoint,
    export_split_artifacts_v2,
    compute_ood_diagnostics,
    compute_segment_bias_from_oof,
)

# Input data folder, resolved relative to the project root.
DATA_DIR = ROOT / "data"
# Output folder for the V2 artifacts written by this notebook.
# NOTE(review): `ensure_dir` presumably creates the directory if missing and
# returns the path — confirm in src.v2_pipeline.
ARTIFACT_V2 = ensure_dir(ROOT / DEFAULT_V2_DIR)


In [None]:
# Load the raw train/test tables and derive every feature-set bundle.
train_raw, test_raw = load_train_test(DATA_DIR)
feature_sets = prepare_feature_sets(
    train_raw,
    test_raw,
    rare_min_count=30,
    drop_identifiers=True,
)
# The "base_v2" bundle is the one reused by the drift/OOD diagnostics below.
bundle = feature_sets["base_v2"]

# Quick shape summary, then one line per feature set with its column counts.
print("train shape:", train_raw.shape, "test shape:", test_raw.shape)
print("feature sets:", list(feature_sets))
for set_name, fs_bundle in feature_sets.items():
    n_features = len(fs_bundle.feature_cols)
    n_cat = len(fs_bundle.cat_cols)
    n_num = len(fs_bundle.num_cols)
    print(set_name, "features", n_features, "cat", n_cat, "num", n_num)


## Data contract


In [None]:
# Compare the column sets of both tables: shared columns and those that
# exist on one side only (sorted for stable output).
train_cols, test_cols = set(train_raw.columns), set(test_raw.columns)
common_cols = sorted(train_cols & test_cols)
train_only = sorted(train_cols - test_cols)
test_only = sorted(test_cols - train_cols)

# One (item, value) row per contract figure, in display order.
contract_counts = {
    "train_rows": len(train_raw),
    "test_rows": len(test_raw),
    "common_columns": len(common_cols),
    "train_only_columns": len(train_only),
    "test_only_columns": len(test_only),
}
contract = pd.DataFrame(
    [{"item": item, "value": int(count)} for item, count in contract_counts.items()]
)
display(contract)
print("train_only:", train_only)
print("test_only:", test_only)


## Target, missing values and technical zeros


In [None]:
# --- Target distribution ---------------------------------------------------
# Cast the target to float and isolate the strictly positive values; the
# positive-only summaries fall back to NaN when there is no positive row.
y = train_raw[TARGET_SEV_COL].astype(float)
y_pos = y[y > 0]
target_stats = pd.DataFrame(
    [
        {"metric": "n_train", "value": int(len(y))},
        {"metric": "claim_rate", "value": float((y > 0).mean())},
        {"metric": "mean_positive", "value": float(y_pos.mean()) if len(y_pos) else np.nan},
        {"metric": "q95_positive", "value": float(y_pos.quantile(0.95)) if len(y_pos) else np.nan},
        {"metric": "q99_positive", "value": float(y_pos.quantile(0.99)) if len(y_pos) else np.nan},
        {"metric": "max_positive", "value": float(y_pos.max()) if len(y_pos) else np.nan},
    ]
)
display(target_stats)

# --- Missing values --------------------------------------------------------
# Per-column NA ratio on each table, aligned side by side on the column name.
# NOTE: a column that exists in only one table gets 0.0 for the other one
# because of the fillna, so 0.0 does not always mean "no missing value".
na_train = train_raw.isna().mean().sort_values(ascending=False).rename("na_ratio_train")
na_test = test_raw.isna().mean().sort_values(ascending=False).rename("na_ratio_test")
na_table = pd.concat([na_train, na_test], axis=1).fillna(0.0).reset_index().rename(columns={"index": "feature"})
display(na_table.head(20))

# --- Technical zeros -------------------------------------------------------
# Share of exact zeros in the audited columns. The candidates are filtered on
# the train columns; the test ratio is reported as NaN when the column is
# missing from the test table instead of raising a KeyError.
zero_cols = [c for c in ["poids_vehicule", "cylindre_vehicule"] if c in train_raw.columns]
zero_rows = [
    {
        "feature": col,
        "zero_ratio_train": float((train_raw[col] == 0).mean()),
        "zero_ratio_test": float((test_raw[col] == 0).mean()) if col in test_raw.columns else np.nan,
    }
    for col in zero_cols
]
# Explicit columns keep the table header even when no candidate is present.
display(pd.DataFrame(zero_rows, columns=["feature", "zero_ratio_train", "zero_ratio_test"]))


## Splits anti-overfitting


In [None]:
# Build every CV split scheme, validate each one, then persist the folds.
group_col = "id_client"
full_coverage_splits = {"secondary_group", "aux_blocked5"}
n_train_rows = len(train_raw)

splits = build_split_registry(train_raw, n_blocks_time=5, n_splits_group=5, group_col=group_col)
for split_name, folds in splits.items():
    # Full row coverage is only requested for the schemes listed above.
    needs_full_coverage = split_name in full_coverage_splits
    validate_folds_disjoint(
        folds,
        check_full_coverage=needs_full_coverage,
        n_rows=n_train_rows,
    )
    # The group split additionally gets a group-level disjointness check.
    if split_name == "secondary_group":
        validate_group_disjoint(folds, train_raw[group_col])
    # Sizes of the two index collections stored per fold
    # (presumably train / validation — confirm in src.v2_pipeline).
    fold_sizes = {}
    for fold_id, parts in folds.items():
        fold_sizes[fold_id] = (len(parts[0]), len(parts[1]))
    print(split_name, fold_sizes)

export_split_artifacts_v2(train=train_raw, splits=splits, output_dir=ARTIFACT_V2)
print("saved fold artifacts under", ARTIFACT_V2)


## Drift and OOD diagnostics


In [None]:
# --- Categorical OOD -------------------------------------------------------
# Compute the OOD diagnostics on the base bundle and show the geographic and
# vehicle features, ordered by their `unseen_test_levels` value.
ood = compute_ood_diagnostics(bundle.X_train, bundle.X_test)
ood_focus = ood[ood["feature"].isin(["code_postal", "cp3", "cp2", "modele_vehicule", "marque_vehicule", "marque_modele"])]
display(ood_focus.sort_values("unseen_test_levels", ascending=False))

# --- Numeric drift ---------------------------------------------------------
# For every numeric feature present in the test matrix, express the shift of
# the test mean in units of the train standard deviation.
numeric_cols = [c for c in bundle.num_cols if c in bundle.X_test.columns]
drift_columns = ["diagnostic_type", "feature", "mean_train", "mean_test", "std_train", "std_shift"]
drift_rows = []
for c in numeric_cols:
    # Non-numeric entries become NaN and are ignored by the nan-aware stats.
    tr = pd.to_numeric(bundle.X_train[c], errors="coerce")
    te = pd.to_numeric(bundle.X_test[c], errors="coerce")
    m_tr = float(np.nanmean(tr))
    m_te = float(np.nanmean(te))
    s_tr = float(np.nanstd(tr))
    drift_rows.append(
        {
            "diagnostic_type": "numeric_drift",
            "feature": c,
            "mean_train": m_tr,
            "mean_test": m_te,
            "std_train": s_tr,
            # The floor on the std avoids a division by zero on constant columns.
            "std_shift": float((m_te - m_tr) / max(s_tr, 1e-9)),
        }
    )
# Explicit columns keep the schema when there is no numeric feature; without
# them the frame has no `std_shift` column and sorting raises a KeyError.
drift_df = pd.DataFrame(drift_rows, columns=drift_columns)
if not drift_df.empty:
    # Largest absolute shift first.
    drift_df = drift_df.sort_values("std_shift", key=lambda s: s.abs(), ascending=False)
display(drift_df.head(20))


## Segment bias from V1 OOF (if available)


In [None]:
# Segment-level bias computed from the V1 out-of-fold predictions. `seg`
# stays an empty frame whenever the diagnostic is skipped, so later cells can
# test `seg.empty` safely.
seg = pd.DataFrame()
oof_v1_path = ROOT / "artifacts" / "oof_predictions.parquet"
ens_v1_path = ROOT / "artifacts" / "ensemble_weights_v1.json"

if not oof_v1_path.exists():
    print("OOF v1 not found, skip segment bias diagnostic.")
else:
    oof_v1 = pd.read_parquet(oof_v1_path)
    if oof_v1.empty:
        # An empty OOF file has no first run to fall back on; skip instead of
        # failing on `.iloc[0]`.
        print("OOF v1 is empty, skip segment bias diagnostic.")
    else:
        # Rebuild the run identifier from its components when it is absent.
        if "run_id" not in oof_v1.columns:
            oof_v1["run_id"] = (
                oof_v1["engine"].astype(str) + "|"
                + oof_v1["config_id"].astype(str) + "|"
                + oof_v1["seed"].astype(int).astype(str) + "|"
                + oof_v1["severity_mode"].astype(str) + "|"
                + oof_v1["calibration"].astype(str)
            )
        # Prefer the best single run recorded in the ensemble metadata; fall
        # back to the first run of the OOF table when the file or the key is
        # missing, or when the recorded value is null.
        run_id = None
        if ens_v1_path.exists():
            ens_meta = json.loads(ens_v1_path.read_text(encoding="utf-8"))
            run_id = ens_meta.get("best_single_run")
        if run_id is None:
            run_id = oof_v1["run_id"].iloc[0]
        seg = compute_segment_bias_from_oof(train_raw, oof_v1, run_id=run_id, split_name="primary_time")
        print("segment diagnostics rows:", len(seg))
        display(seg.head(20))


## Risks and mitigation


In [None]:
# Static summary of the identified modelling risks, their expected impact and
# the mitigation adopted for each; one (risk, impact, mitigation) tuple per row.
risk_entries = [
    ("Fine-grain categorical OOD", "Public/private shake-up", "hierarchy cp2/cp3, robust feature sets"),
    ("Tail under-dispersion in severity", "RMSE degradation", "safe tail mapper + distribution audit"),
    ("Client leakage", "over-optimistic CV", "secondary GroupKFold(id_client)"),
    ("Single split selection bias", "unstable ranking", "primary+secondary+aux weighted selection"),
]
risk_table = pd.DataFrame(risk_entries, columns=["risk", "impact", "mitigation"])
display(risk_table)


In [None]:
# Destination files for the exported diagnostics and dataset metadata.
diag_path = ARTIFACT_V2 / "segment_diagnostics_v2.parquet"
meta_path = ARTIFACT_V2 / "dataset_meta_v2.json"

# Gather every non-empty diagnostic frame: OOD (tagged with its type), numeric
# drift (only if the drift cell has been run) and segment bias.
diags = [ood.assign(diagnostic_type="ood")] if not ood.empty else []
if "drift_df" in globals() and not drift_df.empty:
    diags.append(drift_df)
if not seg.empty:
    diags.append(seg)

# Stack them into one table; an empty frame is written when nothing is available.
if diags:
    diag_all = pd.concat(diags, ignore_index=True, sort=False)
else:
    diag_all = pd.DataFrame()
diag_all.to_parquet(diag_path, index=False)

# Dataset-level metadata: row counts, size of each feature set, sorted fold
# ids per split scheme, and the columns present on one side only.
feature_set_summary = {
    name: {"n_features": len(fs.feature_cols), "n_cat": len(fs.cat_cols)}
    for name, fs in feature_sets.items()
}
split_summary = {
    name: sorted(int(fold_id) for fold_id in fold_map.keys())
    for name, fold_map in splits.items()
}
meta = {
    "n_train": int(len(train_raw)),
    "n_test": int(len(test_raw)),
    "feature_sets": feature_set_summary,
    "splits": split_summary,
    "train_only_cols": train_only,
    "test_only_cols": test_only,
}
meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
print("saved:", diag_path)
print("saved:", meta_path)
