# 01 — EDA + CV Design (V1 robuste anti-overfitting)

Objectifs:
- Audit données train/test.
- Feature engineering robuste/stable.
- Construction de 2 schémas de validation:
  - `primary_time` (forward-chaining via `index`)
  - `secondary_group` (`GroupKFold` via `id_client`)
- Export des artefacts de base pour la modélisation.


In [None]:
import sys
from pathlib import Path
import json
import numpy as np
import pandas as pd

# Resolve the project root: the current working directory when it contains a
# `src` entry, otherwise its parent directory.
_cwd = Path.cwd()
ROOT = _cwd if (_cwd / "src").exists() else _cwd.parent

# Put the root at the front of sys.path (once) so that `src` can be imported.
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from src.v1_pipeline import (
    INDEX_COL,
    TARGET_FREQ_COL,
    TARGET_SEV_COL,
    load_train_test,
    prepare_datasets,
    build_primary_time_folds,
    build_secondary_group_folds,
    validate_folds_disjoint,
    export_fold_artifacts,
    ensure_dir,
)

# Input data folder, located under the project root.
DATA_DIR = ROOT.joinpath("data")
# Output folder for exported artifacts; the path goes through ensure_dir
# (presumably creates the directory and returns it — confirm in src.v1_pipeline).
ARTIFACT_DIR = ensure_dir(ROOT.joinpath("artifacts"))


In [None]:
# Load the raw train/test tables and build the prepared dataset bundle.
# drop_identifiers=True is forwarded to prepare_datasets (presumably removes
# ID columns from the feature matrices — confirm in src.v1_pipeline).
train_raw, test_raw = load_train_test(DATA_DIR)
bundle = prepare_datasets(train_raw, test_raw, drop_identifiers=True)

# Size summary; labels are padded so that the printed values line up.
_size_report = (
    ("train_raw:", train_raw.shape),
    ("test_raw :", test_raw.shape),
    ("X_train  :", bundle.X_train.shape),
    ("X_test   :", bundle.X_test.shape),
    ("n_features:", len(bundle.feature_cols)),
    ("n_cat    :", len(bundle.cat_cols)),
)
for _label, _value in _size_report:
    print(_label, _value)


In [None]:
# Sanity checks: both target columns present, train/test feature schemas
# compared, and frequency/severity targets coherent with each other.
for _required_col, _error_message in (
    (TARGET_SEV_COL, "Missing montant_sinistre in train"),
    (TARGET_FREQ_COL, "Missing nombre_sinistres in train"),
):
    assert _required_col in train_raw.columns, _error_message

# Columns that exist in only one of the two feature matrices.
mismatch = sorted(set(bundle.X_train.columns).symmetric_difference(bundle.X_test.columns))
print("Schema mismatch columns:", len(mismatch))
if mismatch:
    # Show at most the first 20 offending column names.
    print(mismatch[:20])

# A row is inconsistent when exactly one of (claim count, claim amount) is
# zero while the other is strictly positive.
_freq_raw = train_raw[TARGET_FREQ_COL]
_sev_raw = train_raw[TARGET_SEV_COL]
_amount_without_claim = (_freq_raw == 0) & (_sev_raw > 0)
_claim_without_amount = (_freq_raw > 0) & (_sev_raw == 0)
target_inconsistency = _amount_without_claim.sum() + _claim_without_amount.sum()
print("Target inconsistency rows:", int(target_inconsistency))


In [None]:
# Target distribution
y_freq = bundle.y_freq
y_sev = bundle.y_sev

# Rows carrying at least one claim. `> 0` is the same positivity rule that the
# raw-target consistency check applies; for a 0/1 flag it selects exactly the
# rows `== 1` would, and it also keeps multi-claim rows if y_freq holds counts.
# NOTE(review): whether prepare_datasets binarises y_freq is not visible here — confirm.
pos = y_freq > 0
sev_pos = y_sev[pos]

# The claim rate is the share of positive rows, so it always agrees with the
# (n_pos/n_rows) fraction printed next to it.
print("Claim rate:", round(float(pos.mean()), 6), f"({int(pos.sum())}/{len(y_freq)})")
print("Severity mean (all):", round(float(y_sev.mean()), 3))
print("Severity mean (pos):", round(float(sev_pos.mean()), 3))
print("Severity max:", round(float(y_sev.max()), 3))
# Quantiles of the claim amount restricted to positive rows.
for q in [0.5, 0.75, 0.9, 0.95, 0.99]:
    print(f"sev_pos_q{int(q*100):02d}:", round(float(np.quantile(sev_pos, q)), 3))


In [None]:
# Missing-value rates + main train/test shift on the postal-code features
_na_rate_by_col = bundle.X_train.isna().mean()
missing_top = _na_rate_by_col.sort_values(ascending=False).head(15)
print("Top missing rates")
display(missing_top.to_frame("missing_rate"))

# Columns to audit for unseen categories: `code_postal` when present, and the
# cp2/cp3 pair only when both columns are present in the training matrix.
_train_columns = bundle.X_train.columns
_shift_cols = []
if "code_postal" in _train_columns:
    _shift_cols.append("code_postal")
if {"cp2", "cp3"}.issubset(_train_columns):
    _shift_cols.extend(["cp2", "cp3"])

for c in _shift_cols:
    # Values are compared as strings, so missing values are matched through
    # their string representation like any other category.
    tr = bundle.X_train[c].astype(str)
    te = bundle.X_test[c].astype(str)
    # Share of test rows whose value never occurs in train.
    unseen = (~te.isin(set(tr))).mean()
    print(f"Unseen {c} ratio in test:", round(float(unseen), 4))


In [None]:
# Primary + secondary validation folds
folds_primary = build_primary_time_folds(train_raw, n_blocks=5, index_col=INDEX_COL)
folds_secondary = build_secondary_group_folds(train_raw, n_splits=5, group_col="id_client")

# The primary scheme is validated without the full-coverage check; the
# secondary scheme is additionally checked against the number of training rows.
validate_folds_disjoint(folds_primary, check_full_coverage=False)
validate_folds_disjoint(folds_secondary, check_full_coverage=True, n_rows=len(train_raw))

# Per fold: lengths of the first two elements of each fold entry
# (presumably the train and validation index sets — confirm in src.v1_pipeline).
for _label, _folds in (("Primary folds:", folds_primary), ("Secondary folds:", folds_secondary)):
    _sizes = {fold_id: (len(parts[0]), len(parts[1])) for fold_id, parts in _folds.items()}
    print(_label, _sizes)


In [None]:
# Fold assignments are written by the pipeline helper into ARTIFACT_DIR.
export_fold_artifacts(
    train=train_raw,
    primary_folds=folds_primary,
    secondary_folds=folds_secondary,
    output_dir=ARTIFACT_DIR,
)

# Additional artifacts
_cat_cols_path = ARTIFACT_DIR / "cat_cols.csv"
_feature_cols_path = ARTIFACT_DIR / "feature_cols.csv"
_targets_path = ARTIFACT_DIR / "y_train_targets.parquet"
_meta_path = ARTIFACT_DIR / "dataset_meta.json"

# Column lists, one name per row.
_cat_cols_df = pd.DataFrame({"cat_col": bundle.cat_cols})
_cat_cols_df.to_csv(_cat_cols_path, index=False)
_feature_cols_df = pd.DataFrame({"feature_col": bundle.feature_cols})
_feature_cols_df.to_csv(_feature_cols_path, index=False)

# Targets keyed by positional row number and by the dataset's index column.
_n_target_rows = len(bundle.y_freq)
target_df = pd.DataFrame({
    "row_idx": np.arange(_n_target_rows, dtype=int),
    "index": train_raw[INDEX_COL].to_numpy(),
    "y_freq": bundle.y_freq.to_numpy(),
    "y_sev": bundle.y_sev.to_numpy(),
})
target_df.to_parquet(_targets_path, index=False)

# Dataset metadata; values are cast to built-in int/str so json can serialise them.
meta = {
    "data_dir": str(DATA_DIR),
    "n_train": int(len(train_raw)),
    "n_test": int(len(test_raw)),
    "n_features": int(len(bundle.feature_cols)),
    "n_cat": int(len(bundle.cat_cols)),
    "primary_folds": [int(k) for k in sorted(folds_primary)],
    "secondary_folds": [int(k) for k in sorted(folds_secondary)],
}
_meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

# Recap of the artifact paths expected in ARTIFACT_DIR.
print("Saved:")
for _filename in (
    "folds_primary.parquet",
    "folds_secondary.parquet",
    "cat_cols.csv",
    "feature_cols.csv",
    "y_train_targets.parquet",
    "dataset_meta.json",
):
    print("-", ARTIFACT_DIR / _filename)


## Notes
- `folds_primary` est un schéma temporel forward-chaining (4 folds valides sur 5 blocs).
- `folds_secondary` couvre 100% des lignes avec disjonction par `id_client`.
- On garde ces 2 splits pour la suite (`02_modeling_3_engines.ipynb`).
