# 06 - Submission Report V2

Réentraînement, sur l'ensemble du jeu d'entraînement, des runs sélectionnés, puis génération des fichiers :
- `submission_v2_robust.csv`
- `submission_v2_single.csv`


In [1]:
import sys
from pathlib import Path
import json
import numpy as np
import pandas as pd

# Project root: the working directory when it contains `src/`, otherwise its
# parent (covers launching the notebook from a sub-folder one level down).
_launch_dir = Path.cwd()
ROOT = _launch_dir if _launch_dir.joinpath("src").exists() else _launch_dir.parent

# Put the root at the front of sys.path, once, so the `src.*` imports resolve.
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from src.v2_pipeline import (
    DEFAULT_V2_DIR,
    ensure_dir,
    load_train_test,
    prepare_feature_sets,
    fit_full_predict,
    fit_calibrator,
    apply_calibrator,
    fit_tail_mapper,
    apply_tail_mapper,
    build_submission,
    V2_COARSE_CONFIGS,
)
# Directories shared by every later cell, both anchored on the project root.
DATA_DIR = ROOT.joinpath("data")  # passed to load_train_test below
# V2 artifact folder; presumably created by ensure_dir when missing (confirm in src.v2_pipeline).
ARTIFACT_V2 = ensure_dir(ROOT.joinpath(DEFAULT_V2_DIR))


In [2]:
# --- Artifacts read back from the V2 artifact directory ---------------------
run_df = pd.read_csv(ARTIFACT_V2.joinpath("run_registry_v2.csv"))
oof = pd.read_parquet(ARTIFACT_V2.joinpath("oof_predictions_v2.parquet"))
selected = pd.read_csv(ARTIFACT_V2.joinpath("selected_models_v2.csv"))
with open(ARTIFACT_V2.joinpath("ensemble_weights_v2.json"), encoding="utf-8") as weights_file:
    ens_meta = json.load(weights_file)

# --- Raw data and the feature sets derived from it ---------------------------
train_raw, test_raw = load_train_test(DATA_DIR)
feature_sets = prepare_feature_sets(
    train_raw,
    test_raw,
    rare_min_count=30,
    drop_identifiers=True,
)

# --- Two-level index: engine name -> config_id -> config dict ----------------
cfg_lookup = {}
for engine_name, engine_cfgs in V2_COARSE_CONFIGS.items():
    cfg_lookup[engine_name] = {entry["config_id"]: entry for entry in engine_cfgs}


In [3]:
def _option_enabled(option) -> bool:
    """Return True when a registry option names a method that must be applied.

    The literal string "none" disables the step. A missing value (NaN / None,
    e.g. an empty CSV cell) is treated the same way instead of being forwarded
    as a method name.
    """
    return not pd.isna(option) and option != "none"


# Loop-invariant part of the OOF filter: non-test rows of the "primary_time"
# split. Only the run_id condition changes from one selected run to the next.
oof_primary = oof[(oof["is_test"] == 0) & (oof["split"] == "primary_time")]

# run_id -> non-negative test prediction for that run.
preds = {}
for _, r in selected.iterrows():
    run_id = r["run_id"]
    fs_name = r["feature_set"]
    bundle = feature_sets[fs_name]
    engine = r["engine"]
    config_id = r["config_id"]
    seed = int(r["seed"])
    family = r["family"]
    sev_mode = r["severity_mode"]
    calib = r["calibration"]
    tail_name = r["tail_mapper"]
    cfg = cfg_lookup[engine][config_id]

    spec = {
        "engine": engine,
        "family": family,
        "severity_mode": sev_mode,
        "config_id": config_id,
        "freq_params": cfg["freq_params"],
        "sev_params": cfg["sev_params"],
        "direct_params": cfg["direct_params"],
        "use_target_encoding": True,
        "target_encode_cols": ["code_postal", "cp3", "modele_vehicule", "marque_modele"],
        "target_encoding_smoothing": 20.0,
    }
    out = fit_full_predict(spec=spec, bundle=bundle, seed=seed, valid_ratio=0.1)
    # Copies so the post-processing below never writes into `out`.
    test_freq = out["test_freq"].copy()
    test_sev = out["test_sev"].copy()
    test_prime = out["test_prime"].copy()

    use_calib = _option_enabled(calib)
    # The tail mapper acts on severity, which the direct family does not use.
    use_tail = _option_enabled(tail_name) and family != "direct_tweedie"

    # OOF rows of this run, selected once and shared by both post-processing steps.
    oof_run = oof_primary[oof_primary["run_id"] == run_id] if (use_calib or use_tail) else None

    if use_calib:
        ok = oof_run["pred_freq"].notna()
        if ok.any():
            # Fit the frequency calibrator on OOF predictions, then apply it to the test scores.
            cal = fit_calibrator(
                oof_run.loc[ok, "pred_freq"].to_numpy(),
                oof_run.loc[ok, "y_freq"].to_numpy(),
                method=calib,
            )
            test_freq = apply_calibrator(cal, test_freq, method=calib)
        else:
            # Previously skipped silently; make the fallback visible.
            print(f"[warn] {run_id}: no OOF frequency predictions, calibration '{calib}' skipped")

    if use_tail:
        # Only rows with y_freq == 1 and an available severity prediction are used.
        pos = (oof_run["y_freq"] == 1) & oof_run["pred_sev"].notna()
        if pos.any():
            mapper = fit_tail_mapper(
                oof_run.loc[pos, "pred_sev"].to_numpy(),
                oof_run.loc[pos, "y_sev"].to_numpy(),
            )
            test_sev = apply_tail_mapper(mapper, test_sev)
        else:
            # Previously skipped silently; make the fallback visible.
            print(f"[warn] {run_id}: no positive OOF severity rows, tail mapper '{tail_name}' skipped")

    # Direct models predict the premium itself; two-part models multiply
    # frequency by severity. Either way the result is clipped at zero.
    if family == "direct_tweedie":
        pred = np.maximum(test_prime, 0.0)
    else:
        pred = np.maximum(test_freq * test_sev, 0.0)
    preds[run_id] = pred

print("ready predictions:", len(preds))


ready predictions: 6


In [4]:
# Ensemble settings read from ensemble_weights_v2.json, with the defaults used
# when a key is absent.
strategy = ens_meta.get("strategy", "single")
run_ids = ens_meta.get("run_ids", list(preds.keys()))
weights = ens_meta.get("weights", {})
best_single = ens_meta.get("best_single_run")

# The single-model submission is always written, so the best single run must
# have been refit whatever the strategy. Fail early with an explicit message
# instead of a bare `KeyError: None` further down.
if best_single not in preds:
    raise KeyError(
        f"best_single_run {best_single!r} has no refit prediction; "
        f"available runs: {sorted(preds)}"
    )

if strategy == "ensemble":
    missing = [rid for rid in run_ids if rid not in preds]
    if missing:
        raise KeyError(f"ensemble run_ids without a refit prediction: {missing}")

    # One column per run, in run_ids order, so columns line up with the weights.
    mat = np.column_stack([preds[rid] for rid in run_ids])
    w = np.array([weights.get(rid, 0.0) for rid in run_ids], dtype=float)
    total = w.sum()
    if not np.isfinite(total) or total <= 0:
        # Unusable weights (non-positive or NaN/inf sum): fall back to a plain
        # average rather than propagating NaN into the submission.
        w = np.full(len(run_ids), 1.0 / len(run_ids))
    else:
        w = w / total
    pred_robust = mat @ w
else:
    pred_robust = preds[best_single]

pred_single = preds[best_single]

# Output locations, defined once and reused for writing and for the report.
robust_path = ARTIFACT_V2 / "submission_v2_robust.csv"
single_path = ARTIFACT_V2 / "submission_v2_single.csv"

sub_robust = build_submission(test_raw["index"], pred_robust)
sub_single = build_submission(test_raw["index"], pred_single)
sub_robust.to_csv(robust_path, index=False)
sub_single.to_csv(single_path, index=False)

# Small JSON summary of what was written and how it was produced.
report = {
    "strategy": strategy,
    "n_runs": len(run_ids),
    "best_single_run": best_single,
    "submission_robust": str(robust_path),
    "submission_single": str(single_path),
}
(ARTIFACT_V2 / "submission_report_v2.json").write_text(json.dumps(report, indent=2), encoding="utf-8")
print(report)


{'strategy': 'single', 'n_runs': 6, 'best_single_run': 'base_v2|catboost|two_part_tweedie|cb_v2_c1|42|weighted_tail|isotonic|isotonic', 'submission_robust': 'c:\\Users\\icemo\\Downloads\\Calcul-prime-d-assurance\\artifacts\\v2\\submission_v2_robust.csv', 'submission_single': 'c:\\Users\\icemo\\Downloads\\Calcul-prime-d-assurance\\artifacts\\v2\\submission_v2_single.csv'}
