# 06 - Submission report V2.1

Refit sur 100 % du train (sans holdout 90/10), calibration et tail mapper « safe »,
génération de :
- `submission_v2_robust.csv`
- `submission_v2_single.csv`
et audit pré-soumission.


In [1]:
import sys
import json
from pathlib import Path
import numpy as np
import pandas as pd

# Resolve the project root: the current working directory when it contains
# a `src` entry, otherwise its parent directory.
ROOT = Path.cwd()
ROOT = ROOT if (ROOT / "src").exists() else ROOT.parent

# Put the root at the front of the import path (once) so that the
# `src.*` imports below can be resolved.
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))

from src.v2_pipeline import (
    DEFAULT_V2_DIR,
    ensure_dir,
    load_train_test,
    prepare_feature_sets,
    fit_full_predict_fulltrain,
    fit_calibrator,
    apply_calibrator,
    fit_tail_mapper_safe,
    apply_tail_mapper_safe,
    build_submission,
    compute_prediction_distribution_audit,
    V2_COARSE_CONFIGS,
)

# Directory handed to `load_train_test` to read the train/test data.
DATA_DIR = ROOT.joinpath("data")
# Folder that holds every V2 artifact read and written by this notebook.
# NOTE(review): `ensure_dir` presumably creates the folder when it is
# missing and returns the path -- confirm in src.v2_pipeline.
ARTIFACT_V2 = ensure_dir(ROOT.joinpath(DEFAULT_V2_DIR))


In [2]:
# Tabular artifacts read from the V2 artifact folder: run registry,
# out-of-fold predictions and the list of selected runs.
run_df = pd.read_csv(ARTIFACT_V2 / "run_registry_v2.csv")
oof = pd.read_parquet(ARTIFACT_V2 / "oof_predictions_v2.parquet")
selected = pd.read_csv(ARTIFACT_V2 / "selected_models_v2.csv")

# Ensemble metadata (strategy, run ids, weights, best single run).
ens_meta_path = ARTIFACT_V2 / "ensemble_weights_v2.json"
ens_meta = json.loads(ens_meta_path.read_text(encoding="utf-8"))

# The prediction-distribution audit is optional: use an empty frame when the
# file does not exist so later cells can simply test `dist_audit.empty`.
dist_audit_path = ARTIFACT_V2 / "pred_distribution_audit_v2.csv"
if dist_audit_path.exists():
    dist_audit = pd.read_csv(dist_audit_path)
else:
    dist_audit = pd.DataFrame()

# Raw data and the feature-set bundles built from it.
train_raw, test_raw = load_train_test(DATA_DIR)
feature_sets = prepare_feature_sets(
    train_raw,
    test_raw,
    rare_min_count=30,
    drop_identifiers=True,
)

# Two-level index of the coarse configs: engine -> config_id -> config dict.
cfg_lookup = {
    engine_name: {entry["config_id"]: entry for entry in entries}
    for engine_name, entries in V2_COARSE_CONFIGS.items()
}


In [3]:
# For every selected run: refit the model on the full training set, predict
# the test set, then apply the post-processing (frequency calibration and
# severity tail mapping) fitted on that run's out-of-fold predictions.
# `preds` maps run_id -> array of non-negative test predictions.
preds = {}
for _, row in selected.iterrows():
    run_id = row["run_id"]
    engine = row["engine"]
    fs_name = row["feature_set"]
    family = row["family"]
    # An empty CSV cell is read back as NaN, so `row.get(..., 1.5)` alone
    # would forward NaN as the power; apply the default to missing values too.
    raw_power = row.get("tweedie_power", 1.5)
    tweedie_power = 1.5 if pd.isna(raw_power) else float(raw_power)
    config_id = row["config_id"]
    seed = int(row["seed"])
    severity_mode = row["severity_mode"]
    calibration = row["calibration"]
    tail_mapper_name = row["tail_mapper"]

    cfg = cfg_lookup[engine][config_id]
    bundle = feature_sets[fs_name]

    # Model specification handed to the full-train refit.
    spec = {
        "engine": engine,
        "family": family,
        "severity_mode": severity_mode,
        "tweedie_power": tweedie_power,
        "config_id": config_id,
        "freq_params": cfg["freq_params"],
        "sev_params": cfg["sev_params"],
        "direct_params": cfg["direct_params"],
        "use_target_encoding": True,
        "target_encode_cols": ["code_postal", "cp3", "modele_vehicule", "marque_modele"],
        "target_encoding_smoothing": 20.0,
    }

    out = fit_full_predict_fulltrain(spec=spec, bundle=bundle, seed=seed, complexity={})
    # Copies, so the post-processing below never mutates `out`.
    test_freq = out["test_freq"].copy()
    test_sev = out["test_sev"].copy()

    # Out-of-fold rows of this run on the primary time split; they are the
    # training data for the calibrator and the tail mapper.
    o = oof[(oof["is_test"] == 0) & (oof["split"] == "primary_time") & (oof["run_id"] == run_id)].copy()

    # Frequency calibration: fitted on OOF rows with a frequency prediction,
    # then applied to the test frequencies.
    if calibration != "none" and len(o):
        ok = o["pred_freq"].notna()
        if ok.any():
            cal = fit_calibrator(
                o.loc[ok, "pred_freq"].to_numpy(),
                o.loc[ok, "y_freq"].to_numpy(),
                method=calibration,
            )
            test_freq = apply_calibrator(cal, test_freq, method=calibration)

    # Severity tail mapping: skipped for the direct Tweedie family and fitted
    # only on OOF rows where y_freq == 1 and a severity prediction exists.
    if tail_mapper_name != "none" and family != "direct_tweedie" and len(o):
        pos = (o["y_freq"] == 1) & o["pred_sev"].notna()
        # Require at least 80 such rows before fitting the mapper.
        if pos.sum() >= 80:
            mapper = fit_tail_mapper_safe(
                o.loc[pos, "pred_sev"].to_numpy(),
                o.loc[pos, "y_sev"].to_numpy(),
            )
            sev_before = test_sev.copy()
            test_sev = apply_tail_mapper_safe(mapper, test_sev)
            # Ratio of the mapped spread to the unmapped one (denominator
            # floored to avoid a division by zero).
            std_ratio = float(np.std(test_sev) / max(np.std(sev_before), 1e-9))

            q99_oof = float(np.nanquantile(o.loc[pos, "pred_sev"].to_numpy(), 0.99))
            q99_test = float(np.nanquantile(test_sev, 0.99))
            # Safety guard: revert the mapping when it shrinks the spread
            # below 70% of the original, or when the mapped test q99 falls
            # below 60% of the OOF severity q99.
            if (std_ratio < 0.70) or (q99_test < 0.60 * q99_oof):
                test_sev = sev_before

    # Final prediction, clipped at zero: the direct output for the direct
    # Tweedie family, frequency x severity otherwise.
    if family == "direct_tweedie":
        pred = np.maximum(out["test_prime"], 0.0)
    else:
        pred = np.maximum(test_freq * test_sev, 0.0)
    preds[run_id] = pred

print("generated predictions for runs:", len(preds))


generated predictions for runs: 6


In [4]:
# Combine the per-run test predictions according to the stored ensemble
# metadata, then write the "robust" and "single" submission files.
strategy = ens_meta.get("strategy", "single")
run_ids = ens_meta.get("run_ids", list(preds.keys()))
# A null "weights" entry in the JSON is treated like a missing one.
weights = ens_meta.get("weights") or {}

# Resolve the fallback lazily: `ens_meta.get("best_single_run", run_ids[0])`
# evaluates `run_ids[0]` even when the key is present, which raises
# IndexError on an empty run list.
best_single = ens_meta.get("best_single_run")
if best_single is None:
    if not run_ids:
        raise ValueError(
            "ensemble metadata has no 'best_single_run' and no run ids to fall back on"
        )
    best_single = run_ids[0]
if best_single not in preds:
    raise KeyError(f"best single run {best_single!r} has no generated prediction")

if strategy == "ensemble":
    # Keep only the ensemble members that were actually predicted, in the
    # order given by the metadata.
    used_ids = [rid for rid in run_ids if rid in preds]
    if not used_ids:
        raise ValueError("none of the ensemble run ids has a generated prediction")
    mat = np.column_stack([preds[rid] for rid in used_ids])
    w = np.array([weights.get(rid, 0.0) for rid in used_ids], dtype=float)
    total = w.sum()
    # Fall back to uniform weights when the stored ones cannot be normalised
    # (non-positive or non-finite sum); otherwise rescale them to sum to 1.
    if not np.isfinite(total) or total <= 0:
        w = np.full(len(used_ids), 1.0 / len(used_ids))
    else:
        w = w / total
    pred_robust = mat @ w
else:
    pred_robust = preds[best_single]

pred_single = preds[best_single]

sub_robust = build_submission(test_raw["index"], pred_robust)
sub_single = build_submission(test_raw["index"], pred_single)

sub_robust.to_csv(ARTIFACT_V2 / "submission_v2_robust.csv", index=False)
sub_single.to_csv(ARTIFACT_V2 / "submission_v2_single.csv", index=False)

print("saved:", ARTIFACT_V2 / "submission_v2_robust.csv")
print("saved:", ARTIFACT_V2 / "submission_v2_single.csv")


saved: c:\Users\icemo\Downloads\Calcul-prime-d-assurance\artifacts\v2\submission_v2_robust.csv
saved: c:\Users\icemo\Downloads\Calcul-prime-d-assurance\artifacts\v2\submission_v2_single.csv


In [5]:
# Pre-submission audit: sanity checks on both submission files plus a
# comparison of their q99 against the out-of-fold reference.
robust_pred = sub_robust["pred"]
single_pred = sub_single["pred"]

robust_audit = compute_prediction_distribution_audit(
    robust_pred.to_numpy(),
    run_id="submission_v2_robust",
    split="test",
    sample="test",
)
single_audit = compute_prediction_distribution_audit(
    single_pred.to_numpy(),
    run_id="submission_v2_single",
    split="test",
    sample="test",
)

# Reference q99: median `pred_q99` over the selected runs' OOF rows on the
# primary time split; stays NaN when no audit row is available.
q99_oof_ref = np.nan
if not dist_audit.empty:
    selected_run_ids = selected["run_id"].tolist()
    is_primary_oof = (dist_audit["sample"] == "oof") & (dist_audit["split"] == "primary_time")
    oof_rows = dist_audit[is_primary_oof]
    oof_rows = oof_rows[oof_rows["run_id"].isin(selected_run_ids)]
    if len(oof_rows) > 0:
        q99_oof_ref = float(oof_rows["pred_q99"].median())

# Reference value and test/OOF ratios are reported as None when the
# reference is not finite; the denominator is floored at 1e-9.
if np.isfinite(q99_oof_ref):
    q99_ref_floor = max(q99_oof_ref, 1e-9)
    q99_ref_value = q99_oof_ref
    q99_ratio_robust = float(robust_audit["pred_q99"] / q99_ref_floor)
    q99_ratio_single = float(single_audit["pred_q99"] / q99_ref_floor)
else:
    q99_ref_value = None
    q99_ratio_robust = None
    q99_ratio_single = None

submission_audit = {
    "n_rows_robust": int(len(sub_robust)),
    "n_rows_single": int(len(sub_single)),
    "columns_robust": sub_robust.columns.tolist(),
    "columns_single": sub_single.columns.tolist(),
    "robust_non_negative": bool((robust_pred >= 0).all()),
    "single_non_negative": bool((single_pred >= 0).all()),
    "robust_no_nan": bool(robust_pred.notna().all()),
    "single_no_nan": bool(single_pred.notna().all()),
    "robust_distribution": robust_audit,
    "single_distribution": single_audit,
    "q99_oof_reference_primary": q99_ref_value,
    "q99_ratio_robust_test_over_oof_ref": q99_ratio_robust,
    "q99_ratio_single_test_over_oof_ref": q99_ratio_single,
}

audit_path = ARTIFACT_V2 / "submission_audit_v2.json"
audit_path.write_text(json.dumps(submission_audit, indent=2), encoding="utf-8")

# Short report pointing at every file produced for this submission.
report_path = ARTIFACT_V2 / "submission_report_v2.json"
report = {
    "strategy": strategy,
    "n_models_selected": int(len(selected)),
    "best_single_run": best_single,
    "submission_robust": str(ARTIFACT_V2 / "submission_v2_robust.csv"),
    "submission_single": str(ARTIFACT_V2 / "submission_v2_single.csv"),
    "submission_audit": str(audit_path),
}
report_path.write_text(json.dumps(report, indent=2), encoding="utf-8")

print("saved:", audit_path)
print("saved:", report_path)
print(report)


saved: c:\Users\icemo\Downloads\Calcul-prime-d-assurance\artifacts\v2\submission_audit_v2.json
saved: c:\Users\icemo\Downloads\Calcul-prime-d-assurance\artifacts\v2\submission_report_v2.json
{'strategy': 'single', 'n_models_selected': 6, 'best_single_run': 'base_v2|catboost|two_part_tweedie|cb_v2_c1|42|weighted_tail|isotonic|isotonic', 'submission_robust': 'c:\\Users\\icemo\\Downloads\\Calcul-prime-d-assurance\\artifacts\\v2\\submission_v2_robust.csv', 'submission_single': 'c:\\Users\\icemo\\Downloads\\Calcul-prime-d-assurance\\artifacts\\v2\\submission_v2_single.csv', 'submission_audit': 'c:\\Users\\icemo\\Downloads\\Calcul-prime-d-assurance\\artifacts\\v2\\submission_audit_v2.json'}
