# 03 — Ensemble + Submission (V1)

Ce notebook:
- sélectionne les runs finalistes,
- optimise les poids d’ensemble (>=0, somme=1),
- valide primaire/secondaire,
- simule 2000 shake-ups public/private,
- refit full train des finalistes,
- exporte `artifacts/submission_v1.csv` (+ `submission.csv`).


In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd

# Resolve the project root: the current working directory when it contains
# `src`, otherwise its parent directory.
ROOT = Path.cwd()
ROOT = ROOT if (ROOT / "src").exists() else ROOT.parent
# Put the root at the front of sys.path (once) so the `src.*` import resolves.
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))

from src.v1_pipeline import (
    COARSE_CONFIGS,
    ensure_dir,
    load_json,
    load_train_test,
    optimize_non_negative_weights,
    prepare_datasets,
    fit_full_two_part_predict,
    fit_calibrator,
    apply_calibrator,
    simulate_public_private_shakeup,
    rmse,
    build_submission,
    save_json,
)

# Folder handed to load_train_test to read the raw train/test data.
DATA_DIR = ROOT / "data"
# Folder that holds every artifact this notebook reads and writes.
# NOTE(review): ensure_dir presumably creates the folder when missing and
# returns the path — confirm in src.v1_pipeline.
ARTIFACT_DIR = ensure_dir(ROOT / "artifacts")


In [None]:
# Artifacts produced by the earlier notebooks: per-run metrics, out-of-fold
# predictions and the configurations kept per engine.
run_registry = pd.read_csv(ARTIFACT_DIR / "run_registry.csv")
oof = pd.read_parquet(ARTIFACT_DIR / "oof_predictions.parquet")
selected_configs = load_json(ARTIFACT_DIR / "selected_configs.json")

# Raw data and the prepared modelling bundle (identifier columns dropped).
train_raw, test_raw = load_train_test(DATA_DIR)
bundle = prepare_datasets(train_raw, test_raw, drop_identifiers=True)

# Only registry rows recorded at the "run" level are used from here on.
is_run_level = run_registry["level"].eq("run")
run_metrics = run_registry.loc[is_run_level].copy()

# Columns that together identify one run.
key_cols = [
    "engine",
    "config_id",
    "seed",
    "severity_mode",
    "calibration",
]

def mk_run_id(df):
    """Return one string identifier per row of ``df``.

    The identifier concatenates the ``engine``, ``config_id``, ``seed``,
    ``severity_mode`` and ``calibration`` columns (each cast to ``str``),
    separated by ``"|"``. The result is a Series aligned on ``df``'s index.
    """
    id_fields = ("engine", "config_id", "seed", "severity_mode", "calibration")
    pieces = [df[name].astype(str) for name in id_fields]

    run_id = pieces[0]
    for piece in pieces[1:]:
        run_id = run_id + "|" + piece
    return run_id

# Tag the metrics table and the OOF table with the same composite run
# identifier so both can be filtered per run later on.
for _frame in (run_metrics, oof):
    _frame["run_id"] = mk_run_id(_frame)

print(f"Run metrics: {run_metrics.shape}")
print(f"OOF rows: {oof.shape}")


In [None]:
# Candidate runs: best run per engine on the primary split, guarded by the
# secondary split when it is available.
prim = run_metrics[run_metrics["split"] == "primary_time"].copy()
sec = run_metrics[run_metrics["split"] == "secondary_group"].copy()

# Put the secondary-split metrics next to the primary ones. The left join
# keeps primary runs that have no secondary row (their columns are NaN).
merged = prim.merge(
    sec[key_cols + ["rmse_prime", "q99_ratio_pos"]].rename(
        columns={
            "rmse_prime": "rmse_prime_secondary",
            "q99_ratio_pos": "q99_ratio_secondary",
        }
    ),
    on=key_cols,
    how="left",
)
merged["rmse_gap_secondary_minus_primary"] = (
    merged["rmse_prime_secondary"] - merged["rmse_prime"]
)

# Restrict to the configurations selected in notebook 02.
mask_sel = np.zeros(len(merged), dtype=bool)
for engine, cfg_ids in selected_configs.items():
    mask_sel |= (merged["engine"].eq(engine) & merged["config_id"].isin(cfg_ids))
merged = merged[mask_sel].copy()

# Fail early with an actionable message: without this guard an empty
# selection only surfaces later as pd.concat's "No objects to concatenate".
if merged.empty:
    raise ValueError(
        "No 'primary_time' run in run_registry.csv matches selected_configs.json; "
        "cannot select finalists."
    )

finalists = []
for engine, g in merged.groupby("engine"):
    # Overfit guard: drop runs whose secondary RMSE exceeds the primary RMSE
    # by more than 10. A missing secondary score counts as a zero gap.
    g_ok = g[g["rmse_gap_secondary_minus_primary"].fillna(0) <= 10.0]
    if g_ok.empty:
        # Every run of this engine failed the guard: fall back to all of them.
        g_ok = g
    # Best primary RMSE wins; the secondary RMSE breaks ties.
    best = g_ok.sort_values(["rmse_prime", "rmse_prime_secondary"]).head(1)
    finalists.append(best)
finalists = pd.concat(finalists, ignore_index=True)
finalists["run_id"] = mk_run_id(finalists)

finalists


In [None]:
# Prediction matrices used to optimise the ensemble weights
def build_matrix(pred_df, split_name, run_ids, is_test=0):
    """Pivot long-format predictions into a (row_idx x run_id) matrix.

    Parameters
    ----------
    pred_df : DataFrame with at least the columns ``split``, ``is_test``,
        ``run_id``, ``row_idx``, ``pred_prime`` and (when ``is_test == 0``)
        ``y_sev``.
    split_name : value of ``split`` to keep.
    run_ids : collection of run identifiers to keep.
    is_test : value of ``is_test`` to keep (default 0).

    Returns
    -------
    (wide, y) : ``wide`` holds ``pred_prime`` with one column per run (first
        value per ``row_idx``/``run_id`` pair). ``y`` is the first ``y_sev``
        per ``row_idx`` when ``is_test == 0``, otherwise an all-NaN float
        Series. ``y`` always shares ``wide``'s index, so a boolean mask
        computed on ``wide`` can be applied to ``y`` directly.
    """
    d = pred_df[(pred_df["split"] == split_name) & (pred_df["is_test"] == is_test)].copy()
    d = d[d["run_id"].isin(run_ids)]
    wide = d.pivot_table(index="row_idx", columns="run_id", values="pred_prime", aggfunc="first")
    if is_test == 0:
        # pivot_table may drop rows whose predictions are all NaN, whereas the
        # groupby keeps every row_idx. Reindex so both outputs stay aligned.
        y = d.groupby("row_idx")["y_sev"].first().reindex(wide.index)
    else:
        y = pd.Series(index=wide.index, dtype=float)
    return wide, y

run_ids = finalists["run_id"].to_list()

# Primary-split OOF matrix; keep only rows where every finalist has a
# prediction so all runs are compared on the same observations.
Xp, yp = build_matrix(oof, "primary_time", run_ids, is_test=0)
mask = ~Xp.isna().any(axis=1)
Xp_fit = Xp[mask]
yp_fit = yp[mask]

# Ensemble weights, one per column of Xp_fit, in column order.
weights = optimize_non_negative_weights(Xp_fit.values, yp_fit.values)
weight_map = dict(zip(Xp_fit.columns.tolist(), (float(w) for w in weights)))
weight_map


In [None]:
# Validation on the primary and secondary splits, plus the best single run
# kept as a fallback.
ens_primary_pred = Xp_fit.values @ weights
ens_primary_rmse = rmse(yp_fit.values, ens_primary_pred)

Xs, ys = build_matrix(oof, "secondary_group", run_ids, is_test=0)
# `weights` is ordered like Xp_fit.columns, so force the secondary matrix onto
# the same columns. A finalist without secondary predictions becomes an
# all-NaN column: sec_mask is then empty and the primary split is used instead
# of failing on a shape mismatch in the matrix product.
Xs = Xs.reindex(columns=Xp_fit.columns)
# Keep the targets on the same rows as the matrix so the mask applies to both.
ys = ys.reindex(Xs.index)
sec_mask = Xs.notna().all(axis=1)
has_secondary = bool(sec_mask.any())

if has_secondary:
    ens_secondary_pred = Xs.loc[sec_mask].values @ weights
    ens_secondary_rmse = rmse(ys.loc[sec_mask].values, ens_secondary_pred)
else:
    ens_secondary_rmse = np.nan

# Best single run: scored on the secondary split when available, otherwise on
# the primary split.
if has_secondary:
    eval_X, eval_y = Xs.loc[sec_mask], ys.loc[sec_mask]
else:
    eval_X, eval_y = Xp_fit, yp_fit
single_scores = [
    (rid, rmse(eval_y.values, eval_X[rid].values))
    for rid in Xp_fit.columns.tolist()
]
best_single_run, best_single_rmse = sorted(single_scores, key=lambda x: x[1])[0]

print("Ensemble RMSE primary:", round(float(ens_primary_rmse), 6))
print("Ensemble RMSE secondary:", round(float(ens_secondary_rmse), 6) if not np.isnan(ens_secondary_rmse) else np.nan)
print("Best single run:", best_single_run)
print("Best single RMSE :", round(float(best_single_rmse), 6))


In [None]:
# Public/private shake-up simulation on the primary-split OOF predictions.
# Both simulations use the same settings (2000 draws, 1/3 public, seed 42).
shake_ens = simulate_public_private_shakeup(
    yp_fit.values,
    ens_primary_pred,
    n_sim=2000,
    public_ratio=1/3,
    seed=42,
)
shake_single = simulate_public_private_shakeup(
    yp_fit.values,
    Xp_fit[best_single_run].values,
    n_sim=2000,
    public_ratio=1/3,
    seed=42,
)

# Spread of the public-minus-private gap for each strategy.
ens_gap_std = float(shake_ens["gap_public_minus_private"].std())
single_gap_std = float(shake_single["gap_public_minus_private"].std())

# Final decision: keep the ensemble unless it is worse than the best single
# run on the secondary split (margin of 1.0) or its gap is more than 5% wider.
ens_worse_on_secondary = (
    not np.isnan(ens_secondary_rmse)
    and ens_secondary_rmse > best_single_rmse + 1.0
)
ens_less_stable = ens_gap_std > single_gap_std * 1.05
use_ensemble = not (ens_worse_on_secondary or ens_less_stable)

print(f"ens_gap_std: {round(ens_gap_std, 6)}")
print(f"single_gap_std: {round(single_gap_std, 6)}")
print(f"use_ensemble: {use_ensemble}")

# Persist the simulation draws.
shake_ens.to_parquet(ARTIFACT_DIR / "shakeup_ensemble.parquet", index=False)
shake_single.to_parquet(ARTIFACT_DIR / "shakeup_single.parquet", index=False)


In [None]:
# Refit each finalist run on the full training set, then predict the test set.
# Result: full_test_preds maps run_id -> array of test predictions.

# engine -> {config_id -> config dict}, to retrieve each finalist's
# hyper-parameters from the coarse grid.
cfg_lookup = {
    engine: {c["config_id"]: c for c in cfgs}
    for engine, cfgs in COARSE_CONFIGS.items()
}

full_test_preds = {}
# Training targets as plain NumPy arrays (frequency as int, severity as float).
y_freq_np = bundle.y_freq.to_numpy(dtype=int)
y_sev_np = bundle.y_sev.to_numpy(dtype=float)

for _, r in finalists.iterrows():
    rid = r["run_id"]
    engine = r["engine"]
    config_id = r["config_id"]
    seed = int(r["seed"])
    severity_mode = r["severity_mode"]
    calibration = r["calibration"]

    # NOTE(review): assumes config_id read from the registry CSV has the same
    # type as the config_id keys of COARSE_CONFIGS — a mismatch raises KeyError.
    cfg = cfg_lookup[engine][config_id]
    # Fit on the whole training set; returns the raw frequency prediction and
    # the severity prediction for the test rows.
    freq_raw_te, sev_te = fit_full_two_part_predict(
        engine=engine,
        X_train=bundle.X_train,
        y_freq_train=y_freq_np,
        y_sev_train=y_sev_np,
        X_test=bundle.X_test,
        cat_cols=bundle.cat_cols,
        seed=seed,
        severity_mode=severity_mode,
        freq_params=cfg["freq_params"],
        sev_params=cfg["sev_params"],
    )

    if calibration != "none":
        # Calibrator learned on this run's OOF frequency predictions
        # (primary split), then applied to the test frequency predictions.
        oof_run = oof[
            (oof["is_test"] == 0)
            & (oof["split"] == "primary_time")
            & (oof["run_id"] == rid)
        ].copy()
        # Rows without an OOF frequency prediction cannot be used for fitting.
        valid = oof_run["pred_freq"].notna()
        cal = fit_calibrator(
            oof_run.loc[valid, "pred_freq"].to_numpy(),
            oof_run.loc[valid, "y_freq"].to_numpy(),
            method=calibration,
        )
        freq_te = apply_calibrator(cal, freq_raw_te, method=calibration)
    else:
        freq_te = freq_raw_te

    # Final prediction = frequency x severity, clipped at zero.
    full_test_preds[rid] = np.maximum(freq_te * sev_te, 0.0)


In [None]:
# Final blend and submission export
if not use_ensemble:
    # Fallback: predictions of the best single run only.
    final_strategy = "single"
    pred_final = full_test_preds[best_single_run]
else:
    # Weighted blend; columns follow Xp_fit so they line up with the weights.
    final_strategy = "ensemble"
    final_runs = Xp_fit.columns.tolist()
    w = np.array([weight_map[rid] for rid in final_runs], dtype=float)
    test_matrix = np.column_stack([full_test_preds[rid] for rid in final_runs])
    pred_final = test_matrix @ w

submission = build_submission(test_raw["index"], pred_final)
# The same file is written twice: versioned name and generic name.
for _csv_name in ("submission_v1.csv", "submission.csv"):
    submission.to_csv(ARTIFACT_DIR / _csv_name, index=False)

# Summary of the decision and of the scores that led to it.
final_meta = dict(
    final_strategy=final_strategy,
    final_runs=finalists["run_id"].tolist(),
    weights=weight_map,
    best_single_run=best_single_run,
    ens_primary_rmse=float(ens_primary_rmse),
    ens_secondary_rmse=None if np.isnan(ens_secondary_rmse) else float(ens_secondary_rmse),
    best_single_rmse=float(best_single_rmse),
    ens_gap_std=float(ens_gap_std),
    single_gap_std=float(single_gap_std),
)
save_json(final_meta, ARTIFACT_DIR / "ensemble_weights_v1.json")
finalists.to_csv(ARTIFACT_DIR / "finalist_runs.csv", index=False)

print("Saved:")
for _saved_name in (
    "submission_v1.csv",
    "submission.csv",
    "ensemble_weights_v1.json",
    "finalist_runs.csv",
):
    print("-", ARTIFACT_DIR / _saved_name)
submission.head()
