# ENS Data Challenge 2025 – AssurPrime

This notebook implements an end-to-end pipeline to predict the insurance charge **CHARGE** from anonymized client and contract data.

**Workflow**
1. Load raw data and set up repository paths.
2. Preprocess and encode features; keep test IDs for the final submission.
3. Deterministic split: **1%** (used to fit the models) and **99%** (validation split; the Optuna objective is also scored on it, so it is not an untouched holdout).
4. SHAP-based mixed feature ranking (separate models for FREQ and CM).
5. Hyperparameter tuning with Optuna for model and post-processing parameters.
6. Training on the 1% split with the best parameters and selected features, evaluation on the 99% split, then a refit on the full training set for the submission.
7. Submission file generation for the challenge platform.

Expected repository structure:
```
data/
  raw/         # challenge CSVs (download separately; not versioned)
  processed/   # intermediate artifacts (generated locally)
models/        # saved params/models (optional)
results/       # figures and submission files
```


In [None]:
# Core
import os
import json
import numpy as np
import pandas as pd

# Machine Learning
import xgboost as xgb
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Explainability
import shap

# Project layout: every location below is relative to the repository root.
RAW_DIR = "data/raw"
PROC_DIR = "data/processed"
MODELS_DIR = "models"
RESULTS_DIR = "results"

# Only the output folders are created here; RAW_DIR is expected to exist
# already because the challenge CSVs are placed there by hand.
for _out_dir in (PROC_DIR, MODELS_DIR, RESULTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Raw challenge files inside data/raw/: train input, train output, test input.
PATH_X_RAW = f"{RAW_DIR}/train_input_Z61KlZo.csv"
PATH_Y_RAW = f"{RAW_DIR}/train_output_DzPxaPY.csv"
PATH_TEST_RAW = f"{RAW_DIR}/test_input_5qJzHrr.csv"


In [None]:
# Read the three challenge CSVs (train input, train output, test input), in that order
X_raw, y_raw, test_raw = (
    pd.read_csv(csv_path) for csv_path in (PATH_X_RAW, PATH_Y_RAW, PATH_TEST_RAW)
)

# Report the shapes as a quick sanity check
print("Loaded raw data.")
print("X_raw:", X_raw.shape, "| y_raw:", y_raw.shape, "| test_raw:", test_raw.shape)


In [None]:
# Set the test identifiers and ANNEE_ASSURANCE aside: both are read back
# when the submission file is assembled.
_id_cols = ["ID", "ANNEE_ASSURANCE"]
test_ids = test_raw[_id_cols].copy()
test_ids.to_csv(f"{PROC_DIR}/test_ids.csv", index=False)

# Remove those columns from the feature frames before encoding. The drop is
# done in place, and columns that are already absent are skipped silently.
for df in (X_raw, test_raw):
    df.drop(columns=_id_cols, errors="ignore", inplace=True)

# Encode categorical columns and impute numeric NaNs with median
def _is_categorical_like(col: pd.Series) -> bool:
    """Return True for object, string and categorical columns."""
    return (
        pd.api.types.is_object_dtype(col)
        or pd.api.types.is_string_dtype(col)
        or isinstance(col.dtype, pd.CategoricalDtype)
    )


def _encode_categoricals(df: pd.DataFrame, reference: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with categorical-like columns replaced by integer codes.

    The category -> code mapping of each column is taken from the same column
    of `reference`; columns missing from `reference` fall back to `df` itself.
    """
    out = df.copy()
    for name in out.columns:
        source = reference[name] if name in reference.columns else out[name]
        if _is_categorical_like(source):
            categories = source.astype("category").cat.categories
            # Values outside `categories` (and missing values) get code -1.
            out[name] = pd.Categorical(out[name], categories=categories).codes
    return out


def encode_impute(df: pd.DataFrame, reference: pd.DataFrame | None = None) -> pd.DataFrame:
    """Integer-encode categorical columns and median-impute numeric NaNs.

    Parameters
    ----------
    df
        Frame to transform. It is not modified; a transformed copy is returned.
    reference
        Optional frame that defines the category -> code mapping and the
        imputation medians (typically the training frame when `df` is the
        test frame). When omitted, `df` is its own reference, which is the
        historical behaviour of this function.

    Returns
    -------
    pd.DataFrame
        Copy of `df` where object/string/categorical columns hold integer
        category codes (-1 for missing values and for categories that do not
        exist in the reference) and remaining NaNs are replaced by the column
        median. A column whose median is itself NaN is left untouched.
    """
    own = df if reference is None else reference
    encoded = _encode_categoricals(df, own)
    medians = encoded.median(numeric_only=True)
    if reference is not None:
        # Reference medians take precedence; `df`'s own medians only cover
        # columns the reference cannot provide a value for.
        ref_medians = _encode_categoricals(reference, reference).median(numeric_only=True)
        medians = ref_medians.combine_first(medians)
    return encoded.fillna(medians)


X          = encode_impute(X_raw)
# The test set reuses the training categories and medians. Encoding it on its
# own would assign codes from the test set's own category list, so the same
# category could end up with different integers in train and test.
X_test_enc = encode_impute(test_raw, reference=X_raw)
y          = y_raw.copy()

# Persist processed artifacts (optional, for reproducibility)
X.to_csv(f"{PROC_DIR}/train_input_encoded.csv", index=False)
y.to_csv(f"{PROC_DIR}/train_output_encoded.csv", index=False)
X_test_enc.to_csv(f"{PROC_DIR}/test_input_encoded.csv", index=False)

print("Preprocessing complete.")
print("X:", X.shape, "| X_test_enc:", X_test_enc.shape)


In [None]:
# Row-level split with a fixed seed: 1% sample, remaining 99% kept apart
rng = 42
X_01 = X.sample(frac=0.01, random_state=rng)
dev_rows = X_01.index

# Targets are selected through the same row labels as the features
y_01 = y.loc[dev_rows].copy()
X_99, y_99 = X.drop(dev_rows), y.drop(dev_rows)

# Write the four pieces to disk (optional)
for _path, _frame in {
    f"{PROC_DIR}/train_input_reduit_0_01.csv": X_01,
    f"{PROC_DIR}/train_output_reduit_0_01.csv": y_01,
    f"{PROC_DIR}/train_input_reduit_0_99.csv": X_99,
    f"{PROC_DIR}/train_output_reduit_0_99.csv": y_99,
}.items():
    _frame.to_csv(_path, index=False)

print("Split complete.")
print("1%:", X_01.shape, "| 99%:", X_99.shape)


In [None]:
# XGBoost settings shared by the two models used for SHAP estimation
xgb_base = {
    "objective": "reg:squarederror",
    "n_estimators": 300,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "n_jobs": -1,
    "random_state": 42,
}


def _mean_abs_shap(explainer, data):
    """Return the mean absolute SHAP value of each column of `data`."""
    shap_values = explainer(data).values
    return pd.DataFrame(shap_values, columns=data.columns).abs().mean()


# FREQ: fit on the 1% sample, then explain that same sample
xgb_freq = xgb.XGBRegressor(max_depth=6, **xgb_base)
xgb_freq.fit(X_01, y_01["FREQ"])
expl_freq = shap.Explainer(xgb_freq, X_01)
shap_freq = _mean_abs_shap(expl_freq, X_01)

# CM: same procedure with the CM target
xgb_cm = xgb.XGBRegressor(max_depth=6, **xgb_base)
xgb_cm.fit(X_01, y_01["CM"])
expl_cm = shap.Explainer(xgb_cm, X_01)
shap_cm = _mean_abs_shap(expl_cm, X_01)

# Weight given to the FREQ importances and number of features kept
# (replace with your tuned values if known)
W_FREQ = 0.2807980997051884
HEAD_N = 10

# Weighted blend of the two importance vectors, largest first
combined_shap = W_FREQ * shap_freq + (1 - W_FREQ) * shap_cm
combined_shap = combined_shap.sort_values(ascending=False)
top_vars = combined_shap.head(HEAD_N).index.tolist()

print("Selected features:", top_vars)


In [None]:
import optuna

# Reference values on the 99% split: CHARGE is rebuilt as FREQ * CM * ANNEE_ASSURANCE
annee_assurance_99 = y_99["ANNEE_ASSURANCE"].to_numpy()
charge_real_99 = y_99["FREQ"].mul(y_99["CM"]).mul(y_99["ANNEE_ASSURANCE"]).to_numpy()

# Optuna search space as (low, high) bounds per parameter.
# `objective` samples a parameter as an integer when both bounds are ints
# (head_n, max_depth_freq, max_depth_cm and also c) and as a float otherwise.
PARAM_RANGES = dict(
    # feature selection
    w_freq=(0.01, 0.4),
    head_n=(6, 20),
    # FREQ model (XGBoost)
    max_depth_freq=(2, 10),
    lr_freq=(0.01, 0.12),
    subsample_freq=(0.6, 1.0),
    colsample_freq=(0.6, 1.0),
    # CM model (HistGradientBoosting)
    lr_cm=(0.02, 0.1),
    max_depth_cm=(2, 10),
    # CHARGE recomposition
    a=(0.005, 0.25),
    b=(1.3, 2.6),
    c=(9, 17),
    d=(0.15, 0.45),
    log_base=(1.2, 3.2),
)

def objective(trial):
    """Optuna objective: RMSE of the recomposed CHARGE on the 99% split.

    For one trial this samples every entry of ``PARAM_RANGES``, selects the
    ``head_n`` best features of the weighted SHAP ranking, fits a FREQ model
    (XGBoost) and a CM model (HistGradientBoosting) on the 1% split, predicts
    on the 99% split and recomposes

        CHARGE = |FREQ|**a * max(log_{log_base}|CM| + c, 1e-8)**b
                 * log1p(max(ANNEE_ASSURANCE, 0))**d

    Parameters
    ----------
    trial
        The Optuna trial used to sample the parameters.

    Returns
    -------
    float
        RMSE against ``charge_real_99``, or ``inf`` when the recomposed
        prediction contains a NaN or an infinite value.
    """
    # Sample parameters: two int bounds -> integer parameter, otherwise float.
    # The dict order of PARAM_RANGES fixes the order of the suggest calls.
    p = {
        name: (
            trial.suggest_int(name, lo, hi)
            if isinstance(lo, int) and isinstance(hi, int)
            else trial.suggest_float(name, lo, hi)
        )
        for name, (lo, hi) in PARAM_RANGES.items()
    }

    # Mixed SHAP feature selection
    combined = (p["w_freq"] * shap_freq + (1 - p["w_freq"]) * shap_cm).sort_values(ascending=False)
    top = combined.head(p["head_n"]).index.tolist()

    X_tr = X_01[top]
    X_va = X_99[top]

    # Models
    freq_model = xgb.XGBRegressor(
        max_depth=p["max_depth_freq"], n_estimators=300, learning_rate=p["lr_freq"],
        subsample=p["subsample_freq"], colsample_bytree=p["colsample_freq"],
        objective="reg:squarederror", n_jobs=-1, random_state=42
    )
    freq_model.fit(X_tr, y_01["FREQ"])

    cm_model = HistGradientBoostingRegressor(
        max_iter=300, learning_rate=p["lr_cm"], max_depth=p["max_depth_cm"], random_state=42
    )
    cm_model.fit(X_tr, y_01["CM"])

    # Predictions on 99%
    freq_pred = freq_model.predict(X_va)
    cm_pred   = cm_model.predict(X_va)

    # Charge recomposition. The clips keep log() and the fractional powers
    # away from zero and negative arguments.
    freq_term = np.power(np.abs(freq_pred), p["a"])
    log_cm = np.log(np.clip(np.abs(cm_pred), 1e-8, None)) / np.log(max(p["log_base"], 1e-8))
    base_cm = np.clip(log_cm + p["c"], 1e-8, None)
    cm_term = np.power(base_cm, p["b"])
    ann_term = np.power(np.log1p(np.clip(annee_assurance_99, 0.0, None)), p["d"])

    charge_pred = freq_term * cm_term * ann_term
    # mean_squared_error rejects NaN and +/-inf alike; an uncaught exception
    # would abort the study, so any non-finite prediction is scored as the
    # worst possible value instead.
    if not np.isfinite(charge_pred).all():
        return float("inf")

    return float(np.sqrt(mean_squared_error(charge_real_99, charge_pred)))

# TPE is Optuna's default sampler; seeding it makes the sequence of sampled
# parameters repeatable from one run to the next.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)  # Here only 10 trials but we let it run for days

print("Best params:", study.best_params)
print("Best RMSE (99% holdout):", study.best_value)

# Save best params (read back by the final-training cell)
with open(f"{MODELS_DIR}/best_params.json", "w", encoding="utf-8") as f:
    json.dump(study.best_params, f, indent=2)


In [None]:
# Read back the tuned parameters written by the Optuna step
with open(f"{MODELS_DIR}/best_params.json") as f:
    params = json.load(f)

# Rebuild the weighted SHAP ranking from the stored w_freq / head_n
combined_final = params["w_freq"] * shap_freq + (1 - params["w_freq"]) * shap_cm
combined_final = combined_final.sort_values(ascending=False)
top_vars_final = combined_final.head(params["head_n"]).index.tolist()

# Restrict both splits to the selected features
X_tr_best = X_01[top_vars_final]
X_va_best = X_99[top_vars_final]

# FREQ model (XGBoost), fitted on the 1% split
freq_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=300,
    max_depth=params["max_depth_freq"],
    learning_rate=params["lr_freq"],
    subsample=params["subsample_freq"],
    colsample_bytree=params["colsample_freq"],
    n_jobs=-1,
    random_state=42,
)
freq_model.fit(X_tr_best, y_01["FREQ"])

# CM model (HistGradientBoosting), fitted on the 1% split
cm_model = HistGradientBoostingRegressor(
    max_iter=300,
    max_depth=params["max_depth_cm"],
    learning_rate=params["lr_cm"],
    random_state=42,
)
cm_model.fit(X_tr_best, y_01["CM"])

# Reference values and predictions on the 99% split
annee_99  = y_99["ANNEE_ASSURANCE"].to_numpy()
charge_real = (y_99["FREQ"] * y_99["CM"] * y_99["ANNEE_ASSURANCE"]).to_numpy()
freq_pred = freq_model.predict(X_va_best)
cm_pred   = cm_model.predict(X_va_best)

# Recompose CHARGE from the three terms; the clips keep log() and the
# fractional powers away from zero and negative arguments
freq_term = np.power(np.abs(freq_pred), params["a"])
log_cm = np.log(np.clip(np.abs(cm_pred), 1e-8, None)) / np.log(max(params["log_base"], 1e-8))
base_cm = np.clip(log_cm + params["c"], 1e-8, None)
cm_term = np.power(base_cm, params["b"])
ann_term = np.power(np.log1p(np.clip(annee_99, 0.0, None)), params["d"])
charge_pred = freq_term * cm_term * ann_term

rmse = np.sqrt(mean_squared_error(charge_real, charge_pred))
print(f"RMSE on 99% holdout: {rmse:.6f}")

# Refit the same two estimators on the full training set; these refitted
# models are the ones used for the submission
X_full_top = X[top_vars_final]
freq_model.fit(X_full_top, y["FREQ"])
cm_model.fit(X_full_top, y["CM"])


In [None]:
# Reload the encoded test features and the identifiers saved during preprocessing
X_submit = pd.read_csv(f"{PROC_DIR}/test_input_encoded.csv")
ids_df   = pd.read_csv(f"{PROC_DIR}/test_ids.csv")

# Keep only the features the final models were fitted on
X_submit_best = X_submit[top_vars_final]

# Model outputs on the test set
annee_submit= ids_df["ANNEE_ASSURANCE"].to_numpy()
freq_submit = freq_model.predict(X_submit_best)
cm_submit   = cm_model.predict(X_submit_best)

# Recompose CHARGE; the clips keep log() and the fractional powers away
# from zero and negative arguments
freq_term_s = np.power(np.abs(freq_submit), params["a"])
log_cm_s = np.log(np.clip(np.abs(cm_submit), 1e-8, None)) / np.log(max(params["log_base"], 1e-8))
base_cm_s = np.clip(log_cm_s + params["c"], 1e-8, None)
cm_term_s = np.power(base_cm_s, params["b"])
ann_term_s= np.power(np.log1p(np.clip(annee_submit, 0.0, None)), params["d"])
charge_submit = freq_term_s * cm_term_s * ann_term_s

# Assemble the submission table and write it to the results folder
submission = pd.DataFrame({
    "ID": ids_df["ID"],
    "FREQ": freq_submit,
    "CM": cm_submit,
    "ANNEE_ASSURANCE": annee_submit,
    "CHARGE": charge_submit,
})
submission.to_csv(f"{RESULTS_DIR}/submission.csv", index=False)
print("Submission saved at:", f"{RESULTS_DIR}/submission.csv")
