In [None]:
# Core
import os, json
import numpy as np
import pandas as pd

# ML
import xgboost as xgb
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Explainability
import shap

# Project directories (relative to the repo)
RAW_DIR = "data/raw"
PROC_DIR = "data/processed"
MODELS_DIR = "models"
RESULTS_DIR = "results"

# Make sure every output directory exists; RAW_DIR is only read from here.
for _out_dir in (PROC_DIR, MODELS_DIR, RESULTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Expected raw files (named exactly as provided by the challenge)
PATH_X_RAW = f"{RAW_DIR}/train_input_Z61KlZo.csv"
PATH_Y_RAW = f"{RAW_DIR}/train_output_DzPxaPY.csv"
PATH_TEST_RAW = f"{RAW_DIR}/test_input_5qJzHrr.csv"

In [None]:
# Load the raw files: train input, train output and test input (in that order)
X_raw, y_raw, test_raw = (
    pd.read_csv(csv_path) for csv_path in (PATH_X_RAW, PATH_Y_RAW, PATH_TEST_RAW)
)

# Persist the test ID & ANNEE_ASSURANCE columns (needed to build the submission)
test_ids = test_raw.loc[:, ["ID", "ANNEE_ASSURANCE"]].copy()
test_ids.to_csv(f"{PROC_DIR}/test_ids.csv", index=False)

print("Shapes — X_raw:", X_raw.shape, "| y_raw:", y_raw.shape, "| test_raw:", test_raw.shape)

In [None]:
def preprocess(df: pd.DataFrame, drop_cols=("ID","ANNEE_ASSURANCE"),
               reference: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Drop identifier columns, integer-encode object columns and median-impute NaNs.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame to transform. It is never modified; a new frame is returned.
    drop_cols : iterable of str
        Columns removed when present (missing ones are silently ignored).
    reference : pd.DataFrame, optional
        Raw frame that defines the encoding and the imputation values. When
        given, the category -> code mapping of every object column and the
        medians used to fill NaNs are taken from ``reference`` instead of
        ``df``. Pass the raw *training* frame here when transforming the test
        frame, so that a given category string maps to the same integer code
        in both frames. When ``None`` (default) ``df`` is its own reference,
        which is the historical behaviour.

    Returns
    -------
    pd.DataFrame
        Transformed copy of ``df``. Object columns become integer codes;
        missing values and categories unknown to the reference are coded -1.
        A column whose reference median is NaN (e.g. all-NaN) is left as is.
    """
    # Work on an explicit copy so the caller's frame is never touched.
    out = df.drop(columns=[c for c in drop_cols if c in df.columns]).copy()

    if reference is None:
        ref_raw = out
    else:
        ref_raw = reference.drop(columns=[c for c in drop_cols if c in reference.columns])

    # Encode object columns with the reference's category list. Deriving the
    # categories independently per frame would give frame-specific codes.
    for col in out.columns:
        if out[col].dtype == "object":
            source = ref_raw[col] if col in ref_raw.columns else out[col]
            categories = source.astype("category").cat.categories
            out[col] = pd.Categorical(out[col], categories=categories).codes

    # Median imputation (avoids NaN for the models / SHAP). The medians come
    # from the encoded reference, so this re-encodes the reference once.
    if reference is None:
        fill_values = out.median(numeric_only=True)
    else:
        fill_values = preprocess(reference, drop_cols=drop_cols).median(numeric_only=True)
    return out.fillna(fill_values)

# The training frame is its own reference; the test frame reuses the training
# categories and medians so both frames share one encoding.
X          = preprocess(X_raw)
X_test_enc = preprocess(test_raw, reference=X_raw)
y          = y_raw.copy()

# (optional) save the encoded frames
X.to_csv(f"{PROC_DIR}/train_input_encoded.csv", index=False)
y.to_csv(f"{PROC_DIR}/train_output_encoded.csv", index=False)
X_test_enc.to_csv(f"{PROC_DIR}/test_input_encoded.csv", index=False)

print("After preprocess — X:", X.shape, "| X_test_enc:", X_test_enc.shape)


In [None]:
# Split: 1% of the rows (dev) / remaining 99% (holdout)
frac_dev = 0.01
rng = 42

# Sample the dev rows, then pick the matching target rows by index label
X_01 = X.sample(frac=frac_dev, random_state=rng)
dev_index = X_01.index
y_01 = y.loc[dev_index].copy()

# Every row that was not sampled above goes to the 99% part
X_99, y_99 = X.drop(index=dev_index), y.drop(index=dev_index)

print("Split — 1%:", X_01.shape, "| 99%:", X_99.shape)


In [None]:
# Hyperparameters shared by the two XGBoost models used for the SHAP ranking
xgb_base = {
    "objective": 'reg:squarederror',
    "n_estimators": 300,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "n_jobs": -1,
    "random_state": 42,
}


def _shap_ranking(target):
    """Fit an XGBoost regressor on the 1% split for `target` and explain it.

    Returns the fitted model, its SHAP explainer (built with X_01 as data) and
    the mean absolute SHAP value of each X_01 column.
    """
    model = xgb.XGBRegressor(max_depth=6, **xgb_base)
    model.fit(X_01, y_01[target])
    explainer = shap.Explainer(model, X_01)
    shap_frame = pd.DataFrame(explainer(X_01).values, columns=X_01.columns)
    return model, explainer, shap_frame.abs().mean()


# SHAP FREQ (1%)
xgb_freq, expl_freq, shap_freq = _shap_ranking("FREQ")

# SHAP CM (1%)
xgb_cm, expl_cm, shap_cm = _shap_ranking("CM")

# Weight & number of variables (taken from your tuning)
W_FREQ = 0.2807980997051884
HEAD_N = 10

# Weighted blend of the two importance vectors, largest first
combined = (W_FREQ * shap_freq + (1 - W_FREQ) * shap_cm).sort_values(ascending=False)
top_vars = combined.index[:HEAD_N].tolist()

top_vars


In [None]:
# Top-feature subsets: train on the 1% part, validate on the 99% part
Xtr, Xva = X_01[top_vars], X_99[top_vars]

# FREQ model (params taken from your trial 8557)
freq_params = {
    "max_depth": 5,
    "n_estimators": 300,
    "learning_rate": 0.10395715468545126,
    "subsample": 0.6920465310616473,
    "colsample_bytree": 0.7869785849086548,
    "objective": 'reg:squarederror',
    "n_jobs": -1,
    "random_state": 42,
}
freq_model = xgb.XGBRegressor(**freq_params)
freq_model.fit(Xtr, y_01["FREQ"])

# CM model (params taken from your trial 8557)
cm_params = {
    "max_iter": 300,
    "learning_rate": 0.025922727929923714,
    "max_depth": 9,
    "random_state": 42,
}
cm_model = HistGradientBoostingRegressor(**cm_params)
cm_model.fit(Xtr, y_01["CM"])


In [None]:
def recompose_charge(freq_pred, cm_pred, annee_arr,
                     a=0.2455497358609047, b=2.5044659323864145,
                     c=10, d=0.33307032562550604, log_base=2.375825235200949):
    """Combine frequency, mean-cost and year inputs into a CHARGE value.

    Computes, element-wise::

        |freq|**a * max(log_base(max(|cm|, 1e-12)) + c, 1e-12)**b
                  * log1p(max(annee, 0))**d

    NaNs in any of the three inputs are replaced by 0 before the computation.

    Parameters
    ----------
    freq_pred, cm_pred, annee_arr : array-like
        Inputs of the formula above.
    a, b, c, d, log_base : float
        Exponents, additive shift and logarithm base of the formula.

    Returns
    -------
    numpy.ndarray
        Product of the three terms.
    """
    tiny = 1e-12  # floor that keeps logarithms and power bases strictly positive

    # Numerical guards: NaN -> 0 for every input
    freq, cm, years = (
        np.nan_to_num(values, nan=0.0) for values in (freq_pred, cm_pred, annee_arr)
    )

    # Logarithm of |cm| in base `log_base`, shifted by c and floored
    log_cm_in_base = np.log(np.clip(np.abs(cm), tiny, None)) / np.log(max(log_base, tiny))
    shifted_log_cm = np.clip(log_cm_in_base + c, tiny, None)

    # Negative years are clipped to 0, so their term (and the product) is 0
    log_years = np.log1p(np.clip(years, 0.0, None))

    return np.power(np.abs(freq), a) * np.power(shifted_log_cm, b) * np.power(log_years, d)

# Evaluation on the 99% part
freq_pred, cm_pred = freq_model.predict(Xva), cm_model.predict(Xva)
annee_holdout = y_99["ANNEE_ASSURANCE"].to_numpy()
charge_pred = recompose_charge(freq_pred, cm_pred, annee_holdout)

# Reference charge: plain product of the three target columns
charge_real = (y_99["FREQ"] * y_99["CM"] * y_99["ANNEE_ASSURANCE"]).to_numpy()

rmse = np.sqrt(mean_squared_error(charge_real, charge_pred))
rmse


In [None]:
# Refit both models on the whole training set (top_vars only).
# Note: this refits the existing estimator objects in place.
X_full_top = X[top_vars]
for _model, _target in ((freq_model, "FREQ"), (cm_model, "CM")):
    _model.fit(X_full_top, y[_target])

# Encoded test set (top_vars only) and its predictions
X_test_top = X_test_enc[top_vars]
freq_submit = freq_model.predict(X_test_top)
cm_submit = cm_model.predict(X_test_top)

# Recompose CHARGE (using the test IDs / ANNEE_ASSURANCE saved earlier on disk)
test_ids = pd.read_csv(f"{PROC_DIR}/test_ids.csv")
annee_test = test_ids["ANNEE_ASSURANCE"]
charge_submit = recompose_charge(freq_submit, cm_submit, annee_test.to_numpy())

# Submission DataFrame
submission = pd.DataFrame({
    "ID": test_ids["ID"],
    "FREQ": freq_submit,
    "CM": cm_submit,
    "ANNEE_ASSURANCE": annee_test,
    "CHARGE": charge_submit
})
submission.to_csv(f"{RESULTS_DIR}/submission.csv", index=False)
submission.head()


In [None]:
# Save the top features & the useful hyperparameters as JSON
top_features_path = f"{MODELS_DIR}/top_features.json"
best_params_path = f"{MODELS_DIR}/best_params.json"


def _dump_json(payload, path):
    """Write `payload` to `path` as JSON, overwriting any existing file."""
    with open(path, "w") as f:
        json.dump(payload, f)


_dump_json(top_vars, top_features_path)

# Literal copies of the values used for the models and the CHARGE formula
best_params = {
    "w_freq": W_FREQ,
    "head_n": len(top_vars),
    "max_depth_freq": 5,
    "lr_freq": 0.10395715468545126,
    "subsample_freq": 0.6920465310616473,
    "colsample_freq": 0.7869785849086548,
    "lr_cm": 0.025922727929923714,
    "max_depth_cm": 9,
    "a": 0.2455497358609047,
    "b": 2.5044659323864145,
    "c": 10,
    "d": 0.33307032562550604,
    "log_base": 2.375825235200949,
}
_dump_json(best_params, best_params_path)
print("Saved:", f"{MODELS_DIR}/top_features.json", "and", f"{MODELS_DIR}/best_params.json")
