In [10]:
import os
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import xgboost as xgb
import joblib


# =========================
# 1) Temporal panel split
# =========================
# Name of the regression target column; defined first so every later
# reference (NaN filtering, feature exclusion, label extraction) uses it.
TARGET = "yield"

# Last year (inclusive) of the training and validation windows; everything
# after VAL_END_YEAR is the held-out test period.
# NOTE: the exported prediction file names and the printed test header further
# down hard-code "2017"; keep them in sync if these cut-offs change.
TRAIN_END_YEAR = 2012
VAL_END_YEAR = 2016

# Work on a copy so the upstream `model_df` is never modified, and keep only
# rows with a known target.
df = model_df.dropna(subset=[TARGET]).copy()

# When a scenario column exists, keep only the "historical" rows.
has_scenario = "scenario" in df.columns
if has_scenario:
    df = df[df["scenario"] == "historical"].copy()

# Coerce unparsable years to missing, drop those rows, then store plain ints
# so the year comparisons below are exact.
df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
df = df.dropna(subset=["year"]).copy()
df["year"] = df["year"].astype(int)

# Deterministic row order: by department, then chronologically.
df = df.sort_values(["nom_dep", "year"]).reset_index(drop=True)

# Columns that must never reach the model as features.
EXCLUDE_COLS = [TARGET]
if has_scenario:
    EXCLUDE_COLS.append("scenario")

X = df.drop(columns=EXCLUDE_COLS)
y = df[TARGET].astype(float)

# Recorded so the saved bundle can state the expected input columns.
FEATURE_COLUMNS = X.columns.tolist()

# Contiguous, non-overlapping year windows derived from the two cut-offs
# (years are ints, so "> 2012" is the same as ">= 2013").
train_mask = df["year"] <= TRAIN_END_YEAR
val_mask   = (df["year"] > TRAIN_END_YEAR) & (df["year"] <= VAL_END_YEAR)
test_mask  = df["year"] > VAL_END_YEAR

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_val,   y_val   = X.loc[val_mask],   y.loc[val_mask]
X_test,  y_test  = X.loc[test_mask],  y.loc[test_mask]

print("Train:", X_train.shape, "| Val:", X_val.shape, "| Test:", X_test.shape)

# Fail fast with an explicit message: an empty split would otherwise only
# surface later, inside preprocessing or training, with an unrelated error.
for _split_name, _split_mask in (("train", train_mask), ("val", val_mask), ("test", test_mask)):
    if not _split_mask.any():
        raise ValueError(
            f"Temporal split '{_split_name}' is empty; check the 'year' values "
            f"against TRAIN_END_YEAR={TRAIN_END_YEAR} and VAL_END_YEAR={VAL_END_YEAR}."
        )


# =========================
# 2) Preprocessing (fitted on TRAIN only)
# =========================
# Columns whose dtype is numeric are median-imputed; every remaining column
# is treated as categorical (mode-imputed, then one-hot encoded).
numeric_features = list(
    X_train.select_dtypes(include=["number", "float", "int", "Int64"]).columns
)
_numeric_lookup = set(numeric_features)
categorical_features = [col for col in X_train.columns if col not in _numeric_lookup]

numeric_pipeline = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# handle_unknown="ignore": categories absent from the fitted data do not
# raise at transform time.
categorical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ],
    remainder="drop",
)

# Fit on the training window only, then apply the fitted transformer to the
# validation and test windows.
Xtr = preprocess.fit_transform(X_train)
Xva, Xte = (preprocess.transform(part) for part in (X_val, X_test))


# =========================
# 3) XGBoost native training (DMatrix) + early stopping
# =========================
# Wrap each preprocessed split, together with its labels, in a DMatrix.
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=target.values)
    for features, target in ((Xtr, y_train), (Xva, y_val), (Xte, y_test))
)

# Booster hyper-parameters (native API names).
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "eta": 0.02,           # learning_rate
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 1.0,
    "alpha": 0.0,
    "seed": 42,
}

# Upper bound on boosting rounds, and how many rounds without improvement
# are tolerated before training stops.
num_boost_round = 5000
early_stopping_rounds = 200

# Validation is listed last: with several eval sets, xgb.train uses the last
# entry for early stopping.
evals = [(dtrain, "train"), (dval, "val")]

booster = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=False,
)

# Best boosting round found by early stopping (reused for the final refit).
best_iter = booster.best_iteration
print("Best iteration:", best_iter)


# =========================
# 4) Final refit on TRAIN+VAL with best_iter + 1 rounds
# =========================
# Stack the training and validation windows (rows appended, train first).
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])

# Re-fit the preprocessing on train+val so the final model and its
# transformer see the same data; the test window is only transformed.
Xtv = preprocess.fit_transform(X_trainval)
Xte_final = preprocess.transform(X_test)

dtrainval = xgb.DMatrix(Xtv, label=y_trainval.to_numpy())
dtest2    = xgb.DMatrix(Xte_final, label=y_test.to_numpy())

# best_iteration is a zero-based round index, so training best_iter + 1
# rounds reproduces the number of trees selected by early stopping.
final_rounds = int(best_iter) + 1

# No early stopping here: the round count is fixed from the previous step.
booster_final = xgb.train(
    params,
    dtrainval,
    num_boost_round=final_rounds,
    evals=[(dtrainval, "trainval")],
    verbose_eval=False,
)


# =========================
# 5) Eval + export preds
# =========================
def metrics(y_true, y_pred):
    """Compute regression scores for a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A ``(rmse, mae, r2)`` tuple of Python floats: root mean squared
        error, mean absolute error and coefficient of determination.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        float(np.sqrt(squared_error)),
        float(mean_absolute_error(y_true, y_pred)),
        float(r2_score(y_true, y_pred)),
    )

# Score the held-out test window with the final (train+val) booster.
pred_test = booster_final.predict(dtest2)

rmse, mae, r2 = metrics(y_test, pred_test)
print("\n=== TEST (2017+) ===")
for _label, _score in (("RMSE:", rmse), ("MAE :", mae), ("R2  :", r2)):
    print(_label, _score)

# One row per test observation: identifying keys plus observed and predicted
# target. `.values` drops the index so alignment is purely positional.
preds_test = X_test[["nom_dep", "year"]].assign(
    y_true=y_test.values,
    y_pred=pred_test,
)

# Write the same table in both CSV and Parquet form.
os.makedirs("outputs", exist_ok=True)
preds_test.to_csv("outputs/preds_test_2017plus.csv", index=False)
preds_test.to_parquet("outputs/preds_test_2017plus.parquet", index=False)
print("Saved predictions to outputs/")


# =========================
# 6) Save bundle (preprocess + booster_final)
# =========================
# Everything needed to score new data later: the preprocessing transformer
# (last fitted on train+val), the final booster, the expected input columns
# and their numeric/categorical split, plus the training configuration.
bundle = {
    "preprocess": preprocess,
    "booster": booster_final,
    "feature_columns": FEATURE_COLUMNS,
    "numeric_features": numeric_features,
    "categorical_features": categorical_features,
    "xgb_params": params,
    "best_iteration": int(best_iter),
}

# Create the folder here as well, so this step does not depend on the
# prediction export having run first.
os.makedirs("outputs", exist_ok=True)

joblib.dump(bundle, "outputs/xgb_yield_bundle.joblib")
print("Saved bundle: outputs/xgb_yield_bundle.joblib")

# The joblib bundle pickles the Booster object, and XGBoost does not promise
# that a pickled model can be reloaded by a different library version. Also
# write the model in XGBoost's native JSON format, which is the supported
# route for long-term storage (reload with xgb.Booster(model_file=...)).
booster_final.save_model("outputs/xgb_yield_booster.json")
print("Saved booster: outputs/xgb_yield_booster.json")

Train: (2803, 9) | Val: (346, 9) | Test: (187, 9)
Best iteration: 1796

=== TEST (2017+) ===
RMSE: 0.6897566700493316
MAE : 0.5710421754969632
R2  : 0.6853395372855011
Saved predictions to outputs/
Saved bundle: outputs/xgb_yield_bundle.joblib
