# Packages

In [18]:
import pandas as pd
import numpy as np

# Importation des donn√©es

In [19]:
df_stationary = pd.read_csv("df_stationary.csv", index_col="date")
df_stationary_test = pd.read_csv("df_stationary_test.csv", index_col="date")

In [20]:
df_stationary.columns

Index(['UNRATE', 'TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P500',
       'BUSLOANS', 'CPIAUCSL', 'OILPRICEx', 'M2SL', 'USREC'],
      dtype='object')

In [21]:
df_stationary_test.columns

Index(['UNRATE', 'TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P500',
       'BUSLOANS', 'CPIAUCSL', 'OILPRICEx', 'M2SL', 'USREC', 'UNRATE_lag12'],
      dtype='object')

# RIDGE

In [5]:
# ==========================================================
# üîπ Ridge Regression + Bagging (pseudo-OOS)
#    ‚Üí m√™me structure/artefacts que la version LinearRegression
# ==========================================================
import numpy as np
import pandas as pd
import pickle
from dateutil.relativedelta import relativedelta

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------- Param√®tres g√©n√©raux ----------
h = 12
min_train_n = 36           # ‚â• 3 ans avant de commencer √† pr√©voir
winsor_level = 0.01        # winsorisation (1er/99e percentiles)
norm_var = True            # normaliser ou non
target_col = "UNRATE"      # cible dans df_stationary

# S√©lection d'alpha : "cv" (CV 5-fold) OU une valeur float (ex: 1.0)
alpha_mode = "cv"
alpha_grid = np.logspace(-4, 4, 30)   # utilis√© si alpha_mode="cv"

# Fen√™tres d'√©valuation / test
eval_start = pd.Timestamp("1983-01-01")
eval_end   = pd.Timestamp("1989-12-31")
test_start = pd.Timestamp("1990-01-01")
test_end   = pd.Timestamp("2025-12-31")   # ajuste si besoin

# ---------- Bagging (bootstrap en blocs) ----------
use_bagging = True
B_boot = 30               # comme les auteurs
L_block = 12              # blocs annuels (12 mois)
rng = np.random.default_rng(123)  # seed bootstrap

# ---------- Fichiers de sortie ----------
RIDGE_PKL  = "ridge_regression.pkl"        # bundle (dict)
RIDGE_META = "ridge_regression_meta.csv"   # m√©ta r√©sum√©

# ---------- Pr√©paration df_stationary ----------
def _ensure_ms_index(df):
    """Force un index DatetimeIndex en d√©but de mois (MS)."""
    if "date" in df.columns:
        df = df.set_index("date")
    idx = pd.to_datetime(df.index)
    df = df.copy()
    df.index = idx.to_period("M").to_timestamp(how="start")
    return df.asfreq("MS")

# On part de df_stationary (toutes donn√©es : 1960‚Üí2025), d√©j√† charg√© en m√©moire
df_all = _ensure_ms_index(df_stationary).sort_index()

if target_col not in df_all.columns:
    raise ValueError(f"La colonne cible '{target_col}' est absente de df_stationary.")

y_all = df_all[target_col].astype(float)
X_all = df_all.drop(columns=[target_col]).astype(float)
features = list(X_all.columns)

print(f"‚úÖ Donn√©es pr√™tes : {df_all.index.min().date()} ‚Üí {df_all.index.max().date()} | n={len(df_all)} | freq=MS")
print(f"Features ({len(features)}): {features[:6]}{' ...' if len(features)>6 else ''}")

# ---------- Pr√©proc ----------
def fit_preproc(X, wins=0.01, do_norm=True):
    """Apprend winsor + normalisation sur TRAIN et renvoie (X_trans, prep)."""
    lower = X.quantile(wins)
    upper = X.quantile(1 - wins)
    Xw = X.clip(lower=lower, upper=upper, axis=1)
    if do_norm:
        mean = Xw.mean()
        std  = Xw.std().replace(0, 1)
        Xn   = (Xw - mean) / std
        prep = {"lower": lower, "upper": upper, "mean": mean, "std": std, "norm": True}
        return Xn, prep
    else:
        prep = {"lower": lower, "upper": upper, "mean": None, "std": None, "norm": False}
        return Xw, prep

def apply_preproc(X, prep):
    """Applique le pr√©proc appris (pas de fuite)."""
    Xp = X.clip(lower=prep["lower"], upper=prep["upper"], axis=1)
    if prep["norm"]:
        Xp = (Xp - prep["mean"]) / prep["std"].replace(0, 1)
    return Xp

# ---------- S√©lection d'alpha ----------
def select_alpha(X_tr_p, y_tr, mode="cv"):
    """Renvoie (alpha, cv_mae) si mode='cv', sinon (alpha, np.nan)."""
    if mode == "cv":
        model = Ridge(fit_intercept=True)
        grid = GridSearchCV(
            model,
            {"alpha": alpha_grid},
            scoring="neg_mean_absolute_error",
            cv=5,
            n_jobs=-1,
        )
        grid.fit(X_tr_p, y_tr.values)
        best_alpha = float(grid.best_estimator_.alpha)
        cv_mae = float(-grid.best_score_)
        return best_alpha, cv_mae
    else:
        # mode = valeur fixe (float)
        try:
            a = float(mode)
        except Exception as e:
            raise ValueError(f"alpha_mode doit √™tre 'cv' ou un float. Re√ßu: {mode}") from e
        return a, np.nan

# ---------- Bootstrap utils ----------
def block_bootstrap_rows(index, L, rng):
    """
    Moving-block bootstrap sur index (positions).
    Renvoie un array d'indices (longueur = n).
    """
    n = len(index)
    if n < 3:
        return np.arange(n)  # fallback
    L = max(2, min(int(L), n-1))
    nb = int(np.ceil(n / L))
    starts = rng.integers(0, n - L + 1, size=nb)
    ix = np.concatenate([np.arange(s, s+L) for s in starts])[:n]
    return ix

def bagged_predict_ridge(X_tr_raw, y_tr, x_fore_raw, prep, alpha, B, L, rng):
    """
    Bagging (moving-block bootstrap) pour Ridge :
      - pr√©proc fix√© sur TRAIN original (pas r√©-appris)
      - r√©√©chantillon par blocs (lignes) (X, y)
      - fit et pr√©diction h
      - renvoie (moyenne, distribution compl√®te, base_pred)
    """
    # Base fit (r√©f√©rence)
    X_tr_p = apply_preproc(X_tr_raw, prep)
    base = Ridge(alpha=alpha, fit_intercept=True)
    base.fit(X_tr_p, y_tr.values)
    yhat_base = float(base.predict(apply_preproc(x_fore_raw, prep))[0])

    preds = []
    for _ in range(B):
        ix = block_bootstrap_rows(X_tr_raw.index, L, rng)
        Xb = X_tr_raw.iloc[ix]
        yb = y_tr.iloc[ix]
        Xb_p = apply_preproc(Xb, prep)  # IMPORTANT: m√™me prep
        m = Ridge(alpha=alpha, fit_intercept=True)
        m.fit(Xb_p, yb.values)
        preds.append(float(m.predict(apply_preproc(x_fore_raw, prep))[0]))
    return float(np.mean(preds)), np.array(preds), yhat_base

# ---------- Boucle pseudo-OOS ----------
rows = []                 # (date, y_pred, y_true, y_pred_base, p05, p95)
models = []               # stockage dernier fit (optionnel)
preprocs = []             # stockage prep (optionnel)
train_ends = []           # dates de fin train (pour trace)
alpha_history = []        # alpha utilis√© par fen√™tre
cv_mae_history = []       # MAE CV (si mode="cv"), sinon NaN

last_t_end = y_all.index.max() - relativedelta(months=h)
last_model = None
last_fit_end = None

for t_end in y_all.index:
    if t_end > last_t_end:
        break

    y_tr = y_all.loc[:t_end]
    X_tr = X_all.loc[:t_end]
    if len(y_tr) < min_train_n:
        continue

    # Pr√©proc appris sur TRAIN courant
    X_tr_p, prep = fit_preproc(X_tr, wins=winsor_level, do_norm=norm_var)

    # Choix alpha (CV ou fixe)
    alpha, cv_mae = select_alpha(X_tr_p, y_tr, mode=alpha_mode)
    alpha_history.append(alpha)
    cv_mae_history.append(cv_mae)

    # Horizon cibl√©
    t_fore = t_end + relativedelta(months=h)
    if t_fore in y_all.index:
        x_fore_raw = X_all.loc[[t_fore]]

        if use_bagging:
            # (Option) reseed par mois : rng = np.random.default_rng(int(t_end.strftime("%Y%m")))
            yhat_h, dist, yhat_base = bagged_predict_ridge(
                X_tr_raw=X_tr, y_tr=y_tr, x_fore_raw=x_fore_raw,
                prep=prep, alpha=alpha, B=B_boot, L=L_block, rng=rng
            )
            y_p05 = float(np.percentile(dist, 5))
            y_p95 = float(np.percentile(dist, 95))
        else:
            model_tmp = Ridge(alpha=alpha, fit_intercept=True)
            model_tmp.fit(X_tr_p, y_tr.values)
            yhat_h = float(model_tmp.predict(apply_preproc(x_fore_raw, prep))[0])
            yhat_base = yhat_h
            y_p05, y_p95 = (np.nan, np.nan)

        rows.append((t_fore, yhat_h, float(y_all.loc[t_fore]), yhat_base, y_p05, y_p95))

    # trace / dernier mod√®le base (utile pour sauvegarde)
    last_model = Ridge(alpha=alpha, fit_intercept=True).fit(X_tr_p, y_tr.values)
    last_fit_end = t_end
    models.append(last_model)
    preprocs.append(prep)
    train_ends.append(t_end)

# ---------- DataFrame OOS ----------
if rows:
    df_oos = (
        pd.DataFrame(rows, columns=["date", "y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"])
          .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
          .set_index("date").sort_index()
    )
else:
    df_oos = pd.DataFrame(columns=["y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"])
    df_oos.index = pd.to_datetime(pd.Index([]))

print(f"\n‚úÖ Pseudo-OOS (Ridge) termin√© ‚Äî n pr√©visions = {len(df_oos)}")
print(df_oos.head(3))

# ---------- Scores ----------
def _scores(df):
    if len(df) == 0:
        return {"MAE": np.nan, "RMSE": np.nan, "R2": np.nan}
    mae  = mean_absolute_error(df["y_true"], df["y_pred"])
    rmse = np.sqrt(mean_squared_error(df["y_true"], df["y_pred"]))
    r2   = r2_score(df["y_true"], df["y_pred"]) if len(df) > 1 else np.nan
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}

df_val  = df_oos.loc[eval_start:eval_end].copy()
df_test = df_oos.loc[test_start:test_end].copy()

sc_val  = _scores(df_val)
sc_test = _scores(df_test)

print(f"\nüìä Validation 83‚Äì89 ‚Äî n={len(df_val)} | MAE={sc_val['MAE']:.3f} | RMSE={sc_val['RMSE']:.3f} | R¬≤={sc_val['R2']:.3f}")
print(f"üìä Test 90‚Äì2025 ‚Äî n={len(df_test)} | MAE={sc_test['MAE']:.3f} | RMSE={sc_test['RMSE']:.3f} | R¬≤={sc_test['R2']:.3f}")

# (option) Comparaison bagging vs base
if "y_pred_base" in df_oos and df_oos["y_pred_base"].notna().any():
    mae_bag  = mean_absolute_error(df_oos["y_true"], df_oos["y_pred"])
    mae_base = mean_absolute_error(df_oos["y_true"], df_oos["y_pred_base"])
    print(f"‚û°Ô∏è  Gain bagging (ŒîMAE) = {mae_base - mae_bag:.3f}")

# ---------- Sauvegardes ----------
bundle = {
    "oos_predictions": df_oos.reset_index(),     # (date, y_pred, y_true, y_pred_base, y_pred_p05, y_pred_p95)
    "params": {
        "model": "Ridge",
        "horizon": h,
        "min_train_n": min_train_n,
        "winsor_level": winsor_level,
        "norm_var": norm_var,
        "features": features,
        "eval_window": (str(eval_start.date()), str(eval_end.date())),
        "test_window": (str(test_start.date()), str(test_end.date())),
        # ---- bagging ----
        "use_bagging": bool(use_bagging),
        "B_boot": int(B_boot),
        "L_block": int(L_block),
        # ---- alpha ----
        "alpha_mode": alpha_mode,
        "alpha_grid": list(alpha_grid) if alpha_mode == "cv" else None,
        "best_alpha_last": (alpha_history[-1] if len(alpha_history) else None),
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "index_freq": "MS",
        "n_obs_all": int(len(df_all)),
        "n_forecasts": int(len(df_oos)),
        "alpha_history": alpha_history,
        "cv_mae_history": cv_mae_history,
    },
    "train_fit_dates": pd.to_datetime(pd.Index(train_ends)),

    # ‚úÖ Pour permutation_importance_pseudo_oos & SHAP
    "models":   models,     # liste des mod√®les Ridge (un par fen√™tre)
    "preprocs": preprocs,   # liste des pr√©proc (dict) align√©s aux mod√®les
}

# --- Sauvegarde du bundle complet ---
with open(RIDGE_PKL, "wb") as f:
    pickle.dump(bundle, f)

# --- Sauvegarde du r√©sum√© m√©ta s√©par√© (lisible rapidement) ---
pd.DataFrame([{
    "model": "Ridge",
    "horizon": h,
    "min_train_n": min_train_n,
    "winsor_level": winsor_level,
    "norm_var": norm_var,
    "use_bagging": bool(use_bagging),
    "B_boot": int(B_boot),
    "L_block": int(L_block),
    "alpha_mode": alpha_mode,
    "best_alpha_last": bundle["params"]["best_alpha_last"],
    "trained_until": bundle["meta"]["trained_until"],
    "n_forecasts": bundle["meta"]["n_forecasts"],
}]).to_csv(RIDGE_META, index=False)

print(f"\nüíæ Bundle sauvegard√© ‚Üí {RIDGE_PKL}")
print(f"üíæ M√©ta sauvegard√©e ‚Üí {RIDGE_META}")
print(f"üì¶ Contenu du bundle : {list(bundle.keys())}")

‚úÖ Donn√©es pr√™tes : 1960-01-01 ‚Üí 2025-08-01 | n=788 | freq=MS
Features (10): ['TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P500', 'BUSLOANS'] ...

‚úÖ Pseudo-OOS (Ridge) termin√© ‚Äî n pr√©visions = 741
              y_pred  y_true  y_pred_base  y_pred_p05  y_pred_p95
date                                                             
1963-12-01  0.093785     0.0    -0.181263   -0.455216    1.220200
1964-01-01  0.348080    -0.1    -0.071449   -0.302035    1.165286
1964-02-01  0.390935    -0.5     0.058525   -0.508924    1.117637

üìä Validation 83‚Äì89 ‚Äî n=84 | MAE=0.819 | RMSE=1.025 | R¬≤=-0.346
üìä Test 90‚Äì2025 ‚Äî n=428 | MAE=0.821 | RMSE=1.464 | R¬≤=0.080
‚û°Ô∏è  Gain bagging (ŒîMAE) = -0.012

üíæ Bundle sauvegard√© ‚Üí ridge_regression.pkl
üíæ M√©ta sauvegard√©e ‚Üí ridge_regression_meta.csv
üì¶ Contenu du bundle : ['oos_predictions', 'params', 'meta', 'train_fit_dates', 'models', 'preprocs']


# LightGBM

In [6]:
# ==========================================================
# LightGBM + Bagging (pseudo-OOS, h=12) ‚Äî structure "comme Ridge"
# - Refit annuel, retune hyperparams tous les 36 mois (d√®s 1983)
# - hv-block CV (5 folds, gap=12), scoring=MAE
# - Winsorisation 1%/99% + normalisation (apprises sur TRAIN)
# - Bagging : n_boot bootstrap (proportions/indices), moyenne des pr√©dictions
# - Bundle complet : oos_predictions + models + preprocs + train_fit_dates
# ==========================================================
import numpy as np
import pandas as pd
import pickle
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from lightgbm import LGBMRegressor

# ---------- Param√®tres g√©n√©raux ----------
h = 12
min_train_n = 36
winsor_level = 0.01
norm_var = True
target_col = "UNRATE"

# Fen√™tres d‚Äô√©val / test
eval_start = pd.Timestamp("1983-01-01")
eval_end   = pd.Timestamp("1989-12-31")
test_start = pd.Timestamp("1990-01-01")
test_end   = pd.Timestamp("2025-12-31")

# ---------- Refit/Retune ----------
refit_every_months  = 12     # refit chaque 12 mois
retune_every_months = 36     # retune hyperparams chaque 36 mois

# ---------- Bagging ----------
use_bagging = True
n_boot = 30
bootstrap_proportion = 1.0
rng = np.random.default_rng(12345)

# ---------- Fichiers de sortie ----------
LGBM_BUNDLE     = "lightgbm_regression.pkl"
LGBM_META       = "lightgbm_regression_meta.csv"
LGBM_LAST_PKL   = "LGBM_last_trained_model.pkl"
LGBM_LAST_META  = "LGBM_last_trained_model_meta.csv"

In [7]:
# ---------- Pr√©paration df_stationary ----------
def _ensure_ms_index(df: pd.DataFrame) -> pd.DataFrame:
    """Index mensuel (MS). Si 'date' existe, on l'utilise comme index."""
    df = df.copy()
    if "date" in df.columns:
        df = df.set_index("date")
    idx = pd.to_datetime(df.index)
    df.index = idx.to_period("M").to_timestamp(how="start")
    return df.asfreq("MS")

# ‚ö†Ô∏è On suppose df_stationary dispo
df_all = _ensure_ms_index(df_stationary).sort_index()

if target_col not in df_all.columns:
    raise ValueError(f"La cible '{target_col}' est absente de df_stationary.")

y_all = df_all[target_col].astype(float)
X_all = df_all.drop(columns=[target_col]).astype(float)

# LightGBM : √©viter espaces dans noms de colonnes
X_all.columns = [str(c).replace(" ", "_") for c in X_all.columns]
features = list(X_all.columns)

print(f"‚úÖ Donn√©es pr√™tes : {df_all.index.min().date()} ‚Üí {df_all.index.max().date()} | n={len(df_all)} | freq=MS")
print(f"Features ({len(features)}): {features[:6]}{' ...' if len(features)>6 else ''}")

# ---------- Pr√©proc ----------
def fit_preproc(X: pd.DataFrame, wins=0.01, do_norm=True):
    lower = X.quantile(wins)
    upper = X.quantile(1 - wins)
    Xw = X.clip(lower=lower, upper=upper, axis=1)
    if do_norm:
        mean = Xw.mean()
        std  = Xw.std().replace(0, 1)
        Xn   = (Xw - mean) / std
        prep = {"lower": lower, "upper": upper, "mean": mean, "std": std, "norm": True}
        return Xn, prep
    else:
        prep = {"lower": lower, "upper": upper, "mean": None, "std": None, "norm": False}
        return Xw, prep

def apply_preproc(X: pd.DataFrame, prep: dict):
    Xp = X.clip(lower=prep["lower"], upper=prep["upper"], axis=1)
    if prep["norm"]:
        Xp = (Xp - prep["mean"]) / prep["std"].replace(0, 1)
    return Xp

‚úÖ Donn√©es pr√™tes : 1960-01-01 ‚Üí 2025-08-01 | n=788 | freq=MS
Features (10): ['TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P500', 'BUSLOANS'] ...


In [8]:
# ---------- hv-block CV ----------
class HVBlockCV:
    def __init__(self, n_splits=5, gap=12):
        self.n_splits = n_splits
        self.gap = gap
    def split(self, X, y=None, groups=None):
        n = len(X)
        fold_sizes = np.full(self.n_splits, n // self.n_splits, dtype=int)
        fold_sizes[: n % self.n_splits] += 1
        idx = np.arange(n)
        cur = 0
        for fs in fold_sizes:
            start, stop = cur, cur + fs
            test_idx = idx[start:stop]
            train_mask = np.ones(n, dtype=bool)
            left = max(0, start - self.gap)
            right = min(n, stop + self.gap)
            train_mask[left:right] = False
            train_idx = idx[train_mask]
            cur = stop
            if len(train_idx) == 0 or len(test_idx) == 0:
                continue
            yield train_idx, test_idx
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

In [9]:
# ---------- Grille LightGBM (stabilit√©) ----------
param_dist = {
    "subsample":        [0.05,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0],
    "colsample_bytree": [.2,.3,.4,.5,.6,.7,1.0],
    "num_leaves":       [2,3,4,5,8,10,20,40,70,100],
    "n_estimators":     [5,10,20,30,40,50,75,100],
    "max_depth":        [1,2,3,5,8,15,-1],          # -1 = illimit√©
    "reg_alpha":        [0, .1, 1, 2, 7, 10, 50, 100],
    "reg_lambda":       [0, .1, 1, 10, 20, 50, 100],
    "min_child_samples":[5,10,15],
    "min_split_gain":   [0.0, 0.01, 0.05],
}

In [10]:
def tune_lgbm(X_tr_p: pd.DataFrame, y_tr: np.ndarray, seed=12345):
    cv = HVBlockCV(n_splits=5, gap=12)
    base = LGBMRegressor(
        boosting_type="gbdt",
        objective="regression",
        importance_type="gain",
        random_state=seed,
        n_jobs=1,    # √©viter sur-parall√©lisation quand RandomizedSearchCV utilise n_jobs=-1
        verbose=-1
    )
    rs = RandomizedSearchCV(
        estimator=base,
        param_distributions=param_dist,
        n_iter=100,
        scoring="neg_mean_absolute_error",
        cv=cv,
        n_jobs=-1,
        random_state=seed,
        refit=True,
        verbose=0
    )
    rs.fit(X_tr_p, y_tr)
    return rs.best_params_, float(-rs.best_score_), rs.best_estimator_

In [11]:
# ---------- Bootstrap utils ----------
def bootstrap_indices(n, proportion=1.0, seed=None):
    m = int(round(n * proportion))
    rng_local = np.random.default_rng(seed)
    return rng_local.integers(0, n, size=m, endpoint=False)

def bagged_predict_lgbm(X_tr_raw, y_tr, x_fore_raw, prep, best_params, B, proportion, seed0):
    """
    Bagging LightGBM :
      - pr√©proc fixe (appris sur TRAIN original)
      - bootstrap simple d'indices (proportion)
      - fit LGBM(**best_params) et pr√©diction
      - retourne la moyenne + distribution + pr√©diction base (fit sur tout TRAIN)
    """
    # Base (r√©f√©rence) : fit sur tout TRAIN pr√©trait√©
    X_tr_p = apply_preproc(X_tr_raw, prep)
    base = LGBMRegressor(
        boosting_type="gbdt",
        objective="regression",
        importance_type="gain",
        n_jobs=1,
        random_state=seed0,
        verbose=-1,
        **best_params
    )
    base.fit(X_tr_p, y_tr.values)
    yhat_base = float(base.predict(apply_preproc(x_fore_raw, prep))[0])

    preds = []
    n = len(X_tr_raw)
    for b in range(B):
        ix = bootstrap_indices(n, proportion=proportion, seed=seed0 + b)
        Xb = X_tr_raw.iloc[ix]
        yb = y_tr.iloc[ix]
        Xb_p, _ = fit_preproc(Xb, wins=winsor_level, do_norm=norm_var)  # pr√©proc appris sur bootstrap
        m = LGBMRegressor(
            boosting_type="gbdt",
            objective="regression",
            importance_type="gain",
            n_jobs=1,
            random_state=seed0 + b,
            verbose=-1,
            **best_params
        )
        m.fit(Xb_p, yb.values)
        preds.append(float(m.predict(apply_preproc(x_fore_raw, _))[0]))  # appliquer le prep du bootstrap (_)

    preds = np.array(preds)
    return float(np.mean(preds)), preds, yhat_base

In [12]:
# ---------- Boucle pseudo-OOS ----------
rows = []                 # (date, y_pred, y_true, y_pred_base, p05, p95)
models = []               # mod√®les "base" LGBM par refit (pour permutation/SHAP)
preprocs = []             # pr√©procs align√©s
train_ends = []           # dates de refit
cv_mae_history = []       # historique MAE CV lors des retunes
best_params_hist = []     # historique des params

last_fit_end = None
last_t_end   = y_all.index.max() - relativedelta(months=h)
last_refit_t = None
last_tune_t  = None

best_params = {}          # params courants (remplis au 1er retune)
base_model = None         # s√©curit√© si use_bagging=False
boot_seed = 12345         # reseed √† chaque refit
seed0 = 12345

for t_end in y_all.index:
    if t_end > last_t_end:
        break

    y_tr = y_all.loc[:t_end]
    X_tr = X_all.loc[:t_end]
    if len(y_tr) < min_train_n:
        continue

    # cadence refit
    if last_refit_t is None:
        need_refit = True
    else:
        months_since_refit = (t_end.year - last_refit_t.year)*12 + (t_end.month - last_refit_t.month)
        need_refit = months_since_refit >= refit_every_months

    # cadence retune
    need_tune = False
    if t_end >= eval_start:
        if last_tune_t is None:
            need_tune = True
        else:
            months_since_tune = (t_end.year - last_tune_t.year)*12 + (t_end.month - last_tune_t.month)
            need_tune = months_since_tune >= retune_every_months

    # Pr√©proc global (pour tuning/refit)
    X_tr_p_global, prep_global = fit_preproc(X_tr, wins=winsor_level, do_norm=norm_var)

    if need_tune:
        best_params, best_cv_mae, _ = tune_lgbm(X_tr_p_global, y_tr.values, seed=seed0)
        last_tune_t = t_end
        cv_mae_history.append(best_cv_mae)
        best_params_hist.append(best_params.copy())
    else:
        cv_mae_history.append(np.nan)
        best_params_hist.append(best_params.copy() if best_params else {})

    # Valeurs s√ªres si aucun retune n'a encore eu lieu
    if not best_params:
        best_params = dict(
            subsample=0.7, colsample_bytree=0.7, num_leaves=31,
            n_estimators=100, max_depth=-1, reg_alpha=0.0, reg_lambda=0.0,
            min_child_samples=10, min_split_gain=0.0
        )

    if need_refit:
        # Mod√®le "base" stock√© pour permutation/SHAP
        base_model = LGBMRegressor(
            boosting_type="gbdt",
            objective="regression",
            importance_type="gain",
            n_jobs=1,
            random_state=seed0,
            verbose=-1,
            **best_params
        )
        base_model.fit(X_tr_p_global, y_tr.values)
        models.append(base_model)
        preprocs.append(prep_global)

        train_ends.append(t_end)
        last_refit_t = t_end
        last_fit_end = t_end

        # reseed bagging √† chaque refit
        boot_seed += 9973

    # Pr√©vision h=12
    t_fore = t_end + relativedelta(months=h)
    if t_fore in y_all.index:
        x_fore_raw = X_all.loc[[t_fore]]

        if use_bagging:
            yhat, dist, yhat_base = bagged_predict_lgbm(
                X_tr_raw=X_tr, y_tr=y_tr, x_fore_raw=x_fore_raw,
                prep=prep_global, best_params=best_params,
                B=n_boot, proportion=bootstrap_proportion, seed0=boot_seed
            )
            y_p05 = float(np.percentile(dist, 5))
            y_p95 = float(np.percentile(dist, 95))
        else:
            # sans bagging : utiliser / garantir un base_model
            if base_model is None:
                base_model = LGBMRegressor(
                    boosting_type="gbdt",
                    objective="regression",
                    importance_type="gain",
                    n_jobs=1,
                    random_state=seed0,
                    verbose=-1,
                    **best_params
                ).fit(X_tr_p_global, y_tr.values)
            yhat = float(base_model.predict(apply_preproc(x_fore_raw, prep_global))[0])
            yhat_base = yhat
            y_p05, y_p95 = (np.nan, np.nan)

        rows.append((t_fore, yhat, float(y_all.loc[t_fore]), yhat_base, y_p05, y_p95))

# ---------- DataFrame OOS ----------
if rows:
    df_oos = pd.DataFrame(
        rows,
        columns=["date", "y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"]
    )
    # ‚úÖ Conversion date ‚Üí d√©but de mois (MS)
    df_oos["date"] = pd.to_datetime(df_oos["date"]).dt.to_period("M").dt.to_timestamp(how="start")
    df_oos = df_oos.set_index("date").sort_index()
else:
    df_oos = pd.DataFrame(columns=["y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"])
    df_oos.index = pd.to_datetime(pd.Index([]))

# ---------- Scores ----------
def _scores(df):
    if len(df) == 0:
        return {"MAE": np.nan, "RMSE": np.nan, "R2": np.nan}
    mae  = mean_absolute_error(df["y_true"], df["y_pred"])
    rmse = np.sqrt(mean_squared_error(df["y_true"], df["y_pred"]))
    ssr  = np.sum((df["y_true"] - df["y_pred"])**2)
    sst  = np.sum((df["y_true"] - df["y_true"].mean())**2)
    r2   = 1 - ssr/sst if sst > 0 else np.nan
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}

df_val  = df_oos.loc[eval_start:eval_end].copy()
df_test = df_oos.loc[test_start:test_end].copy()

sc_val  = _scores(df_val)
sc_test = _scores(df_test)

print(f"\nüìä Validation 83‚Äì89 ‚Äî n={len(df_val)} | MAE={sc_val['MAE']:.3f} | RMSE={sc_val['RMSE']:.3f} | R¬≤={sc_val['R2']:.3f}")
print(f"üìä Test 90‚Äì{test_end.year} ‚Äî n={len(df_test)} | MAE={sc_test['MAE']:.3f} | RMSE={sc_test['RMSE']:.3f} | R¬≤={sc_test['R2']:.3f}")


üìä Validation 83‚Äì89 ‚Äî n=84 | MAE=0.788 | RMSE=1.040 | R¬≤=-0.385
üìä Test 90‚Äì2025 ‚Äî n=428 | MAE=0.777 | RMSE=1.379 | R¬≤=0.183


In [13]:
# ---------- Sauvegardes ----------
bundle = {
    "oos_predictions": df_oos.reset_index(),
    "params": {
        "model": "LightGBM + Bagging",
        "horizon": h,
        "min_train_n": min_train_n,
        "winsor_level": winsor_level,
        "norm_var": norm_var,
        "features": features,
        "eval_window": (str(eval_start.date()), str(eval_end.date())),
        "test_window": (str(test_start.date()), str(test_end.date())),
        # plan refit/retune
        "refit_every_months": refit_every_months,
        "retune_every_months": retune_every_months,
        # search
        "hyper_search": "RandomizedSearchCV (100 iters) + hv-block CV (5 folds, gap=12), scoring=MAE",
        "best_params_last": best_params.copy(),
        # bagging
        "use_bagging": bool(use_bagging),
        "n_boot": int(n_boot),
        "bootstrap_proportion": float(bootstrap_proportion),
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "index_freq": "MS",
        "n_obs_all": int(len(df_all)),
        "n_forecasts": int(len(df_oos)),
        "cv_mae_history": cv_mae_history,
        "best_params_history": best_params_hist
    },
    "train_fit_dates": pd.to_datetime(pd.Index(train_ends)),

    # ‚úÖ Pour permutation_importance_pseudo_oos & SHAP
    "models":   models,     # liste des mod√®les "base" LGBM (un par refit)
    "preprocs": preprocs,   # liste des pr√©proc (dict) align√©s aux mod√®les
}

with open(LGBM_BUNDLE, "wb") as f:
    pickle.dump(bundle, f)

pd.DataFrame([{
    "model": "LightGBM+Bagging",
    "horizon": h,
    "min_train_n": min_train_n,
    "winsor_level": winsor_level,
    "norm_var": norm_var,
    "refit_every_months": refit_every_months,
    "retune_every_months": retune_every_months,
    "use_bagging": bool(use_bagging),
    "n_boot": int(n_boot),
    "bootstrap_proportion": float(bootstrap_proportion),
    "trained_until": bundle["meta"]["trained_until"],
    "n_forecasts": bundle["meta"]["n_forecasts"]
}]).to_csv(LGBM_META, index=False)

# Artefact dernier ensemble (optionnel : pour audit)
lgbm_artifact = {
    "trained_until": bundle["meta"]["trained_until"],
    "horizon": h,
    "features": features,
    "n_models_base": len(models),
    "best_params_last": best_params.copy(),
}
with open(LGBM_LAST_PKL, "wb") as f:
    pickle.dump(lgbm_artifact, f)
pd.DataFrame([{
    "trained_until": lgbm_artifact["trained_until"],
    "n_features": len(features),
    "horizon": h,
    "n_models_base": lgbm_artifact["n_models_base"]
}]).to_csv(LGBM_LAST_META, index=False)

print(f"\nüíæ Bundle OOS sauvegard√© ‚Üí {LGBM_BUNDLE}")
print(f"üíæ M√©ta bundle       ‚Üí {LGBM_META}")
print(f"üíæ Dernier mod√®le    ‚Üí {LGBM_LAST_PKL}")
print(f"üíæ M√©ta dernier fit  ‚Üí {LGBM_LAST_META}")


üíæ Bundle OOS sauvegard√© ‚Üí lightgbm_regression.pkl
üíæ M√©ta bundle       ‚Üí lightgbm_regression_meta.csv
üíæ Dernier mod√®le    ‚Üí LGBM_last_trained_model.pkl
üíæ M√©ta dernier fit  ‚Üí LGBM_last_trained_model_meta.csv


# LightGBM avec taux de ch√¥mage en retards

In [14]:
# ==========================================================
# üîπ LightGBM + Bagging en blocs (pseudo-OOS, h=12)
#    ‚Üí variante AVEC RETARDS (ajout y_{t-h})
#    ‚Üí artefacts tagg√©s: with_<target>_lags_h<h>
# ==========================================================
import numpy as np
import pandas as pd
import pickle
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from lightgbm import LGBMRegressor

# ---------- Param√®tres g√©n√©raux ----------
h = 12
min_train_n = 36
winsor_level = 0.01
norm_var = True
target_col = "UNRATE"

# Fen√™tres d‚Äô√©val / test
eval_start = pd.Timestamp("1983-01-01")
eval_end   = pd.Timestamp("1989-12-31")
test_start = pd.Timestamp("1990-01-01")
test_end   = pd.Timestamp("2025-12-31")

# ---------- Refit/Retune ----------
refit_every_months  = 12     # refit chaque 12 mois
retune_every_months = 36     # retune hyperparams chaque 36 mois

# ---------- Bagging (moving-block bootstrap, comme Ridge) ----------
use_bagging = True
B_boot = 30                 # nb. de bootstraps
L_block = 12                # longueur de bloc (mois)
rng = np.random.default_rng(12345)

# ---------- Fichiers de sortie (VARIANTE AVEC RETARDS) ----------
NAME_TAG = f"with_{target_col}_lags_h{h}"   # ex: with_UNRATE_lags_h12
LGBM_LAGS_BUNDLE    = f"lightgbm_regression__{NAME_TAG}.pkl"
LGBM_LAGS_META      = f"lightgbm_regression_meta__{NAME_TAG}.csv"
LGBM_LAGS_LAST_PKL  = f"LGBM_last_trained_model__{NAME_TAG}.pkl"
LGBM_LAGS_LAST_META = f"LGBM_last_trained_model_meta__{NAME_TAG}.csv"

# ---------- Pr√©paration df_stationary ----------
def _ensure_ms_index(df: pd.DataFrame) -> pd.DataFrame:
    """Index mensuel (MS). Si 'date' existe, on l'utilise comme index."""
    df = df.copy()
    if "date" in df.columns:
        df = df.set_index("date")
    idx = pd.to_datetime(df.index)
    df.index = idx.to_period("M").to_timestamp(how="start")
    return df.asfreq("MS")

# ‚ö†Ô∏è On suppose df_stationary dispo
df_all = _ensure_ms_index(df_stationary).sort_index()

if target_col not in df_all.columns:
    raise ValueError(f"La cible '{target_col}' est absente de df_stationary.")

y_all = df_all[target_col].astype(float)
X_all = df_all.drop(columns=[target_col]).astype(float)

# --- Ajout du lag h (=12) de la cible comme variable explicative ---
lag_feat_name = f"{target_col}_lag{h}"   # ex: 'UNRATE_lag12'
X_all[lag_feat_name] = y_all.shift(h)

# LightGBM : √©viter espaces dans noms de colonnes
X_all.columns = [str(c).replace(" ", "_") for c in X_all.columns]
features = list(X_all.columns)

print(f"Ajout feature: {lag_feat_name} (y_(t-h)) ‚Üí OK")
print(f"‚úÖ Donn√©es pr√™tes : {df_all.index.min().date()} ‚Üí {df_all.index.max().date()} | n={len(df_all)} | freq=MS")
print(f"Features ({len(features)}): {features[:6]}{' ...' if len(features)>6 else ''}")

# ---------- Pr√©proc ----------
def fit_preproc(X: pd.DataFrame, wins=0.01, do_norm=True):
    lower = X.quantile(wins)
    upper = X.quantile(1 - wins)
    Xw = X.clip(lower=lower, upper=upper, axis=1)
    if do_norm:
        mean = Xw.mean()
        std  = Xw.std().replace(0, 1)
        Xn   = (Xw - mean) / std
        prep = {"lower": lower, "upper": upper, "mean": mean, "std": std, "norm": True}
        return Xn, prep
    else:
        prep = {"lower": lower, "upper": upper, "mean": None, "std": None, "norm": False}
        return Xw, prep

def apply_preproc(X: pd.DataFrame, prep: dict):
    Xp = X.clip(lower=prep["lower"], upper=prep["upper"], axis=1)
    if prep["norm"]:
        Xp = (Xp - prep["mean"]) / prep["std"].replace(0, 1)
    return Xp

# ---------- hv-block CV ----------
class HVBlockCV:
    def __init__(self, n_splits=5, gap=12):
        self.n_splits = n_splits
        self.gap = gap
    def split(self, X, y=None, groups=None):
        n = len(X)
        fold_sizes = np.full(self.n_splits, n // self.n_splits, dtype=int)
        fold_sizes[: n % self.n_splits] += 1
        idx = np.arange(n)
        cur = 0
        for fs in fold_sizes:
            start, stop = cur, cur + fs
            test_idx = idx[start:stop]
            train_mask = np.ones(n, dtype=bool)
            left = max(0, start - self.gap)
            right = min(n, stop + self.gap)
            train_mask[left:right] = False
            train_idx = idx[train_mask]
            cur = stop
            if len(train_idx) == 0 or len(test_idx) == 0:
                continue
            yield train_idx, test_idx
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

# ---------- Grille LightGBM ----------
param_dist = {
    "subsample":        [0.05,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0],
    "colsample_bytree": [.2,.3,.4,.5,.6,.7,1.0],
    "num_leaves":       [2,3,4,5,8,10,20,40,70,100],
    "n_estimators":     [5,10,20,30,40,50,75,100],
    "max_depth":        [1,2,3,5,8,15,-1],
    "reg_alpha":        [0, .1, 1, 2, 7, 10, 50, 100],
    "reg_lambda":       [0, .1, 1, 10, 20, 50, 100],
    "min_child_samples":[5,10,15],
    "min_split_gain":   [0.0, 0.01, 0.05],
}

def tune_lgbm(X_tr_p: pd.DataFrame, y_tr: np.ndarray, seed=12345):
    cv = HVBlockCV(n_splits=5, gap=12)
    base = LGBMRegressor(
        boosting_type="gbdt",
        objective="regression",
        importance_type="gain",
        random_state=seed,
        n_jobs=1,
        verbose=-1
    )
    rs = RandomizedSearchCV(
        estimator=base,
        param_distributions=param_dist,
        n_iter=100,
        scoring="neg_mean_absolute_error",
        cv=cv,
        n_jobs=-1,
        random_state=seed,
        refit=True,
        verbose=0
    )
    rs.fit(X_tr_p, y_tr)
    return rs.best_params_, float(-rs.best_score_), rs.best_estimator_

# ---------- Bootstrap utils (moving-block comme Ridge) ----------
def block_bootstrap_rows(index, L, rng):
    n = len(index)
    if n < 3:
        return np.arange(n)
    L = max(2, min(int(L), n-1))
    nb = int(np.ceil(n / L))
    starts = rng.integers(0, n - L + 1, size=nb)
    ix = np.concatenate([np.arange(s, s+L) for s in starts])[:n]
    return ix

def bagged_predict_lgbm(X_tr_raw, y_tr, x_fore_raw, prep, best_params, B, L, rng):
    """
    Bagging en blocs pour LightGBM (pr√©proc fix√© comme Ridge).
    """
    # Base fit (r√©f√©rence) sur TRAIN complet pr√©trait√©
    X_tr_p = apply_preproc(X_tr_raw, prep)
    base = LGBMRegressor(
        boosting_type="gbdt",
        objective="regression",
        importance_type="gain",
        n_jobs=1,
        random_state=123,
        verbose=-1,
        **best_params
    )
    base.fit(X_tr_p, y_tr.values)
    yhat_base = float(base.predict(apply_preproc(x_fore_raw, prep))[0])

    preds = []
    for b in range(B):
        ix = block_bootstrap_rows(X_tr_raw.index, L, rng)
        Xb = X_tr_raw.iloc[ix]
        yb = y_tr.iloc[ix]
        Xb_p = apply_preproc(Xb, prep)   # ‚ö†Ô∏è m√™me prep
        m = LGBMRegressor(
            boosting_type="gbdt",
            objective="regression",
            importance_type="gain",
            n_jobs=1,
            random_state=123 + b,
            verbose=-1,
            **best_params
        )
        m.fit(Xb_p, yb.values)
        preds.append(float(m.predict(apply_preproc(x_fore_raw, prep))[0]))
    return float(np.mean(preds)), np.array(preds), yhat_base

# ---------- Boucle pseudo-OOS ----------
def _months_between(a: pd.Timestamp, b: pd.Timestamp) -> int:
    return (b.year - a.year) * 12 + (b.month - a.month)

rows = []
models = []               # mod√®les "base" LGBM par refit (pour permutation/SHAP)
preprocs = []             # pr√©procs align√©s
train_ends = []           # dates de refit
cv_mae_history = []       # historique MAE CV lors des retunes
best_params_hist = []     # historique des params

last_fit_end = None
last_t_end   = y_all.index.max() - relativedelta(months=h)
last_refit_t = None
last_tune_t  = None

best_params = {}          # params courants (remplis au 1er retune)
base_model = None

for t_end in y_all.index:
    if t_end > last_t_end:
        break

    y_tr = y_all.loc[:t_end]
    X_tr = X_all.loc[:t_end]
    if len(y_tr) < min_train_n:
        continue

    # cadence refit / retune
    need_refit = (last_refit_t is None) or (_months_between(last_refit_t, t_end) >= refit_every_months)
    need_tune  = (t_end >= eval_start) and (last_tune_t is None or _months_between(last_tune_t, t_end) >= retune_every_months)

    # Pr√©proc global (fix√© pour ce t_end)
    X_tr_p_global, prep_global = fit_preproc(X_tr, wins=winsor_level, do_norm=norm_var)

    # Tuning si n√©cessaire
    if need_tune:
        best_params, best_cv_mae, _ = tune_lgbm(X_tr_p_global, y_tr.values, seed=12345)
        last_tune_t = t_end
        cv_mae_history.append(best_cv_mae)
        best_params_hist.append(best_params.copy())
    else:
        cv_mae_history.append(np.nan)
        best_params_hist.append(best_params.copy() if best_params else {})

    # Valeurs s√ªres si aucun retune n'a encore eu lieu
    if not best_params:
        best_params = dict(
            subsample=0.7, colsample_bytree=0.7, num_leaves=31,
            n_estimators=100, max_depth=-1, reg_alpha=0.0, reg_lambda=0.0,
            min_child_samples=10, min_split_gain=0.0
        )

    # Refit si n√©cessaire (stockage du mod√®le base + prep align√©s)
    if need_refit:
        base_model = LGBMRegressor(
            boosting_type="gbdt",
            objective="regression",
            importance_type="gain",
            n_jobs=1,
            random_state=12345,
            verbose=-1,
            **best_params
        )
        base_model.fit(X_tr_p_global, y_tr.values)
        models.append(base_model)
        preprocs.append(prep_global)
        train_ends.append(t_end)
        last_refit_t = t_end
        last_fit_end = t_end

    # Pr√©vision h=12
    t_fore = t_end + relativedelta(months=h)
    if t_fore in y_all.index:
        x_fore_raw = X_all.loc[[t_fore]]
        if use_bagging:
            yhat, dist, yhat_base = bagged_predict_lgbm(
                X_tr_raw=X_tr, y_tr=y_tr, x_fore_raw=x_fore_raw,
                prep=prep_global, best_params=best_params,
                B=B_boot, L=L_block, rng=rng
            )
            y_p05 = float(np.percentile(dist, 5))
            y_p95 = float(np.percentile(dist, 95))
        else:
            if base_model is None:
                base_model = LGBMRegressor(
                    boosting_type="gbdt",
                    objective="regression",
                    importance_type="gain",
                    n_jobs=1,
                    random_state=12345,
                    verbose=-1,
                    **best_params
                ).fit(X_tr_p_global, y_tr.values)
            yhat = float(base_model.predict(apply_preproc(x_fore_raw, prep_global))[0])
            yhat_base = yhat
            y_p05, y_p95 = (np.nan, np.nan)

        rows.append((t_fore, yhat, float(y_all.loc[t_fore]), yhat_base, y_p05, y_p95))

# ---------- DataFrame OOS ----------
if rows:
    df_oos = (
        pd.DataFrame(rows, columns=["date", "y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"])
          .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
          .set_index("date").sort_index()
    )
else:
    df_oos = pd.DataFrame(columns=["y_pred","y_true","y_pred_base","y_pred_p05","y_pred_p95"])
    df_oos.index = pd.to_datetime(pd.Index([]))

print(f"\n‚úÖ Pseudo-OOS (LightGBM, {NAME_TAG}) ‚Äî n pr√©visions = {len(df_oos)}")

# ---------- Scores ----------
def _scores(df):
    if len(df) == 0:
        return {"MAE": np.nan, "RMSE": np.nan, "R2": np.nan}
    mae  = mean_absolute_error(df["y_true"], df["y_pred"])
    rmse = np.sqrt(mean_squared_error(df["y_true"], df["y_pred"]))
    r2   = r2_score(df["y_true"], df["y_pred"]) if len(df) > 1 else np.nan
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}

df_val  = df_oos.loc[eval_start:eval_end].copy()
df_test = df_oos.loc[test_start:test_end].copy()
sc_val  = _scores(df_val)
sc_test = _scores(df_test)

print(f"\nüìä Validation 83‚Äì89 ‚Äî n={len(df_val)} | MAE={sc_val['MAE']:.3f} | RMSE={sc_val['RMSE']:.3f} | R¬≤={sc_val['R2']:.3f}")
print(f"üìä Test 90‚Äì{test_end.year} ‚Äî n={len(df_test)} | MAE={sc_test['MAE']:.3f} | RMSE={sc_test['RMSE']:.3f} | R¬≤={sc_test['R2']:.3f}")

# ---------- Sauvegardes ----------
bundle = {
    "oos_predictions": df_oos.reset_index(),
    "params": {
        "model": f"LightGBM + Bagging ({NAME_TAG})",
        "horizon": h,
        "min_train_n": min_train_n,
        "winsor_level": winsor_level,
        "norm_var": norm_var,
        "features": features,
        "eval_window": (str(eval_start.date()), str(eval_end.date())),
        "test_window": (str(test_start.date()), str(test_end.date())),
        # plan refit/retune
        "refit_every_months": int(refit_every_months),
        "retune_every_months": int(retune_every_months),
        # search
        "hyper_search": "RandomizedSearchCV (100 iters) + hv-block CV (5 folds, gap=12), scoring=MAE",
        "best_params_last": best_params.copy(),
        # bagging (m√™mes noms que Ridge)
        "use_bagging": bool(use_bagging),
        "B_boot": int(B_boot),
        "L_block": int(L_block),
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "index_freq": "MS",
        "n_obs_all": int(len(df_all)),
        "n_forecasts": int(len(df_oos)),
        "cv_mae_history": cv_mae_history,
        "best_params_history": best_params_hist
    },
    "train_fit_dates": pd.to_datetime(pd.Index(train_ends)),

    # ‚úÖ Pour permutation_importance_pseudo_oos & SHAP
    "models":   models,
    "preprocs": preprocs,
}

with open(LGBM_LAGS_BUNDLE, "wb") as f:
    pickle.dump(bundle, f)

pd.DataFrame([{
    "model": f"LightGBM+Bagging ({NAME_TAG})",
    "horizon": h,
    "min_train_n": min_train_n,
    "winsor_level": winsor_level,
    "norm_var": norm_var,
    "refit_every_months": int(refit_every_months),
    "retune_every_months": int(retune_every_months),
    "use_bagging": bool(use_bagging),
    "B_boot": int(B_boot),
    "L_block": int(L_block),
    "trained_until": bundle["meta"]["trained_until"],
    "n_forecasts": bundle["meta"]["n_forecasts"]
}]).to_csv(LGBM_LAGS_META, index=False)

# Artefact dernier ensemble (optionnel : pour audit)
lgbm_artifact = {
    "trained_until": bundle["meta"]["trained_until"],
    "horizon": h,
    "features": features,
    "n_models_base": len(models),
    "best_params_last": best_params.copy(),
}

with open(LGBM_LAGS_LAST_PKL, "wb") as f:
    pickle.dump(lgbm_artifact, f)

pd.DataFrame([{
    "trained_until": lgbm_artifact["trained_until"],
    "n_features": len(features),
    "horizon": h,
    "n_models_base": lgbm_artifact["n_models_base"],
    "model": f"LightGBM+Bagging ({NAME_TAG})"
}]).to_csv(LGBM_LAGS_LAST_META, index=False)

print(f"\nüíæ Bundle OOS sauvegard√© ‚Üí {LGBM_LAGS_BUNDLE}")
print(f"üíæ M√©ta bundle       ‚Üí {LGBM_LAGS_META}")
print(f"üíæ Dernier mod√®le    ‚Üí {LGBM_LAGS_LAST_PKL}")
print(f"üíæ M√©ta dernier fit  ‚Üí {LGBM_LAGS_LAST_META}")

Ajout feature: UNRATE_lag12 (y_(t-h)) ‚Üí OK
‚úÖ Donn√©es pr√™tes : 1960-01-01 ‚Üí 2025-08-01 | n=788 | freq=MS
Features (11): ['TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P500', 'BUSLOANS'] ...

‚úÖ Pseudo-OOS (LightGBM, with_UNRATE_lags_h12) ‚Äî n pr√©visions = 741

üìä Validation 83‚Äì89 ‚Äî n=84 | MAE=0.749 | RMSE=1.024 | R¬≤=-0.344
üìä Test 90‚Äì2025 ‚Äî n=428 | MAE=0.761 | RMSE=1.436 | R¬≤=0.113

üíæ Bundle OOS sauvegard√© ‚Üí lightgbm_regression__with_UNRATE_lags_h12.pkl
üíæ M√©ta bundle       ‚Üí lightgbm_regression_meta__with_UNRATE_lags_h12.csv
üíæ Dernier mod√®le    ‚Üí LGBM_last_trained_model__with_UNRATE_lags_h12.pkl
üíæ M√©ta dernier fit  ‚Üí LGBM_last_trained_model_meta__with_UNRATE_lags_h12.csv


# RIDGE avec lags = 12 du taux de ch√¥mage

In [15]:
# ==========================================================
# üîπ Ridge Regression + Bagging (pseudo-OOS, h=12)
#    ‚Üí ajoute UNRATE_lag12 comme variable explicative
#    ‚Üí gestion robuste des NaN (CV/fit/pred)
#    ‚Üí artefacts tagg√©s: with_UNRATE_lags_h12
# ==========================================================
import numpy as np
import pandas as pd
import pickle
from dateutil.relativedelta import relativedelta

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------- Param√®tres g√©n√©raux ----------
h = 12
min_train_n = 36           # ‚â• 3 ans avant de commencer √† pr√©voir
winsor_level = 0.01        # winsorisation (1er/99e percentiles)
norm_var = True            # normaliser ou non
target_col = "UNRATE"      # cible dans df_stationary

# S√©lection d'alpha : "cv" (CV 5-fold) OU une valeur float (ex: 1.0)
alpha_mode = "cv"
alpha_grid = np.logspace(-4, 4, 30)   # utilis√© si alpha_mode="cv"

# Fen√™tres d'√©valuation / test
eval_start = pd.Timestamp("1983-01-01")
eval_end   = pd.Timestamp("1989-12-31")
test_start = pd.Timestamp("1990-01-01")
test_end   = pd.Timestamp("2025-12-31")

# ---------- Bagging (bootstrap en blocs) ----------
use_bagging = True
B_boot = 30               # comme les auteurs
L_block = 12              # blocs annuels (12 mois)
rng = np.random.default_rng(123)  # seed bootstrap

# ---------- Fichiers de sortie (VARIANTE AVEC RETARDS) ----------
NAME_TAG = f"with_{target_col}_lags_h{h}"   # ex: with_UNRATE_lags_h12
RIDGE_LAGS_PKL  = f"ridge_regression__{NAME_TAG}.pkl"        # bundle (dict)
RIDGE_LAGS_META = f"ridge_regression_meta__{NAME_TAG}.csv"   # m√©ta r√©sum√©

# ---------- Pr√©paration df_stationary ----------
def _ensure_ms_index(df: pd.DataFrame) -> pd.DataFrame:
    """Force un index DatetimeIndex en d√©but de mois (MS)."""
    if "date" in df.columns:
        df = df.set_index("date")
    idx = pd.to_datetime(df.index)
    df = df.copy()
    df.index = idx.to_period("M").to_timestamp(how="start")
    return df.asfreq("MS")

# ‚ö†Ô∏è On suppose df_stationary d√©j√† charg√© en m√©moire
df_all = _ensure_ms_index(df_stationary).sort_index()

if target_col not in df_all.columns:
    raise ValueError(f"La colonne cible '{target_col}' est absente de df_stationary.")

y_all = df_all[target_col].astype(float)
X_all = df_all.drop(columns=[target_col]).astype(float)

# --- Ajout du lag h (=12) de la cible comme variable explicative ---
lag_feat_name = f"{target_col}_lag{h}"  # 'UNRATE_lag12'
X_all[lag_feat_name] = y_all.shift(h)

features = list(X_all.columns)

print(f"‚úÖ Donn√©es pr√™tes : {df_all.index.min().date()} ‚Üí {df_all.index.max().date()} | n={len(df_all)} | freq=MS")
print(f"Ajout feature: {lag_feat_name} (y_(t-h)) ‚Üí OK")
print(f"Features ({len(features)}): {features[:6]}{' ...' if len(features)>6 else ''}")

# ---------- Pr√©proc ----------
def fit_preproc(X: pd.DataFrame, wins=0.01, do_norm=True):
    """Apprend winsor + normalisation sur TRAIN et renvoie (X_trans, prep)."""
    lower = X.quantile(wins)
    upper = X.quantile(1 - wins)
    Xw = X.clip(lower=lower, upper=upper, axis=1)
    if do_norm:
        mean = Xw.mean()
        std  = Xw.std().replace(0, 1)
        Xn   = (Xw - mean) / std
        prep = {"lower": lower, "upper": upper, "mean": mean, "std": std, "norm": True}
        return Xn, prep
    else:
        prep = {"lower": lower, "upper": upper, "mean": None, "std": None, "norm": False}
        return Xw, prep

def apply_preproc(X: pd.DataFrame, prep: dict):
    """Applique le pr√©proc appris (pas de fuite)."""
    Xp = X.clip(lower=prep["lower"], upper=prep["upper"], axis=1)
    if prep["norm"]:
        Xp = (Xp - prep["mean"]) / prep["std"].replace(0, 1)
    return Xp

# ---------- S√©lection d'alpha ----------
def select_alpha(X_tr_p: pd.DataFrame, y_tr: pd.Series, mode="cv"):
    """Renvoie (alpha, cv_mae) si mode='cv', sinon (alpha, np.nan)."""
    if mode == "cv":
        model = Ridge(fit_intercept=True)
        grid = GridSearchCV(
            model,
            {"alpha": alpha_grid},
            scoring="neg_mean_absolute_error",
            cv=5,
            n_jobs=-1,
        )
        grid.fit(X_tr_p, y_tr.values)
        best_alpha = float(grid.best_estimator_.alpha)
        cv_mae = float(-grid.best_score_)
        return best_alpha, cv_mae
    else:
        try:
            a = float(mode)
        except Exception as e:
            raise ValueError(f"alpha_mode doit √™tre 'cv' ou un float. Re√ßu: {mode}") from e
        return a, np.nan

# ---------- Bootstrap utils ----------
def block_bootstrap_rows(index, L, rng):
    """Moving-block bootstrap sur index (positions)."""
    n = len(index)
    if n < 3:
        return np.arange(n)  # fallback
    L = max(2, min(int(L), n-1))
    nb = int(np.ceil(n / L))
    starts = rng.integers(0, n - L + 1, size=nb)
    ix = np.concatenate([np.arange(s, s+L) for s in starts])[:n]
    return ix

def bagged_predict_ridge(X_tr_raw: pd.DataFrame, y_tr: pd.Series, x_fore_raw: pd.DataFrame,
                         prep: dict, alpha: float, B: int, L: int, rng):
    """
    Bagging (moving-block bootstrap) pour Ridge :
      - pr√©proc fix√© sur TRAIN original (pas r√©-appris)
      - on droppe les lignes NaN apr√®s pr√©proc (Ridge n'accepte pas NaN)
      - bootstrap sur l'index des lignes propres
      - pr√©diction h ; renvoie (moyenne, distribution, base_pred)
    """
    # Pr√©proc du TRAIN + nettoyage
    X_tr_p = apply_preproc(X_tr_raw, prep)
    mask_clean = X_tr_p.notna().all(axis=1) & y_tr.notna()
    Xc = X_tr_p.loc[mask_clean]
    yc = y_tr.loc[mask_clean]

    if len(Xc) < 5:
        raise ValueError("Trop peu d'observations propres pour bagging Ridge.")

    # Pr√©proc de x_fore + imputation 0 (apr√®s normalisation 0 = moyenne)
    x_fore_p = apply_preproc(x_fore_raw, prep).fillna(0.0)

    # Base fit (r√©f√©rence) sur TRAIN propre
    base = Ridge(alpha=alpha, fit_intercept=True)
    base.fit(Xc.values, yc.values)
    yhat_base = float(base.predict(x_fore_p.values)[0])

    # Bootstrap sur l'index propre
    def _block_bootstrap_rows_from_clean(n_clean, L, rng):
        if n_clean < 3:
            return np.arange(n_clean)
        L_eff = max(2, min(int(L), n_clean - 1))
        nb = int(np.ceil(n_clean / L_eff))
        starts = rng.integers(0, n_clean - L_eff + 1, size=nb)
        ix_pos = np.concatenate([np.arange(s, s + L_eff) for s in starts])[:n_clean]
        return ix_pos

    preds = []
    n_clean = len(Xc)
    for b in range(B):
        ix_pos = _block_bootstrap_rows_from_clean(n_clean, L, rng)
        Xb = Xc.iloc[ix_pos]
        yb = yc.iloc[ix_pos]
        m = Ridge(alpha=alpha, fit_intercept=True)
        m.fit(Xb.values, yb.values)
        preds.append(float(m.predict(x_fore_p.values)[0]))

    return float(np.mean(preds)), np.array(preds), yhat_base

# ---------- Boucle pseudo-OOS ----------
rows = []                 # (date, y_pred, y_true, y_pred_base, p05, p95)
models = []               # stockage des mod√®les (un par fen√™tre)
preprocs = []             # stockage prep (align√©s aux mod√®les)
train_ends = []           # dates de fin train (pour trace)
alpha_history = []        # alpha utilis√© par fen√™tre
cv_mae_history = []       # MAE CV (si mode="cv"), sinon NaN

last_t_end = y_all.index.max() - relativedelta(months=h)
last_model = None
last_fit_end = None

for t_end in y_all.index:
    if t_end > last_t_end:
        break

    y_tr = y_all.loc[:t_end]
    X_tr = X_all.loc[:t_end]
    if len(y_tr) < min_train_n:
        continue

    # Pr√©proc appris sur TRAIN courant
    X_tr_p, prep = fit_preproc(X_tr, wins=winsor_level, do_norm=norm_var)

    # üîß Nettoyage TRAIN (Ridge / GridSearchCV n'acceptent pas les NaN)
    mask_clean = X_tr_p.notna().all(axis=1) & y_tr.notna()
    X_tr_p_clean = X_tr_p.loc[mask_clean]
    y_tr_clean   = y_tr.loc[mask_clean]

    # S√©curit√© : si trop peu d'observations propres, on saute cette it√©ration
    if len(X_tr_p_clean) < 10:
        continue

    # Choix alpha (CV ou fixe) sur TRAIN propre
    if alpha_mode == "cv" and len(X_tr_p_clean) < 25:  # ~5 obs/fold min
        alpha, cv_mae = 1.0, np.nan
    else:
        alpha, cv_mae = select_alpha(X_tr_p_clean, y_tr_clean, mode=alpha_mode)
    alpha_history.append(alpha)
    cv_mae_history.append(cv_mae)

    # Horizon cibl√©
    t_fore = t_end + relativedelta(months=h)
    if t_fore in y_all.index:
        x_fore_raw = X_all.loc[[t_fore]]
        # üîß Pas de NaN √† la pr√©diction (apr√®s normalisation, 0 = moyenne)
        x_fore_p = apply_preproc(x_fore_raw, prep).fillna(0.0)

        if use_bagging:
            yhat_h, dist, yhat_base = bagged_predict_ridge(
                X_tr_raw=X_tr, y_tr=y_tr, x_fore_raw=x_fore_raw,
                prep=prep, alpha=alpha, B=B_boot, L=L_block, rng=rng
            )
            y_p05 = float(np.percentile(dist, 5))
            y_p95 = float(np.percentile(dist, 95))
        else:
            model_tmp = Ridge(alpha=alpha, fit_intercept=True)
            model_tmp.fit(X_tr_p_clean.values, y_tr_clean.values)
            yhat_h = float(model_tmp.predict(x_fore_p.values)[0])
            yhat_base = yhat_h
            y_p05, y_p95 = (np.nan, np.nan)

        rows.append((t_fore, yhat_h, float(y_all.loc[t_fore]), yhat_base, y_p05, y_p95))

    # trace / dernier mod√®le base (utile pour sauvegarde) ‚Äî entra√Æn√© sur TRAIN propre
    last_model = Ridge(alpha=alpha, fit_intercept=True).fit(X_tr_p_clean.values, y_tr_clean.values)
    last_fit_end = t_end
    models.append(last_model)
    preprocs.append(prep)
    train_ends.append(t_end)

# ---------- DataFrame OOS ----------
if rows:
    df_oos = (
        pd.DataFrame(rows, columns=["date", "y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"])
          .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
          .set_index("date").sort_index()
    )
else:
    df_oos = pd.DataFrame(columns=["y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"])
    df_oos.index = pd.to_datetime(pd.Index([]))

print(f"\n‚úÖ Pseudo-OOS (Ridge, {NAME_TAG}) ‚Äî n pr√©visions = {len(df_oos)}")
print(df_oos.head(3))

# ---------- Scores ----------
def _scores(df: pd.DataFrame):
    if len(df) == 0:
        return {"MAE": np.nan, "RMSE": np.nan, "R2": np.nan}
    mae  = mean_absolute_error(df["y_true"], df["y_pred"])
    rmse = np.sqrt(mean_squared_error(df["y_true"], df["y_pred"]))
    r2   = r2_score(df["y_true"], df["y_pred"]) if len(df) > 1 else np.nan
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}

df_val  = df_oos.loc[eval_start:eval_end].copy()
df_test = df_oos.loc[test_start:test_end].copy()

sc_val  = _scores(df_val)
sc_test = _scores(df_test)

print(f"\nüìä Validation 83‚Äì89 ‚Äî n={len(df_val)} | MAE={sc_val['MAE']:.3f} | RMSE={sc_val['RMSE']:.3f} | R¬≤={sc_val['R2']:.3f}")
print(f"üìä Test 90‚Äì{test_end.year} ‚Äî n={len(df_test)} | MAE={sc_test['MAE']:.3f} | RMSE={sc_test['RMSE']:.3f} | R¬≤={sc_test['R2']:.3f}")

# (option) Comparaison bagging vs base
if "y_pred_base" in df_oos and df_oos["y_pred_base"].notna().any():
    mae_bag  = mean_absolute_error(df_oos["y_true"], df_oos["y_pred"])
    mae_base = mean_absolute_error(df_oos["y_true"], df_oos["y_pred_base"])
    print(f"‚û°Ô∏è  Gain bagging (ŒîMAE) = {mae_base - mae_bag:.3f}")

# ---------- Sauvegardes ----------
bundle = {
    "oos_predictions": df_oos.reset_index(),     # (date, y_pred, y_true, y_pred_base, y_pred_p05, y_pred_p95)
    "params": {
        "model": f"Ridge ({NAME_TAG})",
        "horizon": h,
        "min_train_n": min_train_n,
        "winsor_level": winsor_level,
        "norm_var": norm_var,
        "features": features,
        "eval_window": (str(eval_start.date()), str(eval_end.date())),
        "test_window": (str(test_start.date()), str(test_end.date())),
        # ---- bagging ----
        "use_bagging": bool(use_bagging),
        "B_boot": int(B_boot),
        "L_block": int(L_block),
        # ---- alpha ----
        "alpha_mode": alpha_mode,
        "alpha_grid": list(alpha_grid) if alpha_mode == "cv" else None,
        "best_alpha_last": (alpha_history[-1] if len(alpha_history) else None),
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "index_freq": "MS",
        "n_obs_all": int(len(df_all)),
        "n_forecasts": int(len(df_oos)),
        "alpha_history": alpha_history,
        "cv_mae_history": cv_mae_history,
    },
    "train_fit_dates": pd.to_datetime(pd.Index(train_ends)),

    # ‚úÖ Pour permutation_importance_pseudo_oos & SHAP
    "models":   models,     # liste des mod√®les Ridge (un par fen√™tre)
    "preprocs": preprocs,   # liste des pr√©proc (dict) align√©s aux mod√®les
}

# --- Sauvegarde du bundle complet ---
with open(RIDGE_LAGS_PKL, "wb") as f:
    pickle.dump(bundle, f)

# --- Sauvegarde du r√©sum√© m√©ta s√©par√© (lisible rapidement) ---
pd.DataFrame([{
    "model": f"Ridge ({NAME_TAG})",
    "horizon": h,
    "min_train_n": min_train_n,
    "winsor_level": winsor_level,
    "norm_var": norm_var,
    "use_bagging": bool(use_bagging),
    "B_boot": int(B_boot),
    "L_block": int(L_block),
    "alpha_mode": alpha_mode,
    "best_alpha_last": bundle["params"]["best_alpha_last"],
    "trained_until": bundle["meta"]["trained_until"],
    "n_forecasts": bundle["meta"]["n_forecasts"],
}]).to_csv(RIDGE_LAGS_META, index=False)

print(f"\nüíæ Bundle sauvegard√© ‚Üí {RIDGE_LAGS_PKL}")
print(f"üíæ M√©ta sauvegard√©e ‚Üí {RIDGE_LAGS_META}")
print(f"üì¶ Contenu du bundle : {list(bundle.keys())}")

‚úÖ Donn√©es pr√™tes : 1960-01-01 ‚Üí 2025-08-01 | n=788 | freq=MS
Ajout feature: UNRATE_lag12 (y_(t-h)) ‚Üí OK
Features (11): ['TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P500', 'BUSLOANS'] ...

‚úÖ Pseudo-OOS (Ridge, with_UNRATE_lags_h12) ‚Äî n pr√©visions = 741
              y_pred  y_true  y_pred_base  y_pred_p05  y_pred_p95
date                                                             
1963-12-01  1.040464     0.0     0.281075   -0.155252    1.481465
1964-01-01  0.728928    -0.1    -0.028176    0.076145    1.154382
1964-02-01  0.264863    -0.5     0.307588   -0.430290    0.729122

üìä Validation 83‚Äì89 ‚Äî n=84 | MAE=0.808 | RMSE=1.023 | R¬≤=-0.340
üìä Test 90‚Äì2025 ‚Äî n=428 | MAE=0.794 | RMSE=1.486 | R¬≤=0.051
‚û°Ô∏è  Gain bagging (ŒîMAE) = 0.005

üíæ Bundle sauvegard√© ‚Üí ridge_regression__with_UNRATE_lags_h12.pkl
üíæ M√©ta sauvegard√©e ‚Üí ridge_regression_meta__with_UNRATE_lags_h12.csv
üì¶ Contenu du bundle : ['oos_predictions', 'params', 'meta', 'train_fit_dat

# LightGBMnoUSREC

In [22]:
# ==========================================================
# üîπ LightGBM + Bagging (pseudo-OOS, h=12)
#    ‚Üí VARIANTE SANS USREC (mais AVEC y_{t-h})
#    ‚Üí artefacts tagg√©s: with_<target>_lags_h<h>__noUSREC
# ==========================================================
import numpy as np
import pandas as pd
import pickle
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from lightgbm import LGBMRegressor

# ---------- Param√®tres g√©n√©raux ----------
h = 12
min_train_n = 36
winsor_level = 0.01
norm_var = True
target_col = "UNRATE"

# Fen√™tres d‚Äô√©val / test
eval_start = pd.Timestamp("1983-01-01")
eval_end   = pd.Timestamp("1989-12-31")
test_start = pd.Timestamp("1990-01-01")
test_end   = pd.Timestamp("2025-12-31")

# ---------- Refit/Retune ----------
refit_every_months  = 12
retune_every_months = 36

# ---------- Bagging (moving-block bootstrap) ----------
use_bagging = True
B_boot = 30
L_block = 12
rng = np.random.default_rng(12345)

# ---------- Fichiers de sortie (VARIANTE SANS USREC) ----------
NAME_TAG = f"with_{target_col}_lags_h{h}__noUSREC"   # ‚Üê clair et explicite
LGBM_NOUSREC_LAGS_BUNDLE    = f"lightgbm_regression__{NAME_TAG}.pkl"
LGBM_NOUSREC_LAGS_META      = f"lightgbm_regression_meta__{NAME_TAG}.csv"
LGBM_NOUSREC_LAGS_LAST_PKL  = f"LGBM_last_trained_model__{NAME_TAG}.pkl"
LGBM_NOUSREC_LAGS_LAST_META = f"LGBM_last_trained_model_meta__{NAME_TAG}.csv"

# ---------- Pr√©paration df_stationary ----------
def _ensure_ms_index(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "date" in df.columns:
        df = df.set_index("date")
    idx = pd.to_datetime(df.index)
    df.index = idx.to_period("M").to_timestamp(how="start")
    return df.asfreq("MS")

# ‚ö†Ô∏è On suppose df_stationary dispo
df_all = _ensure_ms_index(df_stationary).sort_index()

if target_col not in df_all.columns:
    raise ValueError(f"La cible '{target_col}' est absente de df_stationary.")

y_all = df_all[target_col].astype(float)
X_all = df_all.drop(columns=[target_col]).astype(float)

# --- ‚ùå Supprimer toute colonne USREC (USREC, USREC_lag*, etc.) ---
usrec_cols = [c for c in X_all.columns if "USREC" in str(c).upper()]
if usrec_cols:
    X_all = X_all.drop(columns=usrec_cols)
    print(f"üßπ Colonnes USREC exclues: {usrec_cols}")

# --- ‚úÖ Ajout du lag h (=12) de la cible comme variable explicative ---
lag_feat_name = f"{target_col}_lag{h}"   # ex: 'UNRATE_lag12'
X_all[lag_feat_name] = y_all.shift(h)

# LightGBM : √©viter espaces dans noms de colonnes
X_all.columns = [str(c).replace(" ", "_") for c in X_all.columns]
features = list(X_all.columns)

print(f"Ajout feature: {lag_feat_name} (y_(t-h)) ‚Üí OK")
print(f"‚úÖ Donn√©es pr√™tes : {df_all.index.min().date()} ‚Üí {df_all.index.max().date()} | n={len(df_all)} | freq=MS")
print(f"Features ({len(features)}): {features[:6]}{' ...' if len(features)>6 else ''}")

# ---------- Pr√©proc ----------
def fit_preproc(X: pd.DataFrame, wins=0.01, do_norm=True):
    lower = X.quantile(wins)
    upper = X.quantile(1 - wins)
    Xw = X.clip(lower=lower, upper=upper, axis=1)
    if do_norm:
        mean = Xw.mean()
        std  = Xw.std().replace(0, 1)
        Xn   = (Xw - mean) / std
        prep = {"lower": lower, "upper": upper, "mean": mean, "std": std, "norm": True}
        return Xn, prep
    else:
        prep = {"lower": lower, "upper": upper, "mean": None, "std": None, "norm": False}
        return Xw, prep

def apply_preproc(X: pd.DataFrame, prep: dict):
    Xp = X.clip(lower=prep["lower"], upper=prep["upper"], axis=1)
    if prep["norm"]:
        Xp = (Xp - prep["mean"]) / prep["std"].replace(0, 1)
    return Xp

# ---------- hv-block CV ----------
class HVBlockCV:
    def __init__(self, n_splits=5, gap=12):
        self.n_splits = n_splits
        self.gap = 12 if gap is None else int(gap)
    def split(self, X, y=None, groups=None):
        n = len(X)
        fold_sizes = np.full(self.n_splits, n // self.n_splits, dtype=int)
        fold_sizes[: n % self.n_splits] += 1
        idx = np.arange(n)
        cur = 0
        for fs in fold_sizes:
            start, stop = cur, cur + fs
            test_idx = idx[start:stop]
            train_mask = np.ones(n, dtype=bool)
            left = max(0, start - self.gap)
            right = min(n, stop + self.gap)
            train_mask[left:right] = False
            train_idx = idx[train_mask]
            cur = stop
            if len(train_idx) == 0 or len(test_idx) == 0:
                continue
            yield train_idx, test_idx
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

# ---------- Grille LightGBM ----------
param_dist = {
    "subsample":        [0.05,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0],
    "colsample_bytree": [.2,.3,.4,.5,.6,.7,1.0],
    "num_leaves":       [2,3,4,5,8,10,20,40,70,100],
    "n_estimators":     [5,10,20,30,40,50,75,100],
    "max_depth":        [1,2,3,5,8,15,-1],
    "reg_alpha":        [0, .1, 1, 2, 7, 10, 50, 100],
    "reg_lambda":       [0, .1, 1, 10, 20, 50, 100],
    "min_child_samples":[5,10,15],
    "min_split_gain":   [0.0, 0.01, 0.05],
}

def tune_lgbm(X_tr_p: pd.DataFrame, y_tr: np.ndarray, seed=12345):
    cv = HVBlockCV(n_splits=5, gap=12)
    base = LGBMRegressor(
        boosting_type="gbdt",
        objective="regression",
        importance_type="gain",
        random_state=seed,
        n_jobs=1,
        verbose=-1
    )
    rs = RandomizedSearchCV(
        estimator=base,
        param_distributions=param_dist,
        n_iter=100,
        scoring="neg_mean_absolute_error",
        cv=cv,
        n_jobs=-1,
        random_state=seed,
        refit=True,
        verbose=0
    )
    rs.fit(X_tr_p, y_tr)
    return rs.best_params_, float(-rs.best_score_), rs.best_estimator_

# ---------- Bootstrap utils ----------
def block_bootstrap_rows(index, L, rng):
    n = len(index)
    if n < 3:
        return np.arange(n)
    L = max(2, min(int(L), n-1))
    nb = int(np.ceil(n / L))
    starts = rng.integers(0, n - L + 1, size=nb)
    ix = np.concatenate([np.arange(s, s+L) for s in starts])[:n]
    return ix

def bagged_predict_lgbm(X_tr_raw, y_tr, x_fore_raw, prep, best_params, B, L, rng):
    X_tr_p = apply_preproc(X_tr_raw, prep)
    base = LGBMRegressor(
        boosting_type="gbdt",
        objective="regression",
        importance_type="gain",
        n_jobs=1,
        random_state=123,
        verbose=-1,
        **best_params
    )
    base.fit(X_tr_p, y_tr.values)
    yhat_base = float(base.predict(apply_preproc(x_fore_raw, prep))[0])

    preds = []
    for b in range(B):
        ix = block_bootstrap_rows(X_tr_raw.index, L, rng)
        Xb = X_tr_raw.iloc[ix]
        yb = y_tr.iloc[ix]
        Xb_p = apply_preproc(Xb, prep)
        m = LGBMRegressor(
            boosting_type="gbdt",
            objective="regression",
            importance_type="gain",
            n_jobs=1,
            random_state=123 + b,
            verbose=-1,
            **best_params
        )
        m.fit(Xb_p, yb.values)
        preds.append(float(m.predict(apply_preproc(x_fore_raw, prep))[0]))
    return float(np.mean(preds)), np.array(preds), yhat_base

# ---------- Boucle pseudo-OOS ----------
def _months_between(a: pd.Timestamp, b: pd.Timestamp) -> int:
    return (b.year - a.year) * 12 + (b.month - a.month)

rows = []
models = []
preprocs = []
train_ends = []
cv_mae_history = []
best_params_hist = []

last_fit_end = None
last_t_end   = y_all.index.max() - relativedelta(months=h)
last_refit_t = None
last_tune_t  = None

best_params = {}
base_model = None

for t_end in y_all.index:
    if t_end > last_t_end:
        break

    y_tr = y_all.loc[:t_end]
    X_tr = X_all.loc[:t_end]
    if len(y_tr) < min_train_n:
        continue

    need_refit = (last_refit_t is None) or (_months_between(last_refit_t, t_end) >= refit_every_months)
    need_tune  = (t_end >= eval_start) and (last_tune_t is None or _months_between(last_tune_t, t_end) >= retune_every_months)

    X_tr_p_global, prep_global = fit_preproc(X_tr, wins=winsor_level, do_norm=norm_var)

    if need_tune:
        best_params, best_cv_mae, _ = tune_lgbm(X_tr_p_global, y_tr.values, seed=12345)
        last_tune_t = t_end
        cv_mae_history.append(best_cv_mae)
        best_params_hist.append(best_params.copy())
    else:
        cv_mae_history.append(np.nan)
        best_params_hist.append(best_params.copy() if best_params else {})

    if not best_params:
        best_params = dict(
            subsample=0.7, colsample_bytree=0.7, num_leaves=31,
            n_estimators=100, max_depth=-1, reg_alpha=0.0, reg_lambda=0.0,
            min_child_samples=10, min_split_gain=0.0
        )

    if need_refit:
        base_model = LGBMRegressor(
            boosting_type="gbdt",
            objective="regression",
            importance_type="gain",
            n_jobs=1,
            random_state=12345,
            verbose=-1,
            **best_params
        )
        base_model.fit(X_tr_p_global, y_tr.values)
        models.append(base_model)
        preprocs.append(prep_global)
        train_ends.append(t_end)
        last_refit_t = t_end
        last_fit_end = t_end

    t_fore = t_end + relativedelta(months=h)
    if t_fore in y_all.index:
        x_fore_raw = X_all.loc[[t_fore]]
        if use_bagging:
            yhat, dist, yhat_base = bagged_predict_lgbm(
                X_tr_raw=X_tr, y_tr=y_tr, x_fore_raw=x_fore_raw,
                prep=prep_global, best_params=best_params,
                B=B_boot, L=L_block, rng=rng
            )
            y_p05 = float(np.percentile(dist, 5))
            y_p95 = float(np.percentile(dist, 95))
        else:
            if base_model is None:
                base_model = LGBMRegressor(
                    boosting_type="gbdt",
                    objective="regression",
                    importance_type="gain",
                    n_jobs=1,
                    random_state=12345,
                    verbose=-1,
                    **best_params
                ).fit(X_tr_p_global, y_tr.values)
            yhat = float(base_model.predict(apply_preproc(x_fore_raw, prep_global))[0])
            yhat_base = yhat
            y_p05, y_p95 = (np.nan, np.nan)

        rows.append((t_fore, yhat, float(y_all.loc[t_fore]), yhat_base, y_p05, y_p95))

# ---------- DataFrame OOS ----------
if rows:
    df_oos = (
        pd.DataFrame(rows, columns=["date", "y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"])
          .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
          .set_index("date").sort_index()
    )
else:
    df_oos = pd.DataFrame(columns=["y_pred","y_true","y_pred_base","y_pred_p05","y_pred_p95"])
    df_oos.index = pd.to_datetime(pd.Index([]))

print(f"\n‚úÖ Pseudo-OOS (LightGBM, {NAME_TAG}) ‚Äî n pr√©visions = {len(df_oos)}")

# ---------- Scores ----------
def _scores(df):
    if len(df) == 0:
        return {"MAE": np.nan, "RMSE": np.nan, "R2": np.nan}
    mae  = mean_absolute_error(df["y_true"], df["y_pred"])
    rmse = np.sqrt(mean_squared_error(df["y_true"], df["y_pred"]))
    r2   = r2_score(df["y_true"], df["y_pred"]) if len(df) > 1 else np.nan
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}

df_val  = df_oos.loc[eval_start:eval_end].copy()
df_test = df_oos.loc[test_start:test_end].copy()
sc_val  = _scores(df_val)
sc_test = _scores(df_test)

print(f"\nüìä Validation 83‚Äì89 ‚Äî n={len(df_val)} | MAE={sc_val['MAE']:.3f} | RMSE={sc_val['RMSE']:.3f} | R¬≤={sc_val['R2']:.3f}")
print(f"üìä Test 90‚Äì{test_end.year} ‚Äî n={len(df_test)} | MAE={sc_test['MAE']:.3f} | RMSE={sc_test['RMSE']:.3f} | R¬≤={sc_test['R2']:.3f}")

# ---------- Sauvegardes ----------
bundle = {
    "oos_predictions": df_oos.reset_index(),
    "params": {
        "model": f"LightGBM + Bagging ({NAME_TAG})",
        "horizon": h,
        "min_train_n": min_train_n,
        "winsor_level": winsor_level,
        "norm_var": norm_var,
        "features": features,   # ‚Üê SANS aucune USREC
        "eval_window": (str(eval_start.date()), str(eval_end.date())),
        "test_window": (str(test_start.date()), str(test_end.date())),
        "refit_every_months": int(refit_every_months),
        "retune_every_months": int(retune_every_months),
        "hyper_search": "RandomizedSearchCV (100 iters) + hv-block CV (5 folds, gap=12), scoring=MAE",
        "best_params_last": best_params.copy(),
        "use_bagging": bool(use_bagging),
        "B_boot": int(B_boot),
        "L_block": int(L_block),
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "index_freq": "MS",
        "n_obs_all": int(len(df_all)),
        "n_forecasts": int(len(df_oos)),
        "cv_mae_history": cv_mae_history,
        "best_params_history": best_params_hist
    },
    "train_fit_dates": pd.to_datetime(pd.Index(train_ends)),
    "models":   models,
    "preprocs": preprocs,
}

with open(LGBM_NOUSREC_LAGS_BUNDLE, "wb") as f:
    pickle.dump(bundle, f)

pd.DataFrame([{
    "model": f"LightGBM+Bagging ({NAME_TAG})",
    "horizon": h,
    "min_train_n": min_train_n,
    "winsor_level": winsor_level,
    "norm_var": norm_var,
    "refit_every_months": int(refit_every_months),
    "retune_every_months": int(retune_every_months),
    "use_bagging": bool(use_bagging),
    "B_boot": int(B_boot),
    "L_block": int(L_block),
    "trained_until": bundle["meta"]["trained_until"],
    "n_forecasts": bundle["meta"]["n_forecasts"]
}]).to_csv(LGBM_NOUSREC_LAGS_META, index=False)

lgbm_artifact = {
    "trained_until": bundle["meta"]["trained_until"],
    "horizon": h,
    "features": features,
    "n_models_base": len(models),
    "best_params_last": best_params.copy(),
}

with open(LGBM_NOUSREC_LAGS_LAST_PKL, "wb") as f:
    pickle.dump(lgbm_artifact, f)

pd.DataFrame([{
    "trained_until": lgbm_artifact["trained_until"],
    "n_features": len(features),
    "horizon": h,
    "n_models_base": lgbm_artifact["n_models_base"],
    "model": f"LightGBM+Bagging ({NAME_TAG})"
}]).to_csv(LGBM_NOUSREC_LAGS_LAST_META, index=False)

print(f"\nüíæ Bundle OOS sauvegard√© ‚Üí {LGBM_NOUSREC_LAGS_BUNDLE}")
print(f"üíæ M√©ta bundle       ‚Üí {LGBM_NOUSREC_LAGS_META}")
print(f"üíæ Dernier mod√®le    ‚Üí {LGBM_NOUSREC_LAGS_LAST_PKL}")
print(f"üíæ M√©ta dernier fit  ‚Üí {LGBM_NOUSREC_LAGS_LAST_META}")


üßπ Colonnes USREC exclues: ['USREC']
Ajout feature: UNRATE_lag12 (y_(t-h)) ‚Üí OK
‚úÖ Donn√©es pr√™tes : 1960-01-01 ‚Üí 2025-08-01 | n=788 | freq=MS
Features (10): ['TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P500', 'BUSLOANS'] ...

‚úÖ Pseudo-OOS (LightGBM, with_UNRATE_lags_h12__noUSREC) ‚Äî n pr√©visions = 741

üìä Validation 83‚Äì89 ‚Äî n=84 | MAE=0.827 | RMSE=1.076 | R¬≤=-0.482
üìä Test 90‚Äì2025 ‚Äî n=428 | MAE=0.767 | RMSE=1.401 | R¬≤=0.157

üíæ Bundle OOS sauvegard√© ‚Üí lightgbm_regression__with_UNRATE_lags_h12__noUSREC.pkl
üíæ M√©ta bundle       ‚Üí lightgbm_regression_meta__with_UNRATE_lags_h12__noUSREC.csv
üíæ Dernier mod√®le    ‚Üí LGBM_last_trained_model__with_UNRATE_lags_h12__noUSREC.pkl
üíæ M√©ta dernier fit  ‚Üí LGBM_last_trained_model_meta__with_UNRATE_lags_h12__noUSREC.csv
