# Package

In [2]:
import pandas as pd

# Importation des données

In [3]:
# Les données de test
df_stationary_test = pd.read_csv("df_stationary_test.csv", index_col="date")
df_stationary_test.index = pd.to_datetime(df_stationary_test.index)

In [4]:
df_stationary = pd.read_csv("df_stationary.csv", index_col="date")

In [5]:
df_stationary_unrate = df_stationary["UNRATE"]

In [10]:
# ---------- Préparation de la série ----------
# Si tu as déjà :
# df_stationary_unrate = df_stationary["UNRATE"]

y = df_stationary_unrate.copy()

# Vérifie que l’index est bien une date (sinon essaie de le convertir)
if not isinstance(y.index, (pd.DatetimeIndex, pd.PeriodIndex)):
    y.index = pd.to_datetime(y.index, errors="coerce")

# Force la fréquence mensuelle (début de mois)
y.index = y.index.to_period("M").to_timestamp(how="start")
y = y.sort_index().asfreq("MS").astype(float).dropna()

print(f"✅ Série prête : {y.index.min().date()} → {y.index.max().date()} | n={len(y)} | freq={y.index.freqstr}")

✅ Série prête : 1960-01-01 → 2025-08-01 | n=788 | freq=MS


# Test AR(p)

In [21]:
# ==========================================
# AR(p) — Pseudo-OOS continu (h=12) avec re-CV triennale
# réutilise la Series `y` déjà prête (freq=MS)
# ==========================================
import os, pickle, joblib
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------- Paramètres ----------
h = 12
min_train_n = 36                 # ≥ 3 ans de données pour démarrer
trend = "c"                      # "c" = constante ; "n" = sans constante
p_grid = range(1, 13)            # p ∈ {1,…,12}
cv_update_every_months = 36
cv_anchor = pd.Timestamp("1983-01-01")

# ---------- Utilitaires ----------
def months_since(anchor, t):
    return (t.year - anchor.year) * 12 + (t.month - anchor.month)

def rolling_mae_for_p(y_series, p, h, min_train):
    """MAE rolling à l'horizon h pour un p donné (sur y_series, en respectant l'ordre temporel)."""
    rows = []
    last_t_end = y_series.index.max() - relativedelta(months=h)
    for t_end in y_series.index:
        if t_end > last_t_end:
            break
        y_tr = y_series.loc[:t_end]
        if len(y_tr) < max(min_train, p + 1):
            continue
        model = AutoReg(y_tr, lags=p, old_names=False, trend=trend).fit()
        fc = model.predict(start=len(y_tr), end=len(y_tr) + h - 1)
        yhat_h = float(fc.iloc[-1])
        t_fore = t_end + relativedelta(months=h)
        if t_fore in y_series.index:
            rows.append((t_fore, yhat_h, float(y_series.loc[t_fore])))
    if not rows:
        return np.inf
    tmp = pd.DataFrame(rows, columns=["date", "y_hat", "y_true"]).set_index("date")
    return float(mean_absolute_error(tmp["y_true"], tmp["y_hat"]))

def select_p_by_cv(y_tr, p_grid, h, min_train):
    """Sélectionne p* minimisant le MAE(h) rolling calculé sur l'échantillon d'entraînement courant."""
    best_p, best_score = None, np.inf
    for p in p_grid:
        score = rolling_mae_for_p(y_tr, p, h, min_train)
        if score < best_score:
            best_score, best_p = score, p
    return int(best_p if best_p is not None else 1)

# ---------- Boucle pseudo-OOS continue ----------
rows = []
last_model = None
last_fit_end = None
current_p = None

# s'assurer que y est bien MS
y = pd.Series(y.astype(float).values, index=pd.to_datetime(y.index)).asfreq("MS").dropna()

last_t_end = y.index.max() - relativedelta(months=h)

for t_end in y.index:
    if t_end > last_t_end:
        break

    y_tr = y.loc[:t_end]
    if len(y_tr) < min_train_n:
        continue

    # re-CV à partir de 1983-01 tous les 36 mois
    if t_end >= cv_anchor:
        m = months_since(cv_anchor, t_end)
        need_cv = (m % cv_update_every_months == 0)
    else:
        need_cv = False

    if current_p is None and not need_cv:
        current_p = 1  # p par défaut avant la première re-CV

    if need_cv:
        current_p = select_p_by_cv(y_tr, p_grid, h, min_train_n)
        print(f"[CV] {t_end.date()} → p* = {current_p}")

    # fit AR(p) avec p courant
    arp = AutoReg(y_tr, lags=current_p, old_names=False, trend=trend).fit()
    last_model = arp
    last_fit_end = t_end

    # prévision à h mois (valeur à l'horizon)
    fc = arp.predict(start=len(y_tr), end=len(y_tr) + h - 1)
    yhat_h = float(fc.iloc[-1])

    t_fore = t_end + relativedelta(months=h)
    if t_fore in y.index:
        rows.append((t_fore, yhat_h, float(y.loc[t_fore]), int(current_p)))

# ---------- DataFrame unique ----------
if rows:
    df_oos_arp = (
        pd.DataFrame(rows, columns=["date", "y_hat", "y_true", "p_used"])
          .set_index("date").sort_index()
    )
else:
    df_oos_arp = pd.DataFrame(columns=["y_hat", "y_true", "p_used"])
    df_oos_arp.index = pd.to_datetime(pd.Index([]))

print(f"\n✅ Pseudo-OOS terminé — n prévisions = {len(df_oos_arp)}")
print(df_oos_arp.head(3))

# ---------- (facultatif) Scores par période ----------
if len(df_oos_arp):
    df_val  = df_oos_arp.loc["1983-01-01":"1989-12-31"].copy()
    df_test = df_oos_arp.loc["1990-01-01":"2025-08-31"].copy()

    if len(df_val):
        mae  = mean_absolute_error(df_val["y_true"], df_val["y_hat"])
        rmse = np.sqrt(mean_squared_error(df_val["y_true"], df_val["y_hat"]))
        r2   = r2_score(df_val["y_true"], df_val["y_hat"]) if len(df_val) > 1 else np.nan
        print(f"\n📊 Validation 83–89 — n={len(df_val)} | MAE={mae:.3f} | RMSE={rmse:.3f} | R²={r2:.3f}")

    if len(df_test):
        mae  = mean_absolute_error(df_test["y_true"], df_test["y_hat"])
        rmse = np.sqrt(mean_squared_error(df_test["y_true"], df_test["y_hat"]))
        r2   = r2_score(df_test["y_true"], df_test["y_hat"]) if len(df_test) > 1 else np.nan
        print(f"📊 Test 90–2025 — n={len(df_test)} | MAE={mae:.3f} | RMSE={rmse:.3f} | R²={r2:.3f}")

# ---------- Sauvegardes ----------
ARP_LAST_PKL  = "ARP_last_trained_model.pkl"
ARP_LAST_META = "ARP_last_trained_model_meta.csv"
ARP_BUNDLE    = "ARP_h12_oos_bundle.pkl"

# 1) modèle final
if last_model is not None:
    try:
        joblib.dump(last_model, ARP_LAST_PKL)
        print(f"💾 Modèle AR(p) sauvegardé → {ARP_LAST_PKL}")
    except Exception:
        with open(ARP_LAST_PKL, "wb") as f:
            pickle.dump(last_model, f)
        print(f"💾 Modèle AR(p) sauvegardé (pickle) → {ARP_LAST_PKL}")

# 2) bundle des sorties  ✅ correctif: to_timestamp(how="start")
bundle = {
    "oos_predictions": (
        df_oos_arp.reset_index()
                  .rename(columns={"y_hat": "y_pred"})
                  .assign(
                      date=lambda d: pd.to_datetime(d["date"])
                                      .dt.to_period("M")
                                      .dt.to_timestamp(how="start")
                  )
    ),
    "params": {
        "model": "AR(p)",
        "trend": trend,
        "horizon": h,
        "cv_update_every_months": cv_update_every_months,
        "p_grid": list(p_grid),
        "min_train_n": min_train_n,
        "anchor": str(cv_anchor.date())
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "p_used_last": int(current_p if current_p is not None else 1),
        "index_freq": "MS",
        "n_obs_y": int(len(y)),
        "n_forecasts": int(len(df_oos_arp))
    }
}
with open(ARP_BUNDLE, "wb") as f:
    pickle.dump(bundle, f)
print(f"💾 Bundle AR(p) OOS sauvegardé → {ARP_BUNDLE}")

# 3) méta csv
meta_row = {
    "model": "AR(p)",
    "trend": trend,
    "cv_every_months": cv_update_every_months,
    "anchor": cv_anchor.strftime("%Y-%m-%d"),
    "p_used_last": int(current_p if current_p is not None else 1),
    "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
    "n_obs_y": int(len(y)),
    "n_forecasts": int(len(df_oos_arp))
}
pd.DataFrame([meta_row]).to_csv(ARP_LAST_META)
print(f"💾 Méta AR(p) sauvegardée → {ARP_LAST_META}")

[CV] 1983-01-01 → p* = 5
[CV] 1986-01-01 → p* = 4
[CV] 1989-01-01 → p* = 4
[CV] 1992-01-01 → p* = 4
[CV] 1995-01-01 → p* = 4
[CV] 1998-01-01 → p* = 4
[CV] 2001-01-01 → p* = 4
[CV] 2004-01-01 → p* = 4
[CV] 2007-01-01 → p* = 4
[CV] 2010-01-01 → p* = 4
[CV] 2013-01-01 → p* = 4
[CV] 2016-01-01 → p* = 4
[CV] 2019-01-01 → p* = 4
[CV] 2022-01-01 → p* = 4

✅ Pseudo-OOS terminé — n prévisions = 741
               y_hat  y_true  p_used
date                                
1963-12-01 -0.080890     0.0       1
1964-01-01  0.141077    -0.1       1
1964-02-01  0.408114    -0.5       1

📊 Validation 83–89 — n=84 | MAE=0.891 | RMSE=1.248 | R²=-0.995
📊 Test 90–2025 — n=428 | MAE=0.871 | RMSE=1.873 | R²=-0.508
💾 Modèle AR(p) sauvegardé → ARP_last_trained_model.pkl
💾 Bundle AR(p) OOS sauvegardé → ARP_h12_oos_bundle.pkl
💾 Méta AR(p) sauvegardée → ARP_last_trained_model_meta.csv


# AR(1)

In [22]:
# ==========================================
# AR(1) — Pseudo-OOS continu (h=12), p=1 fixe
# réutilise la Series `y` déjà prête (freq=MS)
# ==========================================
import os, pickle, joblib
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------- Paramètres ----------
h = 12
min_train_n = 36          # ≥ 3 ans de données pour démarrer
trend = "c"               # "c" = constante ; "n" = sans constante
p_fixed = 1               # <-- p = 1, constant sur tout l'horizon

# ---------- Sécurisation de la série y ----------
# (S'assure qu'on est bien en MS, float et sans trous fatals)
y = pd.Series(y.astype(float).values, index=pd.to_datetime(y.index)).asfreq("MS").dropna()
print(f"y: {y.index.min().date()} → {y.index.max().date()}  (n={len(y)}) | freq={y.index.freqstr}")

# ---------- Boucle pseudo-OOS continue ----------
rows = []
last_model = None
last_fit_end = None

last_t_end = y.index.max() - relativedelta(months=h)

for t_end in y.index:
    if t_end > last_t_end:
        break

    y_tr = y.loc[:t_end]
    if len(y_tr) < max(min_train_n, p_fixed + 1):
        continue

    # fit AR(1) (p fixe) sur les données disponibles jusqu'à t_end
    ar1 = AutoReg(y_tr, lags=p_fixed, old_names=False, trend=trend).fit()
    last_model = ar1
    last_fit_end = t_end

    # prévision à h mois (prendre la valeur à l'horizon)
    fc = ar1.predict(start=len(y_tr), end=len(y_tr) + h - 1)
    yhat_h = float(fc.iloc[-1])

    t_fore = t_end + relativedelta(months=h)
    if t_fore in y.index:
        rows.append((t_fore, yhat_h, float(y.loc[t_fore])))

# ---------- DataFrame OOS ----------
if rows:
    df_oos_ar1 = (
        pd.DataFrame(rows, columns=["date", "y_hat", "y_true"])
          .set_index("date").sort_index()
    )
else:
    df_oos_ar1 = pd.DataFrame(columns=["y_hat", "y_true"])
    df_oos_ar1.index = pd.to_datetime(pd.Index([]))

print(f"\n✅ Pseudo-OOS terminé — n prévisions = {len(df_oos_ar1)}")
print(df_oos_ar1.head(3))

# ---------- (facultatif) Scores par période ----------
if len(df_oos_ar1):
    df_val  = df_oos_ar1.loc["1983-01-01":"1989-12-31"].copy()
    df_test = df_oos_ar1.loc["1990-01-01":"2025-08-31"].copy()

    if len(df_val):
        mae  = mean_absolute_error(df_val["y_true"], df_val["y_hat"])
        rmse = np.sqrt(mean_squared_error(df_val["y_true"], df_val["y_hat"]))
        r2   = r2_score(df_val["y_true"], df_val["y_hat"]) if len(df_val) > 1 else np.nan
        print(f"\n📊 Validation 83–89 — n={len(df_val)} | MAE={mae:.3f} | RMSE={rmse:.3f} | R²={r2:.3f}")

    if len(df_test):
        mae  = mean_absolute_error(df_test["y_true"], df_test["y_hat"])
        rmse = np.sqrt(mean_squared_error(df_test["y_true"], df_test["y_hat"]))
        r2   = r2_score(df_test["y_true"], df_test["y_hat"]) if len(df_test) > 1 else np.nan
        print(f"📊 Test 90–2025 — n={len(df_test)} | MAE={mae:.3f} | RMSE={rmse:.3f} | R²={r2:.3f}")

# ---------- Sauvegardes ----------
AR1_LAST_PKL  = "AR1_last_trained_model.pkl"
AR1_LAST_META = "AR1_last_trained_model_meta.csv"
AR1_BUNDLE    = "AR1_h12_oos_bundle.pkl"

# 1) modèle final
if last_model is not None:
    try:
        joblib.dump(last_model, AR1_LAST_PKL)
        print(f"💾 Modèle AR(1) sauvegardé → {AR1_LAST_PKL}")
    except Exception:
        with open(AR1_LAST_PKL, "wb") as f:
            pickle.dump(last_model, f)
        print(f"💾 Modèle AR(1) sauvegardé (pickle) → {AR1_LAST_PKL}")

# 2) bundle des sorties (dates normalisées en début de mois)
bundle = {
    "oos_predictions": (
        df_oos_ar1.reset_index()
                  .rename(columns={"y_hat": "y_pred"})
                  .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
    ),
    "params": {
        "model": "AR(1)",
        "trend": trend,
        "horizon": h,
        "lag": 1,
        "min_train_n": min_train_n
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "index_freq": "MS",
        "n_obs_y": int(len(y)),
        "n_forecasts": int(len(df_oos_ar1))
    }
}
with open(AR1_BUNDLE, "wb") as f:
    pickle.dump(bundle, f)
print(f"💾 Bundle AR(1) OOS sauvegardé → {AR1_BUNDLE}")

# 3) méta csv
meta_row = {
    "model": "AR(1)",
    "trend": trend,
    "lag": 1,
    "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
    "n_obs_y": int(len(y)),
    "n_forecasts": int(len(df_oos_ar1))
}
pd.DataFrame([meta_row]).to_csv(AR1_LAST_META)
print(f"💾 Méta AR(1) sauvegardée → {AR1_LAST_META}")

y: 1960-01-01 → 2025-08-01  (n=788) | freq=MS

✅ Pseudo-OOS terminé — n prévisions = 741
               y_hat  y_true
date                        
1963-12-01 -0.080890     0.0
1964-01-01  0.141077    -0.1
1964-02-01  0.408114    -0.5

📊 Validation 83–89 — n=84 | MAE=0.970 | RMSE=1.453 | R²=-1.703
📊 Test 90–2025 — n=428 | MAE=0.887 | RMSE=1.866 | R²=-0.496
💾 Modèle AR(1) sauvegardé → AR1_last_trained_model.pkl
💾 Bundle AR(1) OOS sauvegardé → AR1_h12_oos_bundle.pkl
💾 Méta AR(1) sauvegardée → AR1_last_trained_model_meta.csv


# Test Régression linéaire

In [23]:
# =========================================================
# LinearRegression — Pseudo-OOS continu (h=12) comme AR(1)
# - Initialisation (1960→), Validation (1983–1989), Test (1990–2025)
# - Expanding window mensuelle (refit chaque mois)
# - Winsorisation + normalisation apprises sur TRAIN courant
# - Sauvegardes: bundle + méta
# =========================================================
import os, pickle, joblib
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------- Paramètres ----------
h = 12
min_train_n = 36           # ≥ 3 ans avant de commencer à prévoir
winsor_level = 0.01        # 1er / 99e percentiles
norm_var = True            # normaliser ou non
target_col = "UNRATE"      # cible dans df_stationary

In [24]:
# ---------- Bornes des fenêtres ----------
eval_start = pd.Timestamp("1983-01-01")
eval_end   = pd.Timestamp("1989-12-31")
test_start = pd.Timestamp("1990-01-01")
test_end   = pd.Timestamp("2025-12-31")   # ajuste si besoin (ex: 2025-08-31)

# ---------- Fichiers de sortie ----------
LINREG_PKL  = "linear_regression.pkl"        # bundle (dict) avec oos_predictions, params, etc.
LINREG_META = "linear_regression_meta.csv"   # méta résumé

In [25]:
# ---------- Préparation df_stationary ----------
def _ensure_ms_index(df):
    """Met l'index au début de mois (MS). Si 'date' existe, on l'utilise comme index."""
    if "date" in df.columns:
        df = df.set_index("date")
    idx = pd.to_datetime(df.index)
    df = df.copy()
    df.index = idx.to_period("M").to_timestamp(how="start")
    return df.asfreq("MS")

# On part de df_stationary (toutes données : 1960→2025)
df_all = _ensure_ms_index(df_stationary).sort_index()

# Vérifs rapides
if target_col not in df_all.columns:
    raise ValueError(f"La colonne cible '{target_col}' est absente de df_stationary.")

# X et y
y_all = df_all[target_col].astype(float)
X_all = df_all.drop(columns=[target_col]).astype(float)
features = list(X_all.columns)

print(f"✅ Données prêtes : {df_all.index.min().date()} → {df_all.index.max().date()} | n={len(df_all)} | freq=MS")
print(f"Features ({len(features)}): {features[:6]}{' ...' if len(features)>6 else ''}")

# ---------- Préproc ----------
def fit_preproc(X, wins=0.01, do_norm=True):
    lower = X.quantile(wins)
    upper = X.quantile(1 - wins)
    Xw = X.clip(lower=lower, upper=upper, axis=1)
    if do_norm:
        mean = Xw.mean()
        std  = Xw.std().replace(0, 1)
        Xn   = (Xw - mean) / std
        prep = {"lower": lower, "upper": upper, "mean": mean, "std": std, "norm": True}
        return Xn, prep
    else:
        prep = {"lower": lower, "upper": upper, "mean": None, "std": None, "norm": False}
        return Xw, prep

def apply_preproc(X, prep):
    Xp = X.clip(lower=prep["lower"], upper=prep["upper"], axis=1)
    if prep["norm"]:
        Xp = (Xp - prep["mean"]) / prep["std"].replace(0, 1)
    return Xp

# ---------- Boucle pseudo-OOS (expanding, refit mensuel) ----------
rows = []                 # (date_forecast, y_hat, y_true)
models = []               # modèles entraînés (pour inspection)
preprocs = []             # objets préproc par fit
train_ends = []           # dernière date de train pour chaque fit

last_t_end = y_all.index.max() - relativedelta(months=h)
last_model = None
last_fit_end = None

for t_end in y_all.index:
    if t_end > last_t_end:
        break

    # TRAIN jusqu'à t_end
    y_tr = y_all.loc[:t_end]
    X_tr = X_all.loc[:t_end]

    if len(y_tr) < min_train_n:
        continue

    # Fit préproc sur TRAIN courant
    X_tr_p, prep = fit_preproc(X_tr, wins=winsor_level, do_norm=norm_var)

    # Fit OLS
    model = LinearRegression()
    model.fit(X_tr_p, y_tr.values)

    # Prévision à l'horizon h => on utilise les features à t_fore
    t_fore = t_end + relativedelta(months=h)
    if t_fore in y_all.index:
        x_fore_raw = X_all.loc[[t_fore]]
        x_fore_p = apply_preproc(x_fore_raw, prep)
        yhat_h = float(model.predict(x_fore_p)[0])

        rows.append((t_fore, yhat_h, float(y_all.loc[t_fore])))

    # trace / stockage
    last_model = model
    last_fit_end = t_end
    models.append(model)
    preprocs.append(prep)
    train_ends.append(t_end)

# ---------- DataFrame OOS ----------
if rows:
    df_oos = (pd.DataFrame(rows, columns=["date", "y_pred", "y_true"])
                .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
                .set_index("date").sort_index())
else:
    df_oos = pd.DataFrame(columns=["y_pred", "y_true"])
    df_oos.index = pd.to_datetime(pd.Index([]))

print(f"\n✅ Pseudo-OOS terminé — n prévisions = {len(df_oos)}")
print(df_oos.head(3))

# ---------- Scores Validation & Test ----------
def _scores(df):
    if len(df) == 0:
        return {"MAE": np.nan, "RMSE": np.nan, "R2": np.nan}
    e = df["y_true"] - df["y_pred"]
    mae  = mean_absolute_error(df["y_true"], df["y_pred"])
    rmse = np.sqrt(mean_squared_error(df["y_true"], df["y_pred"]))
    r2   = r2_score(df["y_true"], df["y_pred"]) if len(df) > 1 else np.nan
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}

df_val  = df_oos.loc[eval_start:eval_end].copy()
df_test = df_oos.loc[test_start:test_end].copy()

sc_val  = _scores(df_val)
sc_test = _scores(df_test)

print(f"\n📊 Validation 83–89 — n={len(df_val)} | MAE={sc_val['MAE']:.3f} | RMSE={sc_val['RMSE']:.3f} | R²={sc_val['R2']:.3f}")
print(f"📊 Test 90–2025 — n={len(df_test)} | MAE={sc_test['MAE']:.3f} | RMSE={sc_test['RMSE']:.3f} | R²={sc_test['R2']:.3f}")

# ---------- Sauvegardes ----------
bundle = {
    "oos_predictions": df_oos.reset_index(),     # (date, y_pred, y_true)
    "params": {
        "model": "LinearRegression",
        "horizon": h,
        "min_train_n": min_train_n,
        "winsor_level": winsor_level,
        "norm_var": norm_var,
        "features": features,
        "eval_window": (str(eval_start.date()), str(eval_end.date())),
        "test_window": (str(test_start.date()), str(test_end.date()))
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "index_freq": "MS",
        "n_obs_all": int(len(df_all)),
        "n_forecasts": int(len(df_oos))
    },
    "train_fit_dates": pd.to_datetime(pd.Index(train_ends))  # pour inspection
}

with open(LINREG_PKL, "wb") as f:
    pickle.dump(bundle, f)
pd.DataFrame([{
    "model": "LinearRegression",
    "horizon": h,
    "min_train_n": min_train_n,
    "winsor_level": winsor_level,
    "norm_var": norm_var,
    "trained_until": bundle["meta"]["trained_until"],
    "n_forecasts": bundle["meta"]["n_forecasts"]
}]).to_csv(LINREG_META)

print(f"\n💾 Bundle sauvegardé → {LINREG_PKL}")
print(f"💾 Méta sauvegardée → {LINREG_META}")

✅ Données prêtes : 1960-01-01 → 2025-08-01 | n=788 | freq=MS
Features (10): ['TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P 500', 'BUSLOANS'] ...

✅ Pseudo-OOS terminé — n prévisions = 741
              y_pred  y_true
date                        
1963-12-01 -0.354113     0.0
1964-01-01 -0.282896    -0.1
1964-02-01  1.105841    -0.5

📊 Validation 83–89 — n=84 | MAE=0.810 | RMSE=1.018 | R²=-0.328
📊 Test 90–2025 — n=428 | MAE=0.827 | RMSE=1.460 | R²=0.084

💾 Bundle sauvegardé → linear_regression.pkl
💾 Méta sauvegardée → linear_regression_meta.csv


# Ridge

In [12]:
# ===============================================
# RIDGE — Pseudo-OOS (h=12) + Bagging (à la BoE)
# - Refit annuel, retune alpha tous les 36 mois (dès 1983), scoring=MAE (hv-block CV, gap=12)
# - Winsorisation 1%/99% + normalisation apprises sur TRAIN (et par bootstrap pour chaque modèle)
# - Sauvegardes: bundle OOS + artefact "dernier modèle"
# ===============================================
import os, pickle, json
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [None]:
# ---------- Fichiers ----------

# ---------- Paramètres ----------
h = 12
min_train_n = 36                 # ≥ 3 ans avant de commencer
winsor_level = 0.01
norm_var = True
target_col = "UNRATE"

# Bagging (comme article : 30 modèles bootstrap)
n_boot = 30
bootstrap_proportion = 1.0       # (proportion d'échantillon bootstrap)
seed0 = 12345

# Fenêtres
eval_start = pd.Timestamp("1983-01-01")
eval_end   = pd.Timestamp("1989-12-31")
test_start = pd.Timestamp("1990-01-01")
test_end   = pd.Timestamp("2025-12-31")

# Cadences (article : refit=12 mois, retune=36 mois)
refit_every_months = 12
retune_every_months = 36

In [14]:
# ---------- Préparation df_stationary ----------
def _ensure_ms_index(df: pd.DataFrame) -> pd.DataFrame:
    """Index mensuel (MS). Si 'date' existe, on l'utilise comme index."""
    df = df.copy()
    if "date" in df.columns:
        df = df.set_index("date")
    idx = pd.to_datetime(df.index)
    df.index = idx.to_period("M").to_timestamp(how="start")
    return df.asfreq("MS")

# ⚠️ On suppose que df_stationary est déjà en mémoire (toutes séries transformées)
df_all = _ensure_ms_index(df_stationary).sort_index()

if target_col not in df_all.columns:
    raise ValueError(f"La colonne cible '{target_col}' est absente de df_stationary.")

y_all = df_all[target_col].astype(float)
X_all = df_all.drop(columns=[target_col]).astype(float)
features = list(X_all.columns)

print(f"✅ Données prêtes : {df_all.index.min().date()} → {df_all.index.max().date()} | n={len(df_all)} | freq=MS")
print(f"Features ({len(features)}): {features[:6]}{' ...' if len(features)>6 else ''}")

# ---------- Préproc ----------
def fit_preproc(X: pd.DataFrame, wins=0.01, do_norm=True):
    lower = X.quantile(wins)
    upper = X.quantile(1 - wins)
    Xw = X.clip(lower=lower, upper=upper, axis=1)
    if do_norm:
        mean = Xw.mean()
        std  = Xw.std().replace(0, 1)
        Xn   = (Xw - mean) / std
        prep = {"lower": lower, "upper": upper, "mean": mean, "std": std, "norm": True}
        return Xn, prep
    else:
        prep = {"lower": lower, "upper": upper, "mean": None, "std": None, "norm": False}
        return Xw, prep

def apply_preproc(X: pd.DataFrame, prep: dict):
    Xp = X.clip(lower=prep["lower"], upper=prep["upper"], axis=1)
    if prep["norm"]:
        Xp = (Xp - prep["mean"]) / prep["std"].replace(0, 1)
    return Xp

# ---------- hv-block CV (5 blocs consécutifs, gap=12 autour du test) ----------
class HVBlockCV:
    def __init__(self, n_splits=5, gap=12):
        self.n_splits = n_splits
        self.gap = gap
    def split(self, X, y=None, groups=None):
        n = len(X)
        fold_sizes = np.full(self.n_splits, n // self.n_splits, dtype=int)
        fold_sizes[: n % self.n_splits] += 1
        idx = np.arange(n)
        cur = 0
        for fs in fold_sizes:
            start, stop = cur, cur + fs
            test_idx = idx[start:stop]
            train_mask = np.ones(n, dtype=bool)
            left = max(0, start - self.gap)
            right = min(n, stop + self.gap)
            train_mask[left:right] = False
            train_idx = idx[train_mask]
            cur = stop
            if len(train_idx) == 0 or len(test_idx) == 0:
                continue
            yield train_idx, test_idx
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

# ---------- Grille d'alpha ----------
alpha_grid = np.logspace(-5, 4, 100)

def tune_ridge_alpha(X_tr_p: pd.DataFrame, y_tr: np.ndarray):
    """Tuning d'alpha par hv-block CV (gap=12), scoring=MAE. Retourne (best_alpha, best_mae, best_estimator)."""
    cv = HVBlockCV(n_splits=5, gap=12)
    gs = GridSearchCV(
        Ridge(fit_intercept=True),
        param_grid={"alpha": alpha_grid},
        scoring="neg_mean_absolute_error",
        cv=cv,
        n_jobs=-1,
        refit=True,
        verbose=0
    )
    gs.fit(X_tr_p, y_tr)
    return float(gs.best_params_["alpha"]), float(-gs.best_score_), gs.best_estimator_

# ---------- Bootstrap util ----------
def bootstrap_indices(n, proportion=1.0, seed=None):
    m = int(round(n * proportion))
    rng_local = np.random.default_rng(seed)
    return rng_local.integers(0, n, size=m, endpoint=False)

# ---------- Boucle pseudo-OOS ----------
rows = []                    # (date_forecast, y_pred_mean, y_true, y_pred_std)
train_ends = []              # dates de refit
alpha_history = []           # α utilisé à chaque refit
cv_mae_history = []          # MAE CV à chaque retune

# Artefact "dernier modèle" (ensemble bootstrap du dernier refit)
last_boot_models = []        # liste de dicts: {"prep": prep_dict, "alpha": float, "coef": np.ndarray, "intercept": float}
last_fit_end = None
last_best_alpha = None

last_t_end = y_all.index.max() - relativedelta(months=h)
last_refit_t = None
last_tune_t  = None
best_alpha   = 1.0
boot_seed    = seed0

boot_models = []  # liste courante [(prep_b, mdl_b)]

for t_end in y_all.index:
    if t_end > last_t_end:
        break

    y_tr = y_all.loc[:t_end]
    X_tr = X_all.loc[:t_end]
    n_tr = len(y_tr)
    if n_tr < min_train_n:
        continue

    # cadence refit (tous les 12 mois)
    if last_refit_t is None:
        need_refit = True
    else:
        months_since_refit = (t_end.year - last_refit_t.year)*12 + (t_end.month - last_refit_t.month)
        need_refit = months_since_refit >= refit_every_months

    # cadence retune (≥ 1983 puis tous les 36 mois)
    need_tune = False
    if t_end >= eval_start:
        if last_tune_t is None:
            need_tune = True
        else:
            months_since_tune = (t_end.year - last_tune_t.year)*12 + (t_end.month - last_tune_t.month)
            need_tune = months_since_tune >= retune_every_months

    if need_refit:
        # préproc global (pour tuning)
        X_tr_p_global, _ = fit_preproc(X_tr, wins=winsor_level, do_norm=norm_var)

        # tuning alpha si demandé
        if need_tune:
            best_alpha, best_cv_mae, _ = tune_ridge_alpha(X_tr_p_global, y_tr.values)
            last_tune_t = t_end
            cv_mae_history.append(best_cv_mae)
        else:
            cv_mae_history.append(np.nan)

        # bagging : n_boot modèles bootstrap (préproc appris par bootstrap)
        boot_models = []
        for b in range(n_boot):
            b_seed = boot_seed + b
            ix = bootstrap_indices(n_tr, proportion=bootstrap_proportion, seed=b_seed)
            Xb_raw = X_tr.iloc[ix]
            yb = y_tr.iloc[ix].values

            Xb_p, prep_b = fit_preproc(Xb_raw, wins=winsor_level, do_norm=norm_var)
            mdl_b = Ridge(alpha=best_alpha, fit_intercept=True)
            mdl_b.fit(Xb_p, yb)

            boot_models.append((prep_b, mdl_b))

        # mémoires pour OOS + artefact
        last_refit_t = t_end
        train_ends.append(t_end)
        alpha_history.append(best_alpha)
        boot_seed += 9973

        # artefact "dernier modèle"
        last_boot_models = []
        for prep_b, mdl_b in boot_models:
            last_boot_models.append({
                "prep": {
                    "lower": prep_b["lower"].to_dict(),
                    "upper": prep_b["upper"].to_dict(),
                    "mean":  prep_b["mean"].to_dict() if prep_b["norm"] else None,
                    "std":   prep_b["std"].to_dict() if prep_b["norm"] else None,
                    "norm":  prep_b["norm"]
                },
                "alpha": float(mdl_b.alpha),
                "coef":  mdl_b.coef_.tolist(),
                "intercept": float(mdl_b.intercept_)
            })
        last_fit_end = t_end
        last_best_alpha = best_alpha

    # prévision h=12 si un ensemble est dispo
    if not boot_models:
        continue

    t_fore = t_end + relativedelta(months=h)
    if t_fore in y_all.index:
        x_fore_raw = X_all.loc[[t_fore]]
        preds_b = []
        for prep_b, mdl_b in boot_models:
            x_fore_p = apply_preproc(x_fore_raw, prep_b)
            preds_b.append(float(mdl_b.predict(x_fore_p)[0]))
        yhat_mean = float(np.mean(preds_b))
        yhat_std  = float(np.std(preds_b, ddof=1)) if len(preds_b) > 1 else 0.0
        rows.append((t_fore, yhat_mean, float(y_all.loc[t_fore]), yhat_std))

# ---------- DataFrame OOS ----------
if rows:
    df_oos = (pd.DataFrame(rows, columns=["date","y_pred","y_true","y_pred_std"])
              .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
              .set_index("date").sort_index())
else:
    df_oos = pd.DataFrame(columns=["y_pred","y_true","y_pred_std"])
    df_oos.index = pd.to_datetime(pd.Index([]))

print(f"\n✅ Pseudo-OOS Ridge+Bagging (h=12) terminé — n prévisions = {len(df_oos)}")
print(df_oos.head(3))

# ---------- Scores ----------
def _scores(df):
    if len(df) == 0:
        return {"MAE": np.nan, "RMSE": np.nan, "R2": np.nan}
    mae  = mean_absolute_error(df["y_true"], df["y_pred"])
    rmse = np.sqrt(mean_squared_error(df["y_true"], df["y_pred"]))
    r2   = r2_score(df["y_true"], df["y_pred"]) if len(df) > 1 else np.nan
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}

df_val  = df_oos.loc[eval_start:eval_end].copy()
df_test = df_oos.loc[test_start:test_end].copy()

sc_val  = _scores(df_val)
sc_test = _scores(df_test)

print(f"\n📊 Validation 83–89 — n={len(df_val)} | MAE={sc_val['MAE']:.3f} | RMSE={sc_val['RMSE']:.3f} | R²={sc_val['R2']:.3f}")
print(f"📊 Test 90–2025 — n={len(df_test)} | MAE={sc_test['MAE']:.3f} | RMSE={sc_test['RMSE']:.3f} | R²={sc_test['R2']:.3f}")

# ---------- Sauvegardes ----------
# 1) Bundle OOS
bundle = {
    "oos_predictions": df_oos.reset_index(),
    "params": {
        "model": "Ridge + Bagging",
        "n_boot": n_boot,
        "bootstrap_proportion": bootstrap_proportion,
        "alpha_schedule": "hv-block CV (5 folds, gap=12), MAE, every 36 months since 1983",
        "horizon": h,
        "min_train_n": min_train_n,
        "winsor_level": winsor_level,
        "norm_var": norm_var,
        "features": features,
        "eval_window": (str(eval_start.date()), str(eval_end.date())),
        "test_window": (str(test_start.date()), str(test_end.date())),
        "refit_every_months": refit_every_months,
        "retune_every_months": retune_every_months
    },
    "meta": {
        "trained_until": str(train_ends[-1].date()) if len(train_ends) else None,
        "index_freq": "MS",
        "n_obs_all": int(len(df_all)),
        "n_forecasts": int(len(df_oos)),
        "alpha_history": alpha_history,
        "cv_mae_history": cv_mae_history
    },
    "train_fit_dates": pd.to_datetime(pd.Index(train_ends))
}
with open(RIDGE_BUNDLE, "wb") as f:
    pickle.dump(bundle, f)
pd.DataFrame([{
    "model": "Ridge+Bagging",
    "n_boot": n_boot,
    "horizon": h,
    "min_train_n": min_train_n,
    "winsor_level": winsor_level,
    "norm_var": norm_var,
    "refit_every_months": refit_every_months,
    "retune_every_months": retune_every_months,
    "trained_until": bundle["meta"]["trained_until"],
    "n_forecasts": bundle["meta"]["n_forecasts"]
}]).to_csv(RIDGE_META, index=False)

# 2) Artefact dernier modèle (ensemble bootstrap)
ridge_artifact = {
    "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
    "horizon": h,
    "best_alpha": float(last_best_alpha) if last_best_alpha is not None else None,
    "features": features,
    "n_boot": n_boot,
    "models": last_boot_models  # liste de {prep, alpha, coef, intercept}
}
with open(RIDGE_LAST_PKL, "wb") as f:
    pickle.dump(ridge_artifact, f)
pd.DataFrame([{
    "trained_until": ridge_artifact["trained_until"],
    "best_alpha": ridge_artifact["best_alpha"],
    "n_features": len(features),
    "horizon": h,
    "n_boot": n_boot
}]).to_csv(RIDGE_LAST_META, index=False)

print(f"\n💾 Bundle OOS sauvegardé → {RIDGE_BUNDLE}")
print(f"💾 Méta bundle       → {RIDGE_META}")
print(f"💾 Dernier modèle    → {RIDGE_LAST_PKL}")
print(f"💾 Méta dernier fit  → {RIDGE_LAST_META}")

✅ Données prêtes : 1960-01-01 → 2025-08-01 | n=788 | freq=MS
Features (10): ['TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P 500', 'BUSLOANS'] ...

✅ Pseudo-OOS Ridge+Bagging (h=12) terminé — n prévisions = 741
              y_pred  y_true  y_pred_std
date                                    
1963-12-01 -0.254922     0.0    0.320833
1964-01-01 -0.038495    -0.1    0.357014
1964-02-01  1.302823    -0.5    0.675225

📊 Validation 83–89 — n=84 | MAE=0.802 | RMSE=1.001 | R²=-0.282
📊 Test 90–2025 — n=428 | MAE=0.825 | RMSE=1.471 | R²=0.071

💾 Bundle OOS sauvegardé → ridge_regression.pkl
💾 Méta bundle       → ridge_regression_meta.csv
💾 Dernier modèle    → RIDGE_last_trained_model.pkl
💾 Méta dernier fit  → RIDGE_last_trained_model_meta.csv


# LightGBM

In [6]:
# ===============================================
# LIGHTGBM — Pseudo-OOS (h=12) + Bagging (à la BoE)
# - Refit annuel, retune hyperparams tous les 36 mois (dès 1983), scoring=MAE (hv-block CV, gap=12)
# - Winsorisation 1%/99% + normalisation apprises sur TRAIN (et par bootstrap pour chaque modèle)
# - Sauvegardes: bundle OOS + artefact "dernier modèle"
# ===============================================
import os, pickle, json
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from lightgbm import LGBMRegressor

# ---------- Fichiers ----------
LGBM_BUNDLE     = "lightgbm_regression.pkl"              # bundle OOS
LGBM_META       = "lightgbm_regression_meta.csv"         # méta du bundle
LGBM_LAST_PKL   = "LGBM_last_trained_model.pkl"          # artefact dernier ensemble (liste de (prep, model))
LGBM_LAST_META  = "LGBM_last_trained_model_meta.csv"     # méta dernier fit

# ---------- Paramètres ----------
h = 12
min_train_n = 36                 # ≥ 3 ans avant de commencer
winsor_level = 0.01
norm_var = True
target_col = "UNRATE"

# Bagging (comme article : 30 modèles bootstrap)
n_boot = 30
bootstrap_proportion = 1.0
seed0 = 12345

# Fenêtres
eval_start = pd.Timestamp("1983-01-01")
eval_end   = pd.Timestamp("1989-12-31")
test_start = pd.Timestamp("1990-01-01")
# ajuste selon ton jeu (2019-11-30 si tu veux coller au papier)
test_end   = pd.Timestamp("2025-12-31")

# Cadences
refit_every_months = 12
retune_every_months = 36

# ---------- Préparation df_stationary ----------
def _ensure_ms_index(df: pd.DataFrame) -> pd.DataFrame:
    """Index mensuel (MS). Si 'date' existe, on l'utilise comme index."""
    df = df.copy()
    if "date" in df.columns:
        df = df.set_index("date")
    idx = pd.to_datetime(df.index)
    df.index = idx.to_period("M").to_timestamp(how="start")
    return df.asfreq("MS")

# ⚠️ On suppose que df_stationary est déjà en mémoire (toutes séries transformées)
df_all = _ensure_ms_index(df_stationary).sort_index()

if target_col not in df_all.columns:
    raise ValueError(f"La colonne cible '{target_col}' est absente de df_stationary.")

y_all = df_all[target_col].astype(float)
X_all = df_all.drop(columns=[target_col]).astype(float)
# noms de features sans espaces (LightGBM warning → underscores)
X_all.columns = [str(c).replace(" ", "_") for c in X_all.columns]
features = list(X_all.columns)

print(f"✅ Données prêtes : {df_all.index.min().date()} → {df_all.index.max().date()} | n={len(df_all)} | freq=MS")
print(f"Features ({len(features)}): {features[:6]}{' ...' if len(features)>6 else ''}")

# ---------- Préproc ----------
def fit_preproc(X: pd.DataFrame, wins=0.01, do_norm=True):
    lower = X.quantile(wins)
    upper = X.quantile(1 - wins)
    Xw = X.clip(lower=lower, upper=upper, axis=1)
    if do_norm:
        mean = Xw.mean()
        std  = Xw.std().replace(0, 1)
        Xn   = (Xw - mean) / std
        prep = {"lower": lower, "upper": upper, "mean": mean, "std": std, "norm": True}
        return Xn, prep
    else:
        prep = {"lower": lower, "upper": upper, "mean": None, "std": None, "norm": False}
        return Xw, prep

def apply_preproc(X: pd.DataFrame, prep: dict):
    Xp = X.clip(lower=prep["lower"], upper=prep["upper"], axis=1)
    if prep["norm"]:
        Xp = (Xp - prep["mean"]) / prep["std"].replace(0, 1)
    return Xp

# ---------- hv-block CV (5 blocs consécutifs, gap=12 autour du test) ----------
class HVBlockCV:
    def __init__(self, n_splits=5, gap=12):
        self.n_splits = n_splits
        self.gap = gap
    def split(self, X, y=None, groups=None):
        n = len(X)
        fold_sizes = np.full(self.n_splits, n // self.n_splits, dtype=int)
        fold_sizes[: n % self.n_splits] += 1
        idx = np.arange(n)
        cur = 0
        for fs in fold_sizes:
            start, stop = cur, cur + fs
            test_idx = idx[start:stop]
            train_mask = np.ones(n, dtype=bool)
            left = max(0, start - self.gap)
            right = min(n, stop + self.gap)
            train_mask[left:right] = False
            train_idx = idx[train_mask]
            cur = stop
            if len(train_idx) == 0 or len(test_idx) == 0:
                continue
            yield train_idx, test_idx
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

# ---------- Grille LightGBM (proche repo) ----------
# Ajout de garde-fous pour éviter "no leaves meet split requirements"
param_dist = {
    "subsample":        [0.05,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0],
    "colsample_bytree": [.2,.3,.4,.5,.6,.7,1.0],
    "num_leaves":       [2,3,4,5,8,10,20,40,70,100],
    "n_estimators":     [5,10,20,30,40,50,75,100],  # éviter 1/3 qui dégénèrent souvent
    "max_depth":        [1,2,3,5,8,15,-1],          # -1 = illimité
    "reg_alpha":        [0, .1, 1, 2, 7, 10, 50, 100],
    "reg_lambda":       [0, .1, 1, 10, 20, 50, 100],
    "min_child_samples":[5,10,15],                  # NEW: stabilité
    "min_split_gain":   [0.0, 0.01, 0.05],          # NEW: stabilité
}

def tune_lgbm(X_tr_p: pd.DataFrame, y_tr: np.ndarray, seed=None):
    cv = HVBlockCV(n_splits=5, gap=12)
    base = LGBMRegressor(
        boosting_type="gbdt",
        n_jobs=1,
        random_state=seed,
        objective="regression",
        # paramètres fixes sûrs
        importance_type="gain",
        verbose=-1
    )
    rs = RandomizedSearchCV(
        estimator=base,
        param_distributions=param_dist,
        n_iter=100,
        scoring="neg_mean_absolute_error",
        cv=cv,
        n_jobs=-1,
        random_state=seed,
        refit=True,
        verbose=0
    )
    rs.fit(X_tr_p, y_tr)
    return rs.best_params_, float(-rs.best_score_), rs.best_estimator_

# ---------- Bootstrap util ----------
def bootstrap_indices(n, proportion=1.0, seed=None):
    m = int(round(n * proportion))
    rng_local = np.random.default_rng(seed)
    return rng_local.integers(0, n, size=m, endpoint=False)

def _to_month_start(ts):
    return pd.Timestamp(ts).to_period("M").to_timestamp(how="start") if ts is not None else None

# ---------- Boucle pseudo-OOS ----------
rows = []                    # (date_forecast, y_pred_mean, y_true, y_pred_std)
train_ends = []              # dates de refit
cv_mae_history = []          # MAE CV à chaque retune
best_params_hist = []        # pour info

# Artefact "dernier modèle"
last_boot_models = []        # liste de dicts: {"prep": prep_dict, "params": dict}
last_fit_end = None
last_best_params = None

last_t_end = y_all.index.max() - relativedelta(months=h)
last_refit_t = None
last_tune_t  = None
best_params  = {}            # params LGBM courants
boot_seed    = seed0

boot_models = []  # liste courante [(prep_b, mdl_b)]

for t_end in y_all.index:
    if t_end > last_t_end:
        break

    y_tr = y_all.loc[:t_end]
    X_tr = X_all.loc[:t_end]
    n_tr = len(y_tr)
    if n_tr < min_train_n:
        continue

    # cadence refit (tous les 12 mois)
    if last_refit_t is None:
        need_refit = True
    else:
        months_since_refit = (t_end.year - last_refit_t.year)*12 + (t_end.month - last_refit_t.month)
        need_refit = months_since_refit >= refit_every_months

    # cadence retune (≥ 1983 puis tous les 36 mois)
    need_tune = False
    if t_end >= eval_start:
        if last_tune_t is None:
            need_tune = True
        else:
            months_since_tune = (t_end.year - last_tune_t.year)*12 + (t_end.month - last_tune_t.month)
            need_tune = months_since_tune >= retune_every_months

    if need_refit:
        # préproc global (pour tuning)
        X_tr_p_global, _ = fit_preproc(X_tr, wins=winsor_level, do_norm=norm_var)

        # tuning hyperparams si demandé
        if need_tune:
            best_params, best_cv_mae, _ = tune_lgbm(X_tr_p_global, y_tr.values, seed=seed0)
            last_tune_t = t_end
            cv_mae_history.append(best_cv_mae)
            best_params_hist.append(best_params.copy())
        else:
            cv_mae_history.append(np.nan)
            best_params_hist.append(best_params.copy() if best_params else {})

        # bagging : n_boot modèles bootstrap (préproc appris par bootstrap)
        boot_models = []
        for b in range(n_boot):
            b_seed = boot_seed + b
            ix = bootstrap_indices(n_tr, proportion=bootstrap_proportion, seed=b_seed)
            Xb_raw = X_tr.iloc[ix]
            yb = y_tr.iloc[ix].values

            Xb_p, prep_b = fit_preproc(Xb_raw, wins=winsor_level, do_norm=norm_var)

            mdl_b = LGBMRegressor(
                boosting_type="gbdt",
                n_jobs=1,
                random_state=b_seed,
                objective="regression",
                importance_type="gain",
                verbose=-1,
                **best_params
            )
            mdl_b.fit(Xb_p, yb)
            boot_models.append((prep_b, mdl_b))

        # mémoires pour OOS + artefact
        last_refit_t = t_end
        train_ends.append(t_end)
        boot_seed += 9973

        # artefact "dernier modèle"
        last_boot_models = []
        for prep_b, mdl_b in boot_models:
            last_boot_models.append({
                "prep": {
                    "lower": prep_b["lower"].to_dict(),
                    "upper": prep_b["upper"].to_dict(),
                    "mean":  prep_b["mean"].to_dict() if prep_b["norm"] else None,
                    "std":   prep_b["std"].to_dict() if prep_b["norm"] else None,
                    "norm":  prep_b["norm"]
                },
                "params": mdl_b.get_params()  # hyperparams retenus
            })
        last_fit_end = t_end
        last_best_params = best_params.copy()

    # prévision h=12 si un ensemble est dispo
    if not boot_models:
        continue

    t_fore = t_end + relativedelta(months=h)
    if t_fore in y_all.index:
        x_fore_raw = X_all.loc[[t_fore]]
        preds_b = []
        for prep_b, mdl_b in boot_models:
            x_fore_p = apply_preproc(x_fore_raw, prep_b)
            preds_b.append(float(mdl_b.predict(x_fore_p)[0]))
        yhat_mean = float(np.mean(preds_b))
        yhat_std  = float(np.std(preds_b, ddof=1)) if len(preds_b) > 1 else 0.0
        rows.append((t_fore, yhat_mean, float(y_all.loc[t_fore]), yhat_std))

# ---------- DataFrame OOS ----------
if rows:
    df_oos = (pd.DataFrame(rows, columns=["date","y_pred","y_true","y_pred_std"])
              .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
              .set_index("date").sort_index())
else:
    df_oos = pd.DataFrame(columns=["y_pred","y_true","y_pred_std"])
    df_oos.index = pd.to_datetime(pd.Index([]))

print(f"\n✅ Pseudo-OOS LightGBM+Bagging (h=12) terminé — n prévisions = {len(df_oos)}")
print(df_oos.head(3))

# ---------- Scores ----------
def _scores(df):
    if len(df) == 0:
        return {"MAE": np.nan, "RMSE": np.nan, "R2": np.nan}
    mae  = mean_absolute_error(df["y_true"], df["y_pred"])
    rmse = np.sqrt(mean_squared_error(df["y_true"], df["y_pred"]))
    ssr  = np.sum((df["y_true"]-df["y_pred"])**2)
    sst  = np.sum((df["y_true"]-df["y_true"].mean())**2)
    r2   = 1 - ssr/sst if sst>0 else np.nan
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}

df_val  = df_oos.loc[eval_start:eval_end].copy()
df_test = df_oos.loc[test_start:test_end].copy()

sc_val  = _scores(df_val)
sc_test = _scores(df_test)

print(f"\n📊 Validation 83–89 — n={len(df_val)} | MAE={sc_val['MAE']:.3f} | RMSE={sc_val['RMSE']:.3f} | R²={sc_val['R2']:.3f}")
print(f"📊 Test 90–{test_end.year} — n={len(df_test)} | MAE={sc_test['MAE']:.3f} | RMSE={sc_test['RMSE']:.3f} | R²={sc_test['R2']:.3f}")

# ---------- Sauvegardes ----------
bundle = {
    "oos_predictions": df_oos.reset_index(),
    "params": {
        "model": "LightGBM + Bagging",
        "n_boot": n_boot,
        "bootstrap_proportion": bootstrap_proportion,
        "hyper_search": "hv-block CV (5 folds, gap=12), MAE, every 36 months since 1983, RandomizedSearch 100 iters",
        "horizon": h,
        "min_train_n": min_train_n,
        "winsor_level": winsor_level,
        "norm_var": norm_var,
        "features": features,
        "eval_window": (str(eval_start.date()), str(eval_end.date())),
        "test_window": (str(test_start.date()), str(test_end.date())),
        "refit_every_months": refit_every_months,
        "retune_every_months": retune_every_months,
        "best_params_last": last_best_params
    },
    "meta": {
        "trained_until": str(_to_month_start(last_refit_t)) if last_refit_t is not None else None,
        "last_tune_time": str(_to_month_start(last_tune_t)) if last_tune_t is not None else None,
        "index_freq": "MS",
        "n_obs_all": int(len(df_all)),
        "n_forecasts": int(len(df_oos)),
        "cv_mae_history": cv_mae_history,
        "best_params_history": best_params_hist
    },
    "train_fit_dates": pd.to_datetime(pd.Index(train_ends))
}
with open(LGBM_BUNDLE, "wb") as f:
    pickle.dump(bundle, f)

pd.DataFrame([{
    "model": "LightGBM+Bagging",
    "horizon": h,
    "min_train_n": min_train_n,
    "winsor_level": winsor_level,
    "norm_var": norm_var,
    "refit_every_months": refit_every_months,
    "retune_every_months": retune_every_months,
    "n_boot": n_boot,
    "bootstrap_proportion": bootstrap_proportion,
    "trained_until": bundle["meta"]["trained_until"],
    "n_forecasts": bundle["meta"]["n_forecasts"]
}]).to_csv(LGBM_META, index=False)

# Artefact dernier modèle (ensemble bootstrap)
lgbm_artifact = {
    "trained_until": str(_to_month_start(last_fit_end)) if last_fit_end is not None else None,
    "horizon": h,
    "features": features,
    "n_boot": n_boot,
    "best_params": last_best_params,
    "models": last_boot_models  # liste de {prep, params}
}
with open(LGBM_LAST_PKL, "wb") as f:
    pickle.dump(lgbm_artifact, f)

pd.DataFrame([{
    "trained_until": lgbm_artifact["trained_until"],
    "n_features": len(features),
    "horizon": h,
    "n_boot": n_boot
}]).to_csv(LGBM_LAST_META, index=False)

print(f"\n💾 Bundle OOS sauvegardé → {LGBM_BUNDLE}")
print(f"💾 Méta bundle       → {LGBM_META}")
print(f"💾 Dernier modèle    → {LGBM_LAST_PKL}")
print(f"💾 Méta dernier fit  → {LGBM_LAST_META}")

✅ Données prêtes : 1960-01-01 → 2025-08-01 | n=788 | freq=MS
Features (10): ['TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P_500', 'BUSLOANS'] ...

✅ Pseudo-OOS LightGBM+Bagging (h=12) terminé — n prévisions = 741
              y_pred  y_true  y_pred_std
date                                    
1963-12-01  0.097778     0.0     0.20658
1964-01-01  0.097778    -0.1     0.20658
1964-02-01  0.097778    -0.5     0.20658

📊 Validation 83–89 — n=84 | MAE=0.798 | RMSE=1.021 | R²=-0.336
📊 Test 90–2025 — n=428 | MAE=0.782 | RMSE=1.385 | R²=0.176

💾 Bundle OOS sauvegardé → lightgbm_regression.pkl
💾 Méta bundle       → lightgbm_regression_meta.csv
💾 Dernier modèle    → LGBM_last_trained_model.pkl
💾 Méta dernier fit  → LGBM_last_trained_model_meta.csv
