# Package

In [16]:
import pandas as pd
import os, pickle, joblib
import numpy as np
from dateutil.relativedelta import relativedelta
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression

# Importation des donn√©es

In [17]:
df_stationary = pd.read_csv("df_stationary.csv", index_col="date")
df_stationary_unrate = df_stationary["UNRATE"]
y = df_stationary_unrate.copy()

In [18]:
df_stationary

Unnamed: 0_level_0,UNRATE,TB3MS,RPI,INDPRO,DPCERA3M086SBEA,S&P500,BUSLOANS,CPIAUCSL,OILPRICEx,M2SL,USREC
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1960-01-31,-0.8,0.30,0.020977,0.091980,0.001204,0.017909,0.011578,-0.006156,0.000000,0.001323,0
1960-02-29,-1.1,-0.19,0.014565,0.076964,0.006009,-0.025663,0.011905,-0.003767,0.000000,0.002007,0
1960-03-31,-0.2,-1.18,0.006250,0.007961,0.021240,-0.070857,-0.008356,-0.005455,0.000000,0.001324,0
1960-04-30,0.0,-1.12,0.006489,-0.025915,0.033752,-0.040442,-0.009098,0.005090,0.000000,0.000634,1
1960-05-31,0.0,-0.67,0.007747,-0.018121,0.009040,-0.010090,-0.000359,0.003383,0.000000,0.003977,1
...,...,...,...,...,...,...,...,...,...,...,...
2025-04-30,0.3,0.00,0.015108,0.007185,0.007959,-0.107606,0.046085,-0.007236,-0.226416,0.004839,0
2025-05-31,0.2,0.03,0.008190,-0.002058,0.007758,-0.038448,0.044487,-0.007941,-0.162581,0.006395,0
2025-06-30,0.0,0.03,0.001403,0.006457,0.002578,0.059087,0.035681,-0.000435,0.026151,0.006741,0
2025-07-31,0.0,0.04,-0.003535,0.001899,0.005014,0.159259,-0.000974,0.001775,0.249194,0.000564,0


# Pr√©paration des donn√©es

In [19]:
# V√©rifie que l‚Äôindex est bien une date (sinon essaie de le convertir)
if not isinstance(y.index, (pd.DatetimeIndex, pd.PeriodIndex)):
    y.index = pd.to_datetime(y.index, errors="coerce")

# am√©nager la fr√©quence mensuelle (d√©but de mois)
y.index = y.index.to_period("M").to_timestamp(how="start")
y = y.sort_index().asfreq("MS").astype(float).dropna()

print(f"‚úÖ S√©rie pr√™te : {y.index.min().date()} ‚Üí {y.index.max().date()} | n={len(y)} | freq={y.index.freqstr}")

‚úÖ S√©rie pr√™te : 1960-01-01 ‚Üí 2025-08-01 | n=788 | freq=MS


# 1- Mod√®le AutoRegression 1

In [39]:
# ==========================================
# AR(1) ‚Äî Pseudo-OOS continu (h=12), p=1 fixe + BAGGING (bootstrap en blocs)
# ==========================================
# ---------- Param√®tres ----------
h = 12
min_train_n = 36
trend = "c"
p_fixed = 1

# ---------- Nouveaux param√®tres bagging ----------
use_bagging = True      # ‚Üê interrupteur ON/OFF
B_boot = 30            # nb de r√©-√©chantillonnages
L_block = 12            # taille de bloc (mois) pour moving-block bootstrap
rng = np.random.default_rng(123)  # seed bootstrap

In [40]:
# ---------- Utilitaires bootstrap ----------
def moving_block_bootstrap(arr, L, rng):
    """Concat√®ne des blocs contigus de taille L tir√©s al√©atoirement jusqu'√† longueur n."""
    n = len(arr)
    if L <= 0 or L > n:
        raise ValueError("L_block invalide")
    nb = int(np.ceil(n / L))
    starts = rng.integers(0, n - L + 1, size=nb)
    out = np.concatenate([arr[s:s+L] for s in starts])[:n]
    return out

def bagged_h_forecast_AR1(y_tr, h, trend, B, L, rng):
    """
    Pr√©vision √† horizon h par bagging (residual moving-block bootstrap) pour AR(1).
    Retourne (yhat_mean, yhat_dist, base_pred)
    """
    base_model = AutoReg(y_tr, lags=1, old_names=False, trend=trend).fit()
    base_fc = base_model.predict(start=len(y_tr), end=len(y_tr) + h - 1)
    base_pred = float(base_fc.iloc[-1])

    resid = base_model.resid.values
    fitted = (y_tr.iloc[-len(resid):].values - resid)  # yÃÇ_t align√© aux r√©sidus

    boot_preds = []
    for _ in range(B):
        res_b = moving_block_bootstrap(resid, L, rng)   # bootstrap des r√©sidus
        y_b = fitted + res_b                             # s√©rie bootstrap√©e
        m_b = AutoReg(pd.Series(y_b, index=y_tr.index[-len(y_b):]),
                      lags=1, old_names=False, trend=trend).fit()
        fc_b = m_b.predict(start=len(y_tr), end=len(y_tr) + h - 1)
        boot_preds.append(float(fc_b.iloc[-1]))
    return float(np.mean(boot_preds)), np.array(boot_preds), base_pred

In [41]:
# ---------- S√©curisation de la s√©rie y ----------
y = pd.Series(y.astype(float).values, index=pd.to_datetime(y.index)).asfreq("MS").dropna()
print(f"y: {y.index.min().date()} ‚Üí {y.index.max().date()}  (n={len(y)}) | freq={y.index.freqstr}")

y: 1960-01-01 ‚Üí 2025-08-01  (n=788) | freq=MS


In [42]:
# ---------- Boucle pseudo-OOS continue ----------
rows = []
last_model = None
last_fit_end = None

last_t_end = y.index.max() - relativedelta(months=h)

for t_end in y.index:
    if t_end > last_t_end:
        break

    y_tr = y.loc[:t_end]
    if len(y_tr) < max(min_train_n, p_fixed + 1):
        continue

    # fit AR(1) base (utile pour sauvegarde / comparaison)
    ar1 = AutoReg(y_tr, lags=p_fixed, old_names=False, trend=trend).fit()
    last_model = ar1
    last_fit_end = t_end

    # ----- Pr√©vision √† h mois (bagging ou base) -----
    if use_bagging:
        yhat_h, yhat_dist, yhat_h_base = bagged_h_forecast_AR1(
            y_tr=y_tr, h=h, trend=trend, B=B_boot, L=L_block, rng=rng
        )
        yhat_p05 = float(np.percentile(yhat_dist, 5))
        yhat_p95 = float(np.percentile(yhat_dist, 95))
    else:
        fc = ar1.predict(start=len(y_tr), end=len(y_tr) + h - 1)
        yhat_h = float(fc.iloc[-1])
        yhat_h_base = yhat_h
        yhat_p05 = np.nan
        yhat_p95 = np.nan

    t_fore = t_end + relativedelta(months=h)
    if t_fore in y.index:
        rows.append((t_fore, yhat_h, float(y.loc[t_fore]), yhat_p05, yhat_p95, yhat_h_base))

In [43]:
# ---------- DataFrame OOS ----------
if rows:
    df_oos_ar1 = (
        pd.DataFrame(rows, columns=["date", "y_hat", "y_true", "y_hat_p05", "y_hat_p95", "y_hat_base"])
          .set_index("date").sort_index()
    )
else:
    df_oos_ar1 = pd.DataFrame(columns=["y_hat", "y_true", "y_hat_p05", "y_hat_p95", "y_hat_base"])
    df_oos_ar1.index = pd.to_datetime(pd.Index([]))

print(f"\n‚úÖ Pseudo-OOS termin√© ‚Äî n pr√©visions = {len(df_oos_ar1)}")
print(df_oos_ar1.head(3))


‚úÖ Pseudo-OOS termin√© ‚Äî n pr√©visions = 741
               y_hat  y_true  y_hat_p05  y_hat_p95  y_hat_base
date                                                          
1963-12-01  0.070473     0.0  -0.149798   0.286016   -0.080890
1964-01-01  0.017682    -0.1  -0.194683   0.225812    0.141077
1964-02-01  0.090722    -0.5  -0.050079   0.265508    0.408114


In [44]:
# ---------- (facultatif) Scores par p√©riode ----------
if len(df_oos_ar1):
    df_val  = df_oos_ar1.loc["1983-01-01":"1989-12-31"].copy()
    df_test = df_oos_ar1.loc["1990-01-01":"2025-08-31"].copy()

    if len(df_val):
        mae  = mean_absolute_error(df_val["y_true"], df_val["y_hat"])
        rmse = np.sqrt(mean_squared_error(df_val["y_true"], df_val["y_hat"]))
        r2   = r2_score(df_val["y_true"], df_val["y_hat"]) if len(df_val) > 1 else np.nan
        print(f"\nüìä Validation 83‚Äì89 ‚Äî n={len(df_val)} | MAE={mae:.3f} | RMSE={rmse:.3f} | R¬≤={r2:.3f}")

    if len(df_test):
        mae  = mean_absolute_error(df_test["y_true"], df_test["y_hat"])
        rmse = np.sqrt(mean_squared_error(df_test["y_true"], df_test["y_hat"]))
        r2   = r2_score(df_test["y_true"], df_test["y_hat"]) if len(df_test) > 1 else np.nan
        print(f"üìä Test 90‚Äì2025 ‚Äî n={len(df_test)} | MAE={mae:.3f} | RMSE={rmse:.3f} | R¬≤={r2:.3f}")


üìä Validation 83‚Äì89 ‚Äî n=84 | MAE=0.817 | RMSE=1.234 | R¬≤=-0.949
üìä Test 90‚Äì2025 ‚Äî n=428 | MAE=0.867 | RMSE=1.600 | R¬≤=-0.100


In [45]:
# ---------- Sauvegardes ----------
AR1_LAST_PKL  = "AR1_last_trained_model.pkl"
AR1_LAST_META = "AR1_last_trained_model_meta.csv"
AR1_BUNDLE    = "AR1_h12_oos_bundle.pkl"

In [46]:
# 1) mod√®le final
if last_model is not None:
    try:
        joblib.dump(last_model, AR1_LAST_PKL)
        print(f"üíæ Mod√®le AR(1) sauvegard√© ‚Üí {AR1_LAST_PKL}")
    except Exception:
        with open(AR1_LAST_PKL, "wb") as f:
            pickle.dump(last_model, f)
        print(f"üíæ Mod√®le AR(1) sauvegard√© (pickle) ‚Üí {AR1_LAST_PKL}")

# 2) bundle des sorties
bundle = {
    "oos_predictions": (
        df_oos_ar1.reset_index()
                  .rename(columns={"y_hat": "y_pred"})
                  .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
    ),
    "params": {
        "model": "AR(1)",
        "trend": trend,
        "horizon": h,
        "lag": 1,
        "min_train_n": min_train_n,
        # ---- nouveaux champs ----
        "use_bagging": bool(use_bagging),
        "B_boot": int(B_boot),
        "L_block": int(L_block)
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "index_freq": "MS",
        "n_obs_y": int(len(y)),
        "n_forecasts": int(len(df_oos_ar1))
    }
}
with open(AR1_BUNDLE, "wb") as f:
    pickle.dump(bundle, f)
print(f"üíæ Bundle AR(1) OOS sauvegard√© ‚Üí {AR1_BUNDLE}")

# 3) m√©ta csv
meta_row = {
    "model": "AR(1)",
    "trend": trend,
    "lag": 1,
    "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
    "n_obs_y": int(len(y)),
    "n_forecasts": int(len(df_oos_ar1))
}
pd.DataFrame([meta_row]).to_csv(AR1_LAST_META)
print(f"üíæ M√©ta AR(1) sauvegard√©e ‚Üí {AR1_LAST_META}")

üíæ Mod√®le AR(1) sauvegard√© ‚Üí AR1_last_trained_model.pkl
üíæ Bundle AR(1) OOS sauvegard√© ‚Üí AR1_h12_oos_bundle.pkl
üíæ M√©ta AR(1) sauvegard√©e ‚Üí AR1_last_trained_model_meta.csv


# 2. Autoregression en choisissant automatiquement l'ordre de p

In [47]:
# ---------- Param√®tres ----------
h = 12
min_train_n = 36          # ‚â• 3 ans
trend = "c"               # "c" (constante) ou "n" (sans constante)
p_grid = range(1, 13)     # p ‚àà {1,‚Ä¶,12}

cv_update_every_months = 36
cv_anchor = pd.Timestamp("1983-01-01")

# Bagging (comme les auteurs)
use_bagging = True
B_boot = 30               # n_boot ‚âà 30
L_block = 12              # blocs de 12 mois (annuels)
rng = np.random.default_rng(123)  # seed bootstrap

In [48]:
# ---------- Utils ----------
def months_since(anchor, t):
    return (t.year - anchor.year) * 12 + (t.month - anchor.month)

def moving_block_bootstrap(arr, L, rng):
    """Concat√®ne des blocs contigus de taille L tir√©s al√©atoirement jusqu'√† n."""
    n = len(arr)
    L = max(2, min(int(L), n-1))
    nb = int(np.ceil(n / L))
    starts = rng.integers(0, n - L + 1, size=nb)
    out = np.concatenate([arr[s:s+L] for s in starts])[:n]
    return out

def rolling_mae_for_p(y_series, p, h, min_train, trend):
    """MAE rolling √† l'horizon h pour un p donn√© (sur y_series, en respectant l'ordre temporel)."""
    rows = []
    last_t_end = y_series.index.max() - relativedelta(months=h)
    for t_end in y_series.index:
        if t_end > last_t_end:
            break
        y_tr = y_series.loc[:t_end]
        if len(y_tr) < max(min_train, p + 1):
            continue
        model = AutoReg(y_tr, lags=p, old_names=False, trend=trend).fit()
        fc = model.predict(start=len(y_tr), end=len(y_tr) + h - 1)
        yhat_h = float(fc.iloc[-1])
        t_fore = t_end + relativedelta(months=h)
        if t_fore in y_series.index:
            rows.append((t_fore, yhat_h, float(y_series.loc[t_fore])))
    if not rows:
        return np.inf
    tmp = pd.DataFrame(rows, columns=["date", "y_hat", "y_true"]).set_index("date")
    return float(mean_absolute_error(tmp["y_true"], tmp["y_hat"]))

def select_p_by_cv(y_tr, p_grid, h, min_train, trend):
    """S√©lectionne p* minimisant le MAE(h) rolling sur l'√©chantillon d'entra√Ænement courant."""
    best_p, best_score = None, np.inf
    for p in p_grid:
        score = rolling_mae_for_p(y_tr, p, h, min_train, trend)
        if score < best_score:
            best_score, best_p = score, p
    return int(best_p if best_p is not None else 1)

def bagged_h_forecast_ARp(y_tr, p, h, trend, B, L, rng):
    """
    Pr√©vision √† horizon h via bagging (residual moving-block bootstrap) pour AR(p).
    Retourne (yhat_mean, yhat_dist, base_pred).
    """
    base = AutoReg(y_tr, lags=p, old_names=False, trend=trend).fit()
    base_fc = base.predict(start=len(y_tr), end=len(y_tr)+h-1)
    base_pred = float(base_fc.iloc[-1])

    resid = base.resid.values
    fitted = (y_tr.iloc[-len(resid):].values - resid)  # yÃÇ_t align√©

    preds = []
    for _ in range(B):
        res_b = moving_block_bootstrap(resid, L, rng)
        y_b = fitted + res_b
        m_b = AutoReg(pd.Series(y_b, index=y_tr.index[-len(y_b):]),
                      lags=p, old_names=False, trend=trend).fit()
        fc_b = m_b.predict(start=len(y_tr), end=len(y_tr)+h-1)
        preds.append(float(fc_b.iloc[-1]))
    return float(np.mean(preds)), np.array(preds), base_pred

In [49]:
# ---------- Boucle pseudo-OOS ----------
rows = []
last_model = None
last_fit_end = None
current_p = None

last_t_end = y.index.max() - relativedelta(months=h)

for t_end in y.index:
    if t_end > last_t_end:
        break

    y_tr = y.loc[:t_end]
    if len(y_tr) < min_train_n:
        continue

    # Re-CV √† partir de 1983-01 tous les 36 mois
    if t_end >= cv_anchor:
        m = months_since(cv_anchor, t_end)
        need_cv = (m % cv_update_every_months == 0)
    else:
        need_cv = False

    if current_p is None and not need_cv:
        current_p = 1  # valeur initiale avant la premi√®re CV

    if need_cv:
        current_p = select_p_by_cv(y_tr, p_grid, h, min_train_n, trend)
        print(f"[CV] {t_end.date()} ‚Üí p* = {current_p}")

    # Fit de r√©f√©rence (utile pour meta/sauvegarde)
    arp = AutoReg(y_tr, lags=current_p, old_names=False, trend=trend).fit()
    last_model = arp
    last_fit_end = t_end

    # Pr√©vision √† h mois
    if use_bagging:
        # (Option) reseed par mois pour reproductibilit√© run-to-run :
        # rng = np.random.default_rng(int(t_end.strftime("%Y%m")))
        yhat_h, yhat_dist, yhat_base = bagged_h_forecast_ARp(
            y_tr=y_tr, p=current_p, h=h, trend=trend,
            B=B_boot, L=L_block, rng=rng
        )
        yhat_p05 = float(np.percentile(yhat_dist, 5))
        yhat_p95 = float(np.percentile(yhat_dist, 95))
    else:
        fc = arp.predict(start=len(y_tr), end=len(y_tr) + h - 1)
        yhat_h = float(fc.iloc[-1])
        yhat_base = yhat_h
        yhat_p05 = np.nan
        yhat_p95 = np.nan

    t_fore = t_end + relativedelta(months=h)
    if t_fore in y.index:
        rows.append((t_fore, yhat_h, float(y.loc[t_fore]),
                     int(current_p), yhat_p05, yhat_p95, yhat_base))

[CV] 1983-01-01 ‚Üí p* = 5
[CV] 1986-01-01 ‚Üí p* = 4
[CV] 1989-01-01 ‚Üí p* = 4
[CV] 1992-01-01 ‚Üí p* = 4
[CV] 1995-01-01 ‚Üí p* = 4
[CV] 1998-01-01 ‚Üí p* = 4
[CV] 2001-01-01 ‚Üí p* = 4
[CV] 2004-01-01 ‚Üí p* = 4
[CV] 2007-01-01 ‚Üí p* = 4
[CV] 2010-01-01 ‚Üí p* = 4
[CV] 2013-01-01 ‚Üí p* = 4
[CV] 2016-01-01 ‚Üí p* = 4
[CV] 2019-01-01 ‚Üí p* = 4
[CV] 2022-01-01 ‚Üí p* = 4


In [50]:
# ---------- R√©sultats ----------
if rows:
    df_oos_arp = (
        pd.DataFrame(rows, columns=["date","y_hat","y_true","p_used","y_hat_p05","y_hat_p95","y_hat_base"])
          .set_index("date").sort_index()
    )
else:
    df_oos_arp = pd.DataFrame(columns=["y_hat","y_true","p_used","y_hat_p05","y_hat_p95","y_hat_base"])
    df_oos_arp.index = pd.to_datetime(pd.Index([]))

print(f"\n‚úÖ Pseudo-OOS termin√© ‚Äî n pr√©visions = {len(df_oos_arp)}")
print(df_oos_arp.head(3))

# ---------- Scores par p√©riode ----------
if len(df_oos_arp):
    df_val  = df_oos_arp.loc["1983-01-01":"1989-12-31"].copy()
    df_test = df_oos_arp.loc["1990-01-01":"2025-08-31"].copy()

    if len(df_val):
        mae  = mean_absolute_error(df_val["y_true"], df_val["y_hat"])
        rmse = np.sqrt(mean_squared_error(df_val["y_true"], df_val["y_hat"]))
        r2   = r2_score(df_val["y_true"], df_val["y_hat"]) if len(df_val) > 1 else np.nan
        print(f"\nüìä Validation 83‚Äì89 ‚Äî n={len(df_val)} | MAE={mae:.3f} | RMSE={rmse:.3f} | R¬≤={r2:.3f}")

    if len(df_test):
        mae  = mean_absolute_error(df_test["y_true"], df_test["y_hat"])
        rmse = np.sqrt(mean_squared_error(df_test["y_true"], df_test["y_hat"]))
        r2   = r2_score(df_test["y_true"], df_test["y_hat"]) if len(df_test) > 1 else np.nan
        print(f"üìä Test 90‚Äì2025 ‚Äî n={len(df_test)} | MAE={mae:.3f} | RMSE={rmse:.3f} | R¬≤={r2:.3f}")


‚úÖ Pseudo-OOS termin√© ‚Äî n pr√©visions = 741
               y_hat  y_true  p_used  y_hat_p05  y_hat_p95  y_hat_base
date                                                                  
1963-12-01  0.070473     0.0       1  -0.149798   0.286016   -0.080890
1964-01-01  0.017682    -0.1       1  -0.194683   0.225812    0.141077
1964-02-01  0.090722    -0.5       1  -0.050079   0.265508    0.408114

üìä Validation 83‚Äì89 ‚Äî n=84 | MAE=0.819 | RMSE=1.187 | R¬≤=-0.805
üìä Test 90‚Äì2025 ‚Äî n=428 | MAE=0.865 | RMSE=1.644 | R¬≤=-0.161


In [51]:
# ==========================================
# Sauvegardes ‚Äî AR(p) bagging (h=12)
# ==========================================
ARP_LAST_PKL  = "ARp_last_trained_model.pkl"
ARP_LAST_META = "ARp_last_trained_model_meta.csv"
ARP_BUNDLE    = "ARp_h12_oos_bundle.pkl"

# 1Ô∏è‚É£ Sauvegarde du mod√®le final (le dernier AR(p) entra√Æn√©)
if last_model is not None:
    try:
        joblib.dump(last_model, ARP_LAST_PKL)
        print(f"üíæ Mod√®le AR(p) sauvegard√© ‚Üí {ARP_LAST_PKL}")
    except Exception:
        with open(ARP_LAST_PKL, "wb") as f:
            pickle.dump(last_model, f)
        print(f"üíæ Mod√®le AR(p) sauvegard√© (pickle) ‚Üí {ARP_LAST_PKL}")

# 2Ô∏è‚É£ Sauvegarde du bundle complet : pr√©visions + param√®tres + m√©tadonn√©es
bundle = {
    "oos_predictions": (
        df_oos_arp.reset_index()
                  .rename(columns={"y_hat": "y_pred"})
                  .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
    ),
    "params": {
        "model": "AR(p)",
        "trend": trend,
        "horizon": h,
        "p_grid": list(p_grid),
        "min_train_n": min_train_n,
        "cv_update_every_months": cv_update_every_months,
        "cv_anchor": str(cv_anchor.date()),
        # ---- param√®tres de bagging ----
        "use_bagging": bool(use_bagging),
        "B_boot": int(B_boot),
        "L_block": int(L_block)
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "index_freq": "MS",
        "n_obs_y": int(len(y)),
        "n_forecasts": int(len(df_oos_arp)),
        "mean_p_used": float(df_oos_arp["p_used"].mean()) if "p_used" in df_oos_arp else np.nan
    }
}

with open(ARP_BUNDLE, "wb") as f:
    pickle.dump(bundle, f)
print(f"üíæ Bundle AR(p) OOS sauvegard√© ‚Üí {ARP_BUNDLE}")

# 3Ô∏è‚É£ Sauvegarde d‚Äôun petit r√©sum√© m√©ta au format CSV
meta_row = {
    "model": "AR(p)",
    "trend": trend,
    "horizon": h,
    "cv_anchor": str(cv_anchor.date()),
    "cv_update_months": cv_update_every_months,
    "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
    "n_obs_y": int(len(y)),
    "n_forecasts": int(len(df_oos_arp)),
    "mean_p_used": float(df_oos_arp["p_used"].mean()) if "p_used" in df_oos_arp else np.nan
}

pd.DataFrame([meta_row]).to_csv(ARP_LAST_META, index=False)
print(f"üíæ M√©ta AR(p) sauvegard√©e ‚Üí {ARP_LAST_META}")

üíæ Mod√®le AR(p) sauvegard√© ‚Üí ARp_last_trained_model.pkl
üíæ Bundle AR(p) OOS sauvegard√© ‚Üí ARp_h12_oos_bundle.pkl
üíæ M√©ta AR(p) sauvegard√©e ‚Üí ARp_last_trained_model_meta.csv


# 3. R√©gression lin√©aire

In [20]:
# ---------- Param√®tres g√©n√©raux ----------
h = 12
min_train_n = 36           # ‚â• 3 ans avant de commencer √† pr√©voir
winsor_level = 0.01        # winsorisation (1er/99e percentiles)
norm_var = True            # normaliser ou non
target_col = "UNRATE"      # cible dans df_stationary

# Fen√™tres d'√©valuation / test
eval_start = pd.Timestamp("1983-01-01")
eval_end   = pd.Timestamp("1989-12-31")
test_start = pd.Timestamp("1990-01-01")
test_end   = pd.Timestamp("2025-12-31")   # ajuste si besoin

# ---------- Bagging (bootstrap en blocs) ----------
use_bagging = True
B_boot = 30               # comme les auteurs
L_block = 12              # blocs annuels (12 mois)
rng = np.random.default_rng(123)  # seed bootstrap

# ---------- Fichiers de sortie ----------
LINREG_PKL  = "linear_regression.pkl"        # bundle (dict)
LINREG_META = "linear_regression_meta.csv"   # m√©ta r√©sum√©

In [21]:
# ---------- Pr√©paration df_stationary ----------
def _ensure_ms_index(df):
    """Force un index DatetimeIndex en d√©but de mois (MS)."""
    if "date" in df.columns:
        df = df.set_index("date")
    idx = pd.to_datetime(df.index)
    df = df.copy()
    df.index = idx.to_period("M").to_timestamp(how="start")
    return df.asfreq("MS")

# On part de df_stationary (toutes donn√©es : 1960‚Üí2025), d√©j√† charg√© en m√©moire
df_all = _ensure_ms_index(df_stationary).sort_index()

if target_col not in df_all.columns:
    raise ValueError(f"La colonne cible '{target_col}' est absente de df_stationary.")

y_all = df_all[target_col].astype(float)
X_all = df_all.drop(columns=[target_col]).astype(float)
features = list(X_all.columns)

print(f"‚úÖ Donn√©es pr√™tes : {df_all.index.min().date()} ‚Üí {df_all.index.max().date()} | n={len(df_all)} | freq=MS")
print(f"Features ({len(features)}): {features[:6]}{' ...' if len(features)>6 else ''}")

‚úÖ Donn√©es pr√™tes : 1960-01-01 ‚Üí 2025-08-01 | n=788 | freq=MS
Features (10): ['TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P500', 'BUSLOANS'] ...


In [22]:
# ---------- Pr√©proc ----------
def fit_preproc(X, wins=0.01, do_norm=True):
    """Apprend winsor + normalisation sur TRAIN et renvoie (X_trans, prep)."""
    lower = X.quantile(wins)
    upper = X.quantile(1 - wins)
    Xw = X.clip(lower=lower, upper=upper, axis=1)
    if do_norm:
        mean = Xw.mean()
        std  = Xw.std().replace(0, 1)
        Xn   = (Xw - mean) / std
        prep = {"lower": lower, "upper": upper, "mean": mean, "std": std, "norm": True}
        return Xn, prep
    else:
        prep = {"lower": lower, "upper": upper, "mean": None, "std": None, "norm": False}
        return Xw, prep

def apply_preproc(X, prep):
    """Applique le pr√©proc appris (pas de fuite)."""
    Xp = X.clip(lower=prep["lower"], upper=prep["upper"], axis=1)
    if prep["norm"]:
        Xp = (Xp - prep["mean"]) / prep["std"].replace(0, 1)
    return Xp

In [23]:
# ---------- Bootstrap utils ----------
def block_bootstrap_rows(index, L, rng):
    """
    Moving-block bootstrap sur index (positions).
    Renvoie un array d'indices (longueur = n).
    """
    n = len(index)
    if n < 3:
        return np.arange(n)  # fallback
    L = max(2, min(int(L), n-1))
    nb = int(np.ceil(n / L))
    starts = rng.integers(0, n - L + 1, size=nb)
    ix = np.concatenate([np.arange(s, s+L) for s in starts])[:n]
    return ix

def bagged_predict_linreg(X_tr_raw, y_tr, x_fore_raw, prep, B, L, rng):
    """
    Bagging (moving-block bootstrap) pour LinearRegression :
      - pr√©proc fix√© sur TRAIN original (pas r√©-appris)
      - r√©√©chantillon par blocs (lignes) (X, y)
      - fit et pr√©diction h
      - renvoie (moyenne, distribution compl√®te, base_pred)
    """
    # Base fit (r√©f√©rence)
    X_tr_p = apply_preproc(X_tr_raw, prep)
    base = LinearRegression()
    base.fit(X_tr_p, y_tr.values)
    yhat_base = float(base.predict(apply_preproc(x_fore_raw, prep))[0])

    preds = []
    for _ in range(B):
        ix = block_bootstrap_rows(X_tr_raw.index, L, rng)
        Xb = X_tr_raw.iloc[ix]
        yb = y_tr.iloc[ix]
        Xb_p = apply_preproc(Xb, prep)  # IMPORTANT: m√™me prep
        m = LinearRegression()
        m.fit(Xb_p, yb.values)
        preds.append(float(m.predict(apply_preproc(x_fore_raw, prep))[0]))
    return float(np.mean(preds)), np.array(preds), yhat_base

In [24]:
# ---------- Boucle pseudo-OOS ----------
rows = []                 # (date, y_pred, y_true, y_pred_base, p05, p95)
models = []               # stockage dernier fit (optionnel)
preprocs = []             # stockage prep (optionnel)
train_ends = []           # dates de fin train (pour trace)

last_t_end = y_all.index.max() - relativedelta(months=h)
last_model = None
last_fit_end = None

for t_end in y_all.index:
    if t_end > last_t_end:
        break

    y_tr = y_all.loc[:t_end]
    X_tr = X_all.loc[:t_end]
    if len(y_tr) < min_train_n:
        continue

    # Pr√©proc appris sur TRAIN courant
    X_tr_p, prep = fit_preproc(X_tr, wins=winsor_level, do_norm=norm_var)

    # Horizon cibl√©
    t_fore = t_end + relativedelta(months=h)
    if t_fore in y_all.index:
        x_fore_raw = X_all.loc[[t_fore]]

        if use_bagging:
            # (Option) reseed par mois : rng = np.random.default_rng(int(t_end.strftime("%Y%m")))
            yhat_h, dist, yhat_base = bagged_predict_linreg(
                X_tr_raw=X_tr, y_tr=y_tr, x_fore_raw=x_fore_raw,
                prep=prep, B=B_boot, L=L_block, rng=rng
            )
            y_p05 = float(np.percentile(dist, 5))
            y_p95 = float(np.percentile(dist, 95))
        else:
            model = LinearRegression()
            model.fit(X_tr_p, y_tr.values)
            yhat_h = float(model.predict(apply_preproc(x_fore_raw, prep))[0])
            yhat_base = yhat_h
            y_p05, y_p95 = (np.nan, np.nan)

        rows.append((t_fore, yhat_h, float(y_all.loc[t_fore]), yhat_base, y_p05, y_p95))

    # trace / dernier mod√®le base (utile pour sauvegarde)
    last_model = LinearRegression().fit(X_tr_p, y_tr.values)
    last_fit_end = t_end
    models.append(last_model)
    preprocs.append(prep)
    train_ends.append(t_end)

In [25]:
# ---------- DataFrame OOS ----------
if rows:
    df_oos = (
        pd.DataFrame(rows, columns=["date", "y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"])
          .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
          .set_index("date").sort_index()
    )
else:
    df_oos = pd.DataFrame(columns=["y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"])
    df_oos.index = pd.to_datetime(pd.Index([]))

print(f"\n‚úÖ Pseudo-OOS termin√© ‚Äî n pr√©visions = {len(df_oos)}")
print(df_oos.head(3))

# ---------- Scores ----------
def _scores(df):
    if len(df) == 0:
        return {"MAE": np.nan, "RMSE": np.nan, "R2": np.nan}
    mae  = mean_absolute_error(df["y_true"], df["y_pred"])
    rmse = np.sqrt(mean_squared_error(df["y_true"], df["y_pred"]))
    r2   = r2_score(df["y_true"], df["y_pred"]) if len(df) > 1 else np.nan
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}

df_val  = df_oos.loc[eval_start:eval_end].copy()
df_test = df_oos.loc[test_start:test_end].copy()

sc_val  = _scores(df_val)
sc_test = _scores(df_test)

print(f"\nüìä Validation 83‚Äì89 ‚Äî n={len(df_val)} | MAE={sc_val['MAE']:.3f} | RMSE={sc_val['RMSE']:.3f} | R¬≤={sc_val['R2']:.3f}")
print(f"üìä Test 90‚Äì2025 ‚Äî n={len(df_test)} | MAE={sc_test['MAE']:.3f} | RMSE={sc_test['RMSE']:.3f} | R¬≤={sc_test['R2']:.3f}")

# (option) Comparaison bagging vs base
if "y_pred_base" in df_oos and df_oos["y_pred_base"].notna().any():
    mae_bag  = mean_absolute_error(df_oos["y_true"], df_oos["y_pred"])
    mae_base = mean_absolute_error(df_oos["y_true"], df_oos["y_pred_base"])
    print(f"‚û°Ô∏è  Gain bagging (ŒîMAE) = {mae_base - mae_bag:.3f}")


‚úÖ Pseudo-OOS termin√© ‚Äî n pr√©visions = 741
              y_pred  y_true  y_pred_base  y_pred_p05  y_pred_p95
date                                                             
1963-12-01 -0.729972     0.0    -0.354113   -1.487587   -0.177087
1964-01-01 -0.234660    -0.1    -0.282896   -1.125836    0.670996
1964-02-01  1.067340    -0.5     1.105841   -0.124761    2.173422

üìä Validation 83‚Äì89 ‚Äî n=84 | MAE=0.815 | RMSE=1.024 | R¬≤=-0.342
üìä Test 90‚Äì2025 ‚Äî n=428 | MAE=0.832 | RMSE=1.480 | R¬≤=0.059
‚û°Ô∏è  Gain bagging (ŒîMAE) = -0.012


In [26]:
# ---------- Sauvegardes ----------
bundle = {
    "oos_predictions": df_oos.reset_index(),     # (date, y_pred, y_true, y_pred_base, y_pred_p05, y_pred_p95)
    "params": {
        "model": "LinearRegression",
        "horizon": h,
        "min_train_n": min_train_n,
        "winsor_level": winsor_level,
        "norm_var": norm_var,
        "features": features,
        "eval_window": (str(eval_start.date()), str(eval_end.date())),
        "test_window": (str(test_start.date()), str(test_end.date())),
        # ---- bagging ----
        "use_bagging": bool(use_bagging),
        "B_boot": int(B_boot),
        "L_block": int(L_block),
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "index_freq": "MS",
        "n_obs_all": int(len(df_all)),
        "n_forecasts": int(len(df_oos)),
    },
    "train_fit_dates": pd.to_datetime(pd.Index(train_ends)),

    # üîªüîªüîª AJOUT ESSENTIEL POUR LA PERMUTATION üîªüîªüîª
    "models":   models,     # liste des mod√®les LinearRegression (un par fen√™tre)
    "preprocs": preprocs,   # liste des pr√©proc (dict) align√©s aux mod√®les
    # üî∫üî∫üî∫
}

# --- Sauvegarde du bundle complet ---
with open(LINREG_PKL, "wb") as f:
    pickle.dump(bundle, f)

# --- Sauvegarde du r√©sum√© m√©ta s√©par√© (lisible rapidement) ---
pd.DataFrame([{
    "model": "LinearRegression",
    "horizon": h,
    "min_train_n": min_train_n,
    "winsor_level": winsor_level,
    "norm_var": norm_var,
    "use_bagging": bool(use_bagging),
    "B_boot": int(B_boot),
    "L_block": int(L_block),
    "trained_until": bundle["meta"]["trained_until"],
    "n_forecasts": bundle["meta"]["n_forecasts"],
}]).to_csv(LINREG_META, index=False)

print(f"\nüíæ Bundle sauvegard√© ‚Üí {LINREG_PKL}")
print(f"üíæ M√©ta sauvegard√©e ‚Üí {LINREG_META}")
print(f"üì¶ Contenu du bundle : {list(bundle.keys())}")


üíæ Bundle sauvegard√© ‚Üí linear_regression.pkl
üíæ M√©ta sauvegard√©e ‚Üí linear_regression_meta.csv
üì¶ Contenu du bundle : ['oos_predictions', 'params', 'meta', 'train_fit_dates', 'models', 'preprocs']


# R√©gression lin√©aire lags de UNRATE

In [27]:
# ==========================================================
# üîπ Linear Regression + Bagging (pseudo-OOS, h=12)
#    ‚Üí ajoute UNRATE_lag12 comme variable explicative
#    ‚Üí gestion robuste des NaN (fit/pred)
#    ‚Üí libell√©s & artefacts "(with UNRATE_lags)"
# ==========================================================
import numpy as np
import pandas as pd
import pickle
from dateutil.relativedelta import relativedelta

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------- Param√®tres g√©n√©raux ----------
h = 12
min_train_n = 36           # ‚â• 3 ans avant de commencer √† pr√©voir
winsor_level = 0.01        # winsorisation (1er/99e percentiles)
norm_var = True            # normaliser ou non
target_col = "UNRATE"      # cible dans df_stationary

# Fen√™tres d'√©valuation / test
eval_start = pd.Timestamp("1983-01-01")
eval_end   = pd.Timestamp("1989-12-31")
test_start = pd.Timestamp("1990-01-01")
test_end   = pd.Timestamp("2025-12-31")   # ajuste si besoin

# ---------- Bagging (bootstrap en blocs) ----------
use_bagging = True
B_boot = 30               # comme les auteurs
L_block = 12              # blocs annuels (12 mois)
rng = np.random.default_rng(123)  # seed bootstrap

# ---------- Fichiers de sortie ----------
# (anciens fichiers SANS lags ‚Äî conserv√©s mais non utilis√©s ici)
LINREG_PKL  = "linear_regression.pkl"
LINREG_META = "linear_regression_meta.csv"
# (nouveaux fichiers AVEC lags ‚Äî utilis√©s pour cette variante)
LINREG_LAGS_PKL  = "linear_regression__with_UNRATE_lags_h12.pkl"
LINREG_LAGS_META = "linear_regression_meta__with_UNRATE_lags_h12.csv"

# ---------- Pr√©paration df_stationary ----------
def _ensure_ms_index(df: pd.DataFrame) -> pd.DataFrame:
    """Force un index DatetimeIndex en d√©but de mois (MS)."""
    if "date" in df.columns:
        df = df.set_index("date")
    idx = pd.to_datetime(df.index)
    df = df.copy()
    df.index = idx.to_period("M").to_timestamp(how="start")
    return df.asfreq("MS")

# ‚ö†Ô∏è On part de df_stationary (1960‚Üí2025) d√©j√† charg√© en m√©moire
df_all = _ensure_ms_index(df_stationary).sort_index()

if target_col not in df_all.columns:
    raise ValueError(f"La colonne cible '{target_col}' est absente de df_stationary.")

y_all = df_all[target_col].astype(float)
X_all = df_all.drop(columns=[target_col]).astype(float)

# --- Ajout du lag h (=12) de la cible comme variable explicative ---
lag_feat_name = f"{target_col}_lag{h}"  # 'UNRATE_lag12'
X_all[lag_feat_name] = y_all.shift(h)

features = list(X_all.columns)

print(f"‚úÖ Donn√©es pr√™tes : {df_all.index.min().date()} ‚Üí {df_all.index.max().date()} | n={len(df_all)} | freq=MS")
print(f"Ajout feature: {lag_feat_name} (y_(t-h)) ‚Üí OK")
print(f"Features ({len(features)}): {features[:6]}{' ...' if len(features)>6 else ''}")

# ---------- Pr√©proc ----------
def fit_preproc(X: pd.DataFrame, wins=0.01, do_norm=True):
    """Apprend winsor + normalisation sur TRAIN et renvoie (X_trans, prep)."""
    lower = X.quantile(wins)
    upper = X.quantile(1 - wins)
    Xw = X.clip(lower=lower, upper=upper, axis=1)
    if do_norm:
        mean = Xw.mean()
        std  = Xw.std().replace(0, 1)
        Xn   = (Xw - mean) / std
        prep = {"lower": lower, "upper": upper, "mean": mean, "std": std, "norm": True}
        return Xn, prep
    else:
        prep = {"lower": lower, "upper": upper, "mean": None, "std": None, "norm": False}
        return Xw, prep

def apply_preproc(X: pd.DataFrame, prep: dict):
    """Applique le pr√©proc appris (pas de fuite)."""
    Xp = X.clip(lower=prep["lower"], upper=prep["upper"], axis=1)
    if prep["norm"]:
        Xp = (Xp - prep["mean"]) / prep["std"].replace(0, 1)
    return Xp

# ---------- Bootstrap utils ----------
def block_bootstrap_rows(index, L, rng):
    """
    Moving-block bootstrap sur index (positions).
    Renvoie un array d'indices (longueur = n).
    """
    n = len(index)
    if n < 3:
        return np.arange(n)  # fallback
    L = max(2, min(int(L), n-1))
    nb = int(np.ceil(n / L))
    starts = rng.integers(0, n - L + 1, size=nb)
    ix = np.concatenate([np.arange(s, s+L) for s in starts])[:n]
    return ix

def bagged_predict_linreg(X_tr_raw: pd.DataFrame, y_tr: pd.Series, x_fore_raw: pd.DataFrame,
                          prep: dict, B: int, L: int, rng):
    """
    Bagging (moving-block bootstrap) pour LinearRegression :
      - pr√©proc fix√© sur TRAIN original (pas r√©-appris)
      - drop NaN sur TRAIN apr√®s pr√©proc (LinearRegression n'accepte pas NaN)
      - bootstrap sur l'index propre
      - imputation 0 sur x_fore_p (apr√®s normalisation : 0 = moyenne)
      - renvoie (moyenne, distribution compl√®te, base_pred)
    """
    # Pr√©proc du TRAIN + nettoyage
    X_tr_p = apply_preproc(X_tr_raw, prep)
    mask_clean = X_tr_p.notna().all(axis=1) & y_tr.notna()
    Xc = X_tr_p.loc[mask_clean]
    yc = y_tr.loc[mask_clean]

    if len(Xc) < 5:
        raise ValueError("Trop peu d'observations propres pour bagging LinearRegression.")

    # Pr√©proc de x_fore + imputation 0
    x_fore_p = apply_preproc(x_fore_raw, prep).fillna(0.0)

    # Base fit (r√©f√©rence)
    base = LinearRegression()
    base.fit(Xc.values, yc.values)
    yhat_base = float(base.predict(x_fore_p.values)[0])

    # Bootstrap sur l'index propre
    def _block_bootstrap_rows_from_clean(n_clean, L, rng):
        if n_clean < 3:
            return np.arange(n_clean)
        L_eff = max(2, min(int(L), n_clean - 1))
        nb = int(np.ceil(n_clean / L_eff))
        starts = rng.integers(0, n_clean - L_eff + 1, size=nb)
        ix_pos = np.concatenate([np.arange(s, s + L_eff) for s in starts])[:n_clean]
        return ix_pos

    preds = []
    n_clean = len(Xc)
    for b in range(B):
        ix_pos = _block_bootstrap_rows_from_clean(n_clean, L, rng)
        Xb = Xc.iloc[ix_pos]
        yb = yc.iloc[ix_pos]
        m = LinearRegression()
        m.fit(Xb.values, yb.values)
        preds.append(float(m.predict(x_fore_p.values)[0]))

    return float(np.mean(preds)), np.array(preds), yhat_base

# ---------- Boucle pseudo-OOS ----------
rows = []                 # (date, y_pred, y_true, y_pred_base, p05, p95)
models = []               # stockage des mod√®les (un par fen√™tre)
preprocs = []             # stockage prep (align√©s aux mod√®les)
train_ends = []           # dates de fin train (pour trace)

last_t_end = y_all.index.max() - relativedelta(months=h)
last_model = None
last_fit_end = None

for t_end in y_all.index:
    if t_end > last_t_end:
        break

    y_tr = y_all.loc[:t_end]
    X_tr = X_all.loc[:t_end]
    if len(y_tr) < min_train_n:
        continue

    # Pr√©proc appris sur TRAIN courant
    X_tr_p, prep = fit_preproc(X_tr, wins=winsor_level, do_norm=norm_var)

    # üîß Nettoyage TRAIN (LinearRegression n'accepte pas NaN)
    mask_clean = X_tr_p.notna().all(axis=1) & y_tr.notna()
    X_tr_p_clean = X_tr_p.loc[mask_clean]
    y_tr_clean   = y_tr.loc[mask_clean]
    if len(X_tr_p_clean) < 10:
        continue

    # Horizon cibl√©
    t_fore = t_end + relativedelta(months=h)
    if t_fore in y_all.index:
        x_fore_raw = X_all.loc[[t_fore]]
        # üîß Pas de NaN √† la pr√©diction (apr√®s normalisation, 0 = moyenne)
        x_fore_p = apply_preproc(x_fore_raw, prep).fillna(0.0)

        if use_bagging:
            # (Option) reseed par mois : rng = np.random.default_rng(int(t_end.strftime("%Y%m")))
            yhat_h, dist, yhat_base = bagged_predict_linreg(
                X_tr_raw=X_tr, y_tr=y_tr, x_fore_raw=x_fore_raw,
                prep=prep, B=B_boot, L=L_block, rng=rng
            )
            y_p05 = float(np.percentile(dist, 5))
            y_p95 = float(np.percentile(dist, 95))
        else:
            model = LinearRegression()
            model.fit(X_tr_p_clean.values, y_tr_clean.values)
            yhat_h = float(model.predict(x_fore_p.values)[0])
            yhat_base = yhat_h
            y_p05, y_p95 = (np.nan, np.nan)

        rows.append((t_fore, yhat_h, float(y_all.loc[t_fore]), yhat_base, y_p05, y_p95))

    # trace / dernier mod√®le base (utile pour sauvegarde) ‚Äî entra√Æn√© sur TRAIN propre
    last_model = LinearRegression().fit(X_tr_p_clean.values, y_tr_clean.values)
    last_fit_end = t_end
    models.append(last_model)
    preprocs.append(prep)
    train_ends.append(t_end)

# ---------- DataFrame OOS ----------
if rows:
    df_oos = (
        pd.DataFrame(rows, columns=["date", "y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"])
          .assign(date=lambda d: pd.to_datetime(d["date"]).dt.to_period("M").dt.to_timestamp(how="start"))
          .set_index("date").sort_index()
    )
else:
    df_oos = pd.DataFrame(columns=["y_pred", "y_true", "y_pred_base", "y_pred_p05", "y_pred_p95"])
    df_oos.index = pd.to_datetime(pd.Index([]))

print(f"\n‚úÖ Pseudo-OOS (LinearRegression, with UNRATE_lags) termin√© ‚Äî n pr√©visions = {len(df_oos)}")
print(df_oos.head(3))

# ---------- Scores ----------
def _scores(df: pd.DataFrame):
    if len(df) == 0:
        return {"MAE": np.nan, "RMSE": np.nan, "R2": np.nan}
    mae  = mean_absolute_error(df["y_true"], df["y_pred"])
    rmse = np.sqrt(mean_squared_error(df["y_true"], df["y_pred"]))
    r2   = r2_score(df["y_true"], df["y_pred"]) if len(df) > 1 else np.nan
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}

df_val  = df_oos.loc[eval_start:eval_end].copy()
df_test = df_oos.loc[test_start:test_end].copy()

sc_val  = _scores(df_val)
sc_test = _scores(df_test)

print(f"\nüìä Validation 83‚Äì89 ‚Äî n={len(df_val)} | MAE={sc_val['MAE']:.3f} | RMSE={sc_val['RMSE']:.3f} | R¬≤={sc_val['R2']:.3f}")
print(f"üìä Test 90‚Äì{test_end.year} ‚Äî n={len(df_test)} | MAE={sc_test['MAE']:.3f} | RMSE={sc_test['RMSE']:.3f} | R¬≤={sc_test['R2']:.3f}")

# (option) Comparaison bagging vs base
if "y_pred_base" in df_oos and df_oos["y_pred_base"].notna().any():
    mae_bag  = mean_absolute_error(df_oos["y_true"], df_oos["y_pred"])
    mae_base = mean_absolute_error(df_oos["y_true"], df_oos["y_pred_base"])
    print(f"‚û°Ô∏è  Gain bagging (ŒîMAE) = {mae_base - mae_bag:.3f}")

# ---------- Sauvegardes (VARIANTE AVEC LAGS) ----------
bundle = {
    "oos_predictions": df_oos.reset_index(),     # (date, y_pred, y_true, y_pred_base, y_pred_p05, y_pred_p95)
    "params": {
        "model": "LinearRegression (with UNRATE_lags)",
        "horizon": h,
        "min_train_n": min_train_n,
        "winsor_level": winsor_level,
        "norm_var": norm_var,
        "features": features,
        "eval_window": (str(eval_start.date()), str(eval_end.date())),
        "test_window": (str(test_start.date()), str(test_end.date())),
        # ---- bagging ----
        "use_bagging": bool(use_bagging),
        "B_boot": int(B_boot),
        "L_block": int(L_block),
    },
    "meta": {
        "trained_until": str(last_fit_end.date()) if last_fit_end is not None else None,
        "index_freq": "MS",
        "n_obs_all": int(len(df_all)),
        "n_forecasts": int(len(df_oos)),
    },
    "train_fit_dates": pd.to_datetime(pd.Index(train_ends)),

    # ‚úÖ Pour permutation_importance_pseudo_oos & SHAP
    "models":   models,     # liste des mod√®les LinearRegression (un par fen√™tre)
    "preprocs": preprocs,   # liste des pr√©proc (dict) align√©s aux mod√®les
}

# --- Sauvegarde du bundle complet ---
with open(LINREG_LAGS_PKL, "wb") as f:
    pickle.dump(bundle, f)

# --- Sauvegarde du r√©sum√© m√©ta s√©par√© (lisible rapidement) ---
pd.DataFrame([{
    "model": "LinearRegression (with UNRATE_lags)",
    "horizon": h,
    "min_train_n": min_train_n,
    "winsor_level": winsor_level,
    "norm_var": norm_var,
    "use_bagging": bool(use_bagging),
    "B_boot": int(B_boot),
    "L_block": int(L_block),
    "trained_until": bundle["meta"]["trained_until"],
    "n_forecasts": bundle["meta"]["n_forecasts"],
}]).to_csv(LINREG_LAGS_META, index=False)

print(f"\nüíæ Bundle sauvegard√© ‚Üí {LINREG_LAGS_PKL}")
print(f"üíæ M√©ta sauvegard√©e ‚Üí {LINREG_LAGS_META}")
print(f"üì¶ Contenu du bundle : {list(bundle.keys())}")

‚úÖ Donn√©es pr√™tes : 1960-01-01 ‚Üí 2025-08-01 | n=788 | freq=MS
Ajout feature: UNRATE_lag12 (y_(t-h)) ‚Üí OK
Features (11): ['TB3MS', 'RPI', 'INDPRO', 'DPCERA3M086SBEA', 'S&P500', 'BUSLOANS'] ...

‚úÖ Pseudo-OOS (LinearRegression, with UNRATE_lags) termin√© ‚Äî n pr√©visions = 741
              y_pred  y_true  y_pred_base  y_pred_p05  y_pred_p95
date                                                             
1963-12-01  1.522262     0.0    -0.287805   -0.106405    2.786944
1964-01-01  1.112628    -0.1    -0.999985   -0.566013    2.626656
1964-02-01 -0.865312    -0.5    -0.151814   -2.011440   -0.067756

üìä Validation 83‚Äì89 ‚Äî n=84 | MAE=0.798 | RMSE=1.013 | R¬≤=-0.315
üìä Test 90‚Äì2025 ‚Äî n=428 | MAE=0.796 | RMSE=1.507 | R¬≤=0.024
‚û°Ô∏è  Gain bagging (ŒîMAE) = -0.005

üíæ Bundle sauvegard√© ‚Üí linear_regression__with_UNRATE_lags_h12.pkl
üíæ M√©ta sauvegard√©e ‚Üí linear_regression_meta__with_UNRATE_lags_h12.csv
üì¶ Contenu du bundle : ['oos_predictions', 'params', 'me