In [3]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# ----------------------------------------------------
# 1. Projekt-Root & Imports aus deinem Projekt
# ----------------------------------------------------
def _locate_repo_root(start: Path) -> Path:
    """Return the closest directory at or above *start* that contains ``src``.

    Only the resolved *start* and its four nearest ancestors are inspected.
    If none of them holds an entry named ``src``, the resolved *start* is
    returned as a fallback.
    """
    origin = start.resolve()
    # Starting directory first, then at most four ancestors (nearest first).
    candidates = (origin, *origin.parents)[:5]
    return next(
        (folder for folder in candidates if (folder / "src").exists()),
        origin,
    )

# Resolve the project root starting from the current working directory.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = _locate_repo_root(NOTEBOOK_DIR)

# Publish the root via the environment and make it importable
# (prepended to sys.path only if it is not already listed).
_project_root_str = str(PROJECT_ROOT)
os.environ["PROJECT_ROOT"] = _project_root_str
if _project_root_str not in sys.path:
    sys.path.insert(0, _project_root_str)

print("PROJECT_ROOT =", PROJECT_ROOT)

from src.io_timesplits import load_target, load_ifo_features
from src.evaluation import rmse
from src.config import GlobalConfig

# ----------------------------------------------------
# 2. Load data & basic quantities
# ----------------------------------------------------
print("Lade Daten...")
y = load_target()            # target series (ΔIP per the original note; DatetimeIndex)
X_ifo = load_ifo_features()  # ifo features, not yet lagged

# Restrict both objects to the dates they have in common.
idx = y.index.intersection(X_ifo.index)
y, X_ifo = y.loc[idx], X_ifo.loc[idx]

print(f"Daten geladen. y shape: {y.shape}, X_ifo shape: {X_ifo.shape}")
_first_obs, _last_obs = y.index.min(), y.index.max()
print(f"Zeitspanne: {_first_obs.date()} -> {_last_obs.date()}")

# h=1 target: the row at t carries y_{t+1}.
y_true_full = y.copy()       # realised values at t (used for evaluation)
y_target = y.shift(-1)       # training label for the models

# ----------------------------------------------------
# 3. Build the lag matrix (lags 0..11)
# ----------------------------------------------------
print("\nErstelle lagged Feature-Matrix (Lags 0–11)...")

LAGS_TO_TEST = range(0, 12)  # lags 0..11
n_features = len(X_ifo.columns)
lagged_list = []

# One single-column frame per (feature, lag) pair, named "<feature>__lag<k>".
# Columns stay grouped by feature, with lags ascending inside each group.
for position, column in enumerate(X_ifo.columns, start=1):
    if position % 200 == 0:
        print(f"  ... erstelle Lags für Feature {position}/{n_features}")
    series = X_ifo[column]
    lagged_list.extend(
        series.shift(k).to_frame(f"{column}__lag{k}") for k in LAGS_TO_TEST
    )

X_all_lags = pd.concat(lagged_list, axis=1)

# Final index: only the dates for which the shifted target is defined.
idx_final = X_all_lags.index.intersection(y_target.dropna().index)
X_all_lags = X_all_lags.loc[idx_final]
y_model_target, y_true_full = y_target.loc[idx_final], y_true_full.loc[idx_final]

print(f"Vollständige lag-Matrix Shape: {X_all_lags.shape}")
_model_first, _model_last = y_model_target.index.min(), y_model_target.index.max()
print(f"Finale Zeitspanne für Modellierung: {_model_first.date()} -> {_model_last.date()}")

# ----------------------------------------------------
# 4. Rolling-origin setup (h=1)
# ----------------------------------------------------
cfg = GlobalConfig(preset="thesis")
SEED = cfg.seed
np.random.seed(SEED)

# Start of the out-of-sample window, taken from the config (W0_B).
# W0_B is used here as an integer position into y; it is mapped onto
# y_model_target through its date because the two series need not have the
# same length.
# NOTE(review): y was already restricted to the dates shared with X_ifo
# before this point — confirm W0_B is meant relative to that aligned series.
raw_start_date = y.index[cfg.W0_B]

# A backfill lookup returns the exact position when the date is present and
# otherwise the position of the next later date. It returns -1 when no such
# date exists (e.g. W0_B points at the last observation, which has no t+1
# target); that case is rejected explicitly instead of silently starting at
# position -1.
_start_pos = y_model_target.index.get_indexer([raw_start_date], method="bfill")[0]
if _start_pos < 0:
    raise ValueError(
        f"No modelling date at or after {raw_start_date} (W0_B={cfg.W0_B}); "
        "cannot place the start of the out-of-sample window."
    )
TRAIN_START_T_idx = int(_start_pos)

T_total = len(y_model_target)

print(f"\nTRAIN_START_T_idx = {TRAIN_START_T_idx} (entspricht Datum {y_model_target.index[TRAIN_START_T_idx].date()})")
print(f"Anzahl OOS-Schritte (ungefähr): {T_total - TRAIN_START_T_idx - 1}")

# Ridge hyper-parameters shared by every fit in the rolling forecast.
fixed_ridge_params = {
    "alpha": 1.0,
    "solver": "auto",
    "random_state": SEED,
}

def run_rolling_forecast(feature_names):
    """
    Run an expanding-window one-step-ahead (h=1) forecast and return its RMSE.

    For every origin ``t`` from ``TRAIN_START_T_idx`` to ``T_total - 2`` a
    fresh ``StandardScaler`` + ``Ridge`` pipeline is fitted on rows
    ``0..t-1`` of the selected columns of ``X_all_lags`` against
    ``y_model_target``. The row at ``t`` is then used to predict, and the
    prediction is compared with ``y_true_full`` at position ``t + 1``.
    Missing feature values are replaced by 0.0 before scaling.

    Parameters
    ----------
    feature_names : iterable of str, str, or None
        Column names of ``X_all_lags`` to use as regressors. A single string
        is treated as one column. An empty collection (or ``None``) selects
        the baseline that always predicts 0.

    Returns
    -------
    The value of ``rmse(truth, predictions)`` over the out-of-sample span.
    """
    # Normalise the argument so that truthiness and column selection behave
    # the same for lists, tuples, pandas Index objects and arrays.
    if feature_names is None:
        feature_names = []
    elif isinstance(feature_names, str):
        feature_names = [feature_names]
    else:
        feature_names = list(feature_names)

    # Baseline without features: constant zero forecast.
    if not feature_names:
        true_vals = y_true_full.iloc[TRAIN_START_T_idx + 1 : T_total].values
        preds = np.zeros_like(true_vals)
        return rmse(true_vals, preds)

    # The NaN replacement is element-wise and independent of the forecast
    # origin, so it is done once here instead of on a growing slice in every
    # iteration. Plain arrays also avoid per-step DataFrame slicing.
    X_filled = X_all_lags[feature_names].fillna(0.0).to_numpy()
    y_labels = y_model_target.to_numpy()
    y_realised = y_true_full.to_numpy()

    all_preds = []
    all_true = []

    for t in range(TRAIN_START_T_idx, T_total - 1):
        # A new pipeline per origin, fitted on rows 0..t-1 only.
        model = make_pipeline(
            StandardScaler(),
            Ridge(**fixed_ridge_params)
        )
        model.fit(X_filled[:t], y_labels[:t])

        # Forecast from the features observed at t (kept 2-D for predict).
        y_hat = model.predict(X_filled[t : t + 1])[0]

        all_preds.append(y_hat)
        # Realised value the forecast is scored against: position t + 1.
        all_true.append(y_realised[t + 1])

    return rmse(all_true, all_preds)

print("\nRolling-Forecast-Funktion 'run_rolling_forecast' ist definiert.")

# ----------------------------------------------------
# 5. Feature ranking by forward rolling RMSE (one feature per run)
# ----------------------------------------------------
print("\nStarte Feature-Ranking mit zeitlicher Forward-Validierung (ein Feature pro Lauf)...")

ALL_FEATURES = list(X_all_lags.columns)

# Optional cap for quick experiments (e.g. 500 ranks only the first 500
# columns); None ranks every column.
MAX_FEATURES_TO_RANK = None
features_to_rank = (
    ALL_FEATURES
    if MAX_FEATURES_TO_RANK is None
    else ALL_FEATURES[:MAX_FEATURES_TO_RANK]
)

n_to_rank = len(features_to_rank)
print(f"Anzahl Features, die gerankt werden: {n_to_rank}")

# Score each candidate on its own: one complete rolling forecast per column.
feature_scores = []
for done, candidate in enumerate(features_to_rank, start=1):
    candidate_rmse = run_rolling_forecast([candidate])
    feature_scores.append((candidate_rmse, candidate))

    # Progress line after every 50th candidate and after the last one.
    if done == n_to_rank or done % 50 == 0:
        print(f"  {done}/{n_to_rank} Features gerankt; letztes: {candidate}, RMSE = {candidate_rmse:.4f}")

# Ascending by RMSE (smaller is better); ties keep their insertion order.
feature_scores = sorted(feature_scores, key=lambda pair: pair[0])

print("\n=== Ranking abgeschlossen ===")
print("Beste 10 Features nach Forward-RMSE (h=1, Ridge, expandierendes Fenster):")
top_ten = feature_scores[:10]
for rank, (score, name) in enumerate(top_ten, start=1):
    print(f"  {rank}. {name}  -> RMSE = {score:.6f}")

# The complete ranking remains available in 'feature_scores'
# (list of tuples: (rmse, feature_name)).



PROJECT_ROOT = /Users/jonasschernich/Documents/Masterarbeit/Code
Lade Daten...
INFO in load_ifo_features: Renaming columns to ensure validity.
Daten geladen. y shape: (407,), X_ifo shape: (407, 2160)
Zeitspanne: 1991-02-01 -> 2024-12-01

Erstelle lagged Feature-Matrix (Lags 0–11)...
  ... erstelle Lags für Feature 200/2160
  ... erstelle Lags für Feature 400/2160
  ... erstelle Lags für Feature 600/2160
  ... erstelle Lags für Feature 800/2160
  ... erstelle Lags für Feature 1000/2160
  ... erstelle Lags für Feature 1200/2160
  ... erstelle Lags für Feature 1400/2160
  ... erstelle Lags für Feature 1600/2160
  ... erstelle Lags für Feature 1800/2160
  ... erstelle Lags für Feature 2000/2160
Vollständige lag-Matrix Shape: (406, 25920)
Finale Zeitspanne für Modellierung: 1991-02-01 -> 2024-11-01

TRAIN_START_T_idx = 240 (entspricht Datum 2011-02-01)
Anzahl OOS-Schritte (ungefähr): 165

Rolling-Forecast-Funktion 'run_rolling_forecast' ist definiert.

Starte Feature-Ranking mit zeitlicher 