In [1]:
# ==============================================================================
# Static Factor Model (SFM) – Thesis Tuning Pipeline
# ==============================================================================
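# Specification as per the master's thesis:
# - Setup I & II: full FE grid (but DR=None, since SFM runs PCA internally).
# - Model: factors r={2,5,10,20} (pinned to the Bai–Ng estimate r* when
#   USE_BAI_NG_SELECTION is True), link={OLS, Ridge}, lambda={1e-4, 1e-2, 1, 100}.
# - Stage A shortlist: top 10 frozen.
# - Dynamic FI (Setup III): strict top 250 features.
#   NOTE(review): build_grid_dynamic_fi currently uses n_features_list = [50];
#   reconcile the code with this specification (or vice versa).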
# ==============================================================================

import os, sys
from pathlib import Path
import numpy as np
import pandas as pd
from itertools import product

# --- 1) Pfad-Setup ---
def _locate_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that contains ``src``.

    The resolved start directory and up to five of its ancestors are
    inspected, nearest first. When none of them holds a ``src`` entry, the
    resolved *start* itself is returned as a fallback.
    """
    origin = start.resolve()
    # Nearest-first chain: the directory itself followed by its parents,
    # capped at six candidates in total.
    for candidate in (origin, *origin.parents)[:6]:
        if (candidate / "src").exists():
            return candidate
    return origin

# Anchor the project at the repository root discovered from the current
# working directory, publish it through the PROJECT_ROOT environment
# variable, and put it at the front of sys.path if it is not listed yet.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = _locate_repo_root(NOTEBOOK_DIR)
os.environ["PROJECT_ROOT"] = os.fspath(PROJECT_ROOT)
if os.fspath(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, os.fspath(PROJECT_ROOT))

# --- 2) Imports ---
from src.config import (
    GlobalConfig,
    DEFAULT_CORR_SPEC,
    EWMA_CORR_SPEC,
    outputs_for_model,
)
from src.tuning import run_stageA, run_stageB
from src.io_timesplits import (
    load_target,
    load_ifo_features,
    load_full_lagged_features,
    load_rolling_importance,
)
# WICHTIG: Modell-Import für SFM
from src.models.SFM import ForecastModel

# --- 3) Configuration ---
USE_DYNAMIC_FI_PIPELINE = False  # False = standard setup (I & II); True = dynamic FI (Setup III)
USE_BAI_NG_SELECTION = True      # True = determine r via the Bai–Ng information criterion
SEED = 42

# The two pipelines run under distinct model names.
MODEL_NAME = "sfm_dynamic_fi" if USE_DYNAMIC_FI_PIPELINE else "sfm"

# NOTE(review): presumably prepares/registers the output locations for this
# model name — confirm against src.config.outputs_for_model.
outputs_for_model(MODEL_NAME)
print(f"--- Starte Tuning für: {MODEL_NAME} ---")

# --- 4) Load data ---
y = load_target()
X_ifo = load_ifo_features()

# Keep only the index labels present in both the target and the ifo features.
idx_common = y.index.intersection(X_ifo.index)
y, X_ifo = y.loc[idx_common], X_ifo.loc[idx_common]

# Dynamic-FI inputs; they stay None unless the dynamic pipeline is enabled.
X_full_lagged = rolling_imp = y_fi = None

if not USE_DYNAMIC_FI_PIPELINE:
    print(f"Full FE Modus (Setup I/II): {X_ifo.shape[1]} Basis-Features.")
else:
    FI_BASE_DIR = PROJECT_ROOT / "outputs" / "feature_importance" / "outputs_no_missing"
    try:
        X_full_lagged = load_full_lagged_features(base_dir=FI_BASE_DIR)
        rolling_imp = load_rolling_importance(base_dir=FI_BASE_DIR)

        # Three-way alignment: target, lagged features and rolling importances
        # are all restricted to the index labels they have in common.
        idx_fi = y.index.intersection(X_full_lagged.index)
        idx_fi = idx_fi.intersection(rolling_imp.index)
        y_fi, X_full_lagged, rolling_imp = (
            y.loc[idx_fi],
            X_full_lagged.loc[idx_fi],
            rolling_imp.loc[idx_fi],
        )
        print(f"Dynamic FI Modus: {X_full_lagged.shape[1]} Features geladen.")
    except FileNotFoundError:
        # Without the feature-importance artefacts the dynamic pipeline
        # cannot run, so the script stops with exit status 1.
        print("FEHLER: Dynamic FI Artefakte nicht gefunden.")
        sys.exit(1)

# --- 4b) Bai–Ng Faktorauswahl (Variante A, extern im Notebook) ---

def estimate_bai_ng_r(X: np.ndarray, r_candidates) -> int:
    """Pick the number of factors with a Bai–Ng-style information criterion.

    The panel is demeaned column-wise and decomposed by SVD. For every
    admissible candidate ``r`` the criterion

        IC(r) = log(V(r)) + r * g(N, T)
        g(N, T) = (N + T) / (N * T) * log(N * T / (N + T))

    is evaluated, where ``V(r)`` is the average squared residual left after
    removing the first ``r`` principal components. The candidate with the
    smallest IC is returned; on exact ties the candidate listed first wins.

    Parameters
    ----------
    X : array-like of shape (T, N)
        Rectangular panel with T observations (rows) and N series (columns).
        Columns are centred here but NOT rescaled to unit variance.
    r_candidates : iterable of int
        Candidate factor counts (any iterable, including a generator). Only
        values with ``0 <= r < min(T, N)`` are scored.

    Returns
    -------
    int
        The IC-minimising candidate, or ``min(r_candidates)`` when no
        candidate is admissible for the given panel size.

    Raises
    ------
    ValueError
        If ``X`` is not a non-empty 2-D array, if it contains NaN/inf, or if
        ``r_candidates`` is empty.
    """
    # Materialise once so generators survive both the loop and the fallback.
    candidates = [int(r) for r in r_candidates]
    if not candidates:
        raise ValueError("r_candidates must contain at least one value")

    X_np = np.asarray(X, dtype=float)
    if X_np.ndim != 2 or X_np.size == 0:
        raise ValueError(
            f"X must be a non-empty 2-D (T x N) array, got shape {X_np.shape}"
        )
    if not np.isfinite(X_np).all():
        raise ValueError("X contains NaN or infinite values")

    # Centre over the time dimension.
    X_np = X_np - X_np.mean(axis=0, keepdims=True)
    T, N = X_np.shape

    # Scaling by 1/sqrt(T) makes the squared singular values the eigenvalues
    # of (1/T) X'X. Only the singular values are needed, so U and Vt are not
    # computed.
    eigvals = np.linalg.svd(X_np / np.sqrt(T), compute_uv=False) ** 2

    g = (N + T) / (N * T) * np.log(N * T / (N + T))

    # Lower bound for V(r) so that an exactly zero residual yields a finite
    # (very negative) log instead of -inf.
    floor = np.finfo(float).tiny

    ic_values = {}
    for r in candidates:
        if r < 0 or r >= min(T, N):
            continue
        # The eigenvalues already carry the 1/T factor, so dividing the tail
        # sum by N alone gives the average residual variance RSS / (N * T).
        # (An extra 1/T would only shift every IC by the same constant and
        # therefore never changes the selected r.)
        sigma2 = eigvals[r:].sum() / N
        ic_values[r] = np.log(max(sigma2, floor)) + r * g

    if not ic_values:
        # Fallback: no candidate fits the panel -> smallest candidate.
        return min(candidates)

    return int(min(ic_values, key=ic_values.get))

# r* is only estimated when Bai–Ng selection is switched on; otherwise it
# stays None and the grid builders fall back to their full list of r values.
# NOTE(review): r* is computed once from the complete aligned X_ifo panel,
# i.e. including rows that later serve as out-of-sample months, and it is
# also the value used by the dynamic-FI grid although that pipeline is given
# X_full_lagged — confirm both are intended.
# NOTE(review): the candidate list repeats the r values hard-coded in the
# grid builders; keep them in sync.
BAI_NG_R_STAR = (
    estimate_bai_ng_r(X_ifo.values, r_candidates=[2, 5, 10, 20])
    if USE_BAI_NG_SELECTION
    else None
)
if USE_BAI_NG_SELECTION:
    print(f"Bai–Ng geschätzte Faktorzahl: r* = {BAI_NG_R_STAR}")

# --- 5) Config Defaults (Thesis Policy) ---
def get_thesis_cfg() -> GlobalConfig:
    """Create the ``thesis``-preset GlobalConfig with the policy overrides.

    On top of the preset, the policy window is set to 24, the policy decay
    to 0.97 and the selection mode to ``"decayed_best"``.
    """
    policy_overrides = {
        "policy_window": 24,
        "policy_decay": 0.97,
        "selection_mode": "decayed_best",
    }
    thesis_cfg = GlobalConfig(preset="thesis")
    for attr_name, attr_value in policy_overrides.items():
        setattr(thesis_cfg, attr_name, attr_value)
    return thesis_cfg

# Single configuration instance; it is handed to run_stageA and run_stageB
# via their `cfg` argument further down.
cfg_obj = get_thesis_cfg()

# --- 6) Grid Definition ---------------------------------------

def build_grid_full_fe():
    """Build the hyperparameter grid for Setup I (ifo) and Setup II (ifo + target blocks).

    Returns a list of dicts, one per configuration. The grid is the cross
    product of feature-engineering options, target-block sets, sample-weight
    settings and SFM hyperparameters. OLS configurations carry a single
    ``ridge_alpha`` of 0.0, ridge configurations are expanded over the alpha
    list. Reads the module-level ``USE_BAI_NG_SELECTION``, ``BAI_NG_R_STAR``
    and ``SEED``.
    """
    # A) Feature engineering (standard thesis grid), in loop order:
    #    lags, correlation spec, k1 top-k, redundancy parameter, DR method.
    fe_combos = product(
        [tuple(range(7))],
        [
            {"corr_spec": dict(DEFAULT_CORR_SPEC)},
            {"corr_spec": dict(EWMA_CORR_SPEC)},
        ],
        [300, 5000, 50000],
        [0.9, 1.0],
        # SFM special case: no external DR (PCA/PLS); per the original
        # author's note the model performs PCA itself.
        [{"dr_method": "none"}],
    )

    # B) Setup II target blocks (None = Setup I, then Setup II a / II b).
    block_opts = [
        None,
        ["AR1", "Chronos"],
        ["AR1", "Chronos", "TSFresh"],
    ]

    # C) Sample weighting.
    weight_opts = [
        {"sample_weight_decay": None},
        {"sample_weight_decay": 0.99},
    ]

    # D) SFM hyperparameters: r is pinned to the Bai–Ng estimate when that
    #    selection is active and an estimate exists.
    pin_r = USE_BAI_NG_SELECTION and BAI_NG_R_STAR is not None
    factor_counts = [BAI_NG_R_STAR] if pin_r else [2, 5, 10, 20]

    # Link type -> regularisation strengths; OLS needs no alpha sweep.
    alphas_by_reg = {
        "ols": [0.0],
        "ridge": [1e-4, 1e-2, 1.0, 100.0],
    }
    model_settings = [
        {
            "n_factors": n_fac,
            "reg": reg_name,
            "ridge_alpha": alpha,
            "svd_solver": solver,
            "fit_intercept": True,
            "seed": SEED,
        }
        for n_fac, (reg_name, alphas), solver in product(
            factor_counts, alphas_by_reg.items(), ["auto"]
        )
        for alpha in alphas
    ]

    configs = []
    for lags, corr, k1, red, dr in fe_combos:
        fe_part = {
            "lag_candidates": lags,
            "k1_topk": k1,
            "redundancy_param": red,
            **dr,
            **corr,
        }
        for blocks, weights in product(block_opts, weight_opts):
            shared = {**fe_part, "target_block_set": blocks, **weights}
            configs.extend({**shared, **model_hp} for model_hp in model_settings)

    return configs

def build_grid_dynamic_fi():
    """Build the Setup III grid (dynamic feature importance, strict top-N).

    Returns a list of dicts, one per configuration: feature count x
    sample-weight setting x SFM hyperparameters. OLS configurations carry a
    single ``ridge_alpha`` of 0.0, ridge configurations are expanded over the
    alpha list. Reads the module-level ``USE_BAI_NG_SELECTION``,
    ``BAI_NG_R_STAR`` and ``SEED``.
    """
    # Fixed number of top-ranked features.
    # NOTE(review): the file header specifies "top 250 features" for
    # Setup III while this grid uses 50 — confirm which value is intended.
    feature_counts = [50]

    weight_opts = [{"sample_weight_decay": None}, {"sample_weight_decay": 0.99}]

    # SFM parameters (same policy as the full-FE grid): r is pinned to the
    # Bai–Ng estimate when that selection is active and an estimate exists.
    pin_r = USE_BAI_NG_SELECTION and BAI_NG_R_STAR is not None
    factor_counts = [BAI_NG_R_STAR] if pin_r else [2, 5, 10, 20]

    # Link type -> regularisation strengths; OLS needs no alpha sweep.
    alphas_by_reg = {
        "ols": [0.0],
        "ridge": [1e-4, 1e-2, 1.0, 100.0],
    }
    model_settings = [
        {
            "n_factors": n_fac,
            "reg": reg_name,
            "ridge_alpha": alpha,
            "svd_solver": solver,
            "fit_intercept": True,
            "seed": SEED,
        }
        for n_fac, (reg_name, alphas), solver in product(
            factor_counts, alphas_by_reg.items(), ["auto"]
        )
        for alpha in alphas
    ]

    configs = []
    for top_n, weights in product(feature_counts, weight_opts):
        shared = {"n_features_to_use": top_n, **weights}
        configs.extend({**shared, **model_hp} for model_hp in model_settings)
    return configs

# --- 7) Ausführung --------------------------------------------

# Pick the grid and the data arguments for the active pipeline; both tuning
# stages below receive the same data arguments.
if USE_DYNAMIC_FI_PIPELINE:
    grid = build_grid_dynamic_fi()
    print(f"Dynamic FI Grid Größe (Setup III): {len(grid)} Konfigurationen.")
    # X_ifo is passed only as a placeholder here (marked "Dummy" by the
    # original author). NOTE(review): presumably the stage runners work from
    # X_full_lagged / rolling_imp in this mode — confirm in src.tuning.
    _stage_inputs = dict(
        X=X_ifo,
        y=y_fi,
        cfg=cfg_obj,
        X_full_lagged=X_full_lagged,
        rolling_imp=rolling_imp,
    )
else:
    grid = build_grid_full_fe()
    print(f"Full FE Grid Größe (Setup I & II): {len(grid)} Konfigurationen.")
    _stage_inputs = dict(X=X_ifo, y=y, cfg=cfg_obj)

# Stage A: evaluate the grid and keep a shortlist of the top 10.
shortlist = run_stageA(
    model_name=MODEL_NAME,
    model_ctor=lambda hp: ForecastModel(hp),
    model_grid=grid,
    keep_top_k_final=10,
    min_survivors_per_block=0,
    **_stage_inputs,
)

# Stage B: continue with the shortlisted configurations only.
run_stageB(
    model_name=MODEL_NAME,
    model_ctor=lambda hp: ForecastModel(hp),
    shortlist=shortlist,
    **_stage_inputs,
)

print("\nTuning abgeschlossen.")



--- Starte Tuning für: sfm ---
INFO in load_ifo_features: Renaming columns to ensure validity.
Full FE Modus (Setup I/II): 13 Basis-Features.
Bai–Ng geschätzte Faktorzahl: r* = 10
Full FE Grid Größe (Setup I & II): 360 Konfigurationen.
[Stage A] Using FULL FE (Gleis 1 & 2) pipeline.
[Stage A][Block 1] train_end=180, OOS=181-200 | configs=360
  - Config 1/360
    · Month 5/20 processed | running...RMSE=1.7372
    · Month 10/20 processed | running...RMSE=1.3560
    · Month 15/20 processed | running...RMSE=1.2447
    · Month 20/20 processed | running...RMSE=1.1016
  - Config 2/360
    · Month 5/20 processed | running...RMSE=1.7372
    · Month 10/20 processed | running...RMSE=1.3560
    · Month 15/20 processed | running...RMSE=1.2447
    · Month 20/20 processed | running...RMSE=1.1016
  - Config 3/360
    · Month 5/20 processed | running...RMSE=1.7371
    · Month 10/20 processed | running...RMSE=1.3559
    · Month 15/20 processed | running...RMSE=1.2447
    · Month 20/20 processed | runnin