In [1]:
# ==============================================================================
# LightGBM – Thesis Tuning Pipeline
# ==============================================================================
# Spezifikation gemäß Masterarbeit:
# - Setup I & II: Volles FE-Grid.
# - Setup III: Dynamic FI (Top 50 Features).
# - Model Hyperparameter exakt nach Thesis Summary.
# - Early Stopping: Aktiviert mit val_tail=24.
# - Stage A Shortlist: Top 10 frozen.
# ==============================================================================

import os, sys
from pathlib import Path
import numpy as np
import pandas as pd
from itertools import product

# --- 1) Pfad-Setup ---
def _locate_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that contains ``src``.

    The search covers the resolved *start* directory and at most five of its
    ancestors (six directories in total, closest first). If none of them has
    a ``src`` entry, the resolved *start* path itself is returned.
    """
    origin = start.resolve()
    # `parents` ends at the filesystem root, so the walk can never go past it.
    search_path = [origin, *origin.parents][:6]
    for folder in search_path:
        if (folder / "src").exists():
            return folder
    return origin

# The notebook's working directory is the starting point for the repo search.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = _locate_repo_root(NOTEBOOK_DIR)

# Publish the root through the environment as well.
os.environ.update({"PROJECT_ROOT": str(PROJECT_ROOT)})

# Put the root at the front of the import path (once) so that the
# `src.*` imports further down resolve against this repository.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

# --- 2) Imports ---
from src.config import (
    GlobalConfig,
    DEFAULT_CORR_SPEC,
    EWMA_CORR_SPEC,
    outputs_for_model,
)
from src.tuning import run_stageA, run_stageB
from src.io_timesplits import (
    load_target,
    load_ifo_features,
    load_full_lagged_features,
    load_rolling_importance,
)
# WICHTIG: Modell-Import für LGBM
from src.models.LGBM import ForecastModel

# --- 3) Konfiguration ---
# True  -> dynamic feature-importance pipeline (Setup III)
# False -> standard full feature-engineering setup (Setup I & II)
USE_DYNAMIC_FI_PIPELINE = True
SEED = 42
# Author's note: LightGBM parallelises internally, but inside a grid search
# a value of 1 is often the better choice.
N_JOBS = 1

# The model name is what distinguishes the two pipelines in later calls.
MODEL_NAME = "lightgbm_dynamic_fi" if USE_DYNAMIC_FI_PIPELINE else "lightgbm_dummy"

# Return value is intentionally unused; presumably this prepares the output
# location for the model — confirm in src.config.
outputs_for_model(MODEL_NAME)
print(f"--- Starte Tuning für: {MODEL_NAME} ---")

# --- 4) Daten laden ---
y = load_target()
X_ifo = load_ifo_features()

# Align target and ifo features on the index labels they have in common.
idx_common = y.index.intersection(X_ifo.index)
y = y.loc[idx_common]
X_ifo = X_ifo.loc[idx_common]

# Only populated in dynamic-FI mode; they stay None for Setup I/II.
X_full_lagged = None
rolling_imp = None
y_fi = None

if USE_DYNAMIC_FI_PIPELINE:
    FI_BASE_DIR = PROJECT_ROOT / "outputs" / "feature_importance" / "outputs_no_missing"
    # Keep the try body to the two loaders: they are the only statements
    # expected to raise FileNotFoundError here.
    try:
        X_full_lagged = load_full_lagged_features(base_dir=FI_BASE_DIR)
        rolling_imp   = load_rolling_importance(base_dir=FI_BASE_DIR)
    except FileNotFoundError as exc:
        # Include the underlying error so the missing path is visible, and
        # report on stderr since the run is aborted with exit status 1.
        print(f"FEHLER: Dynamic FI Artefakte nicht gefunden: {exc}", file=sys.stderr)
        sys.exit(1)

    # Align target, lagged features and rolling importances on the index
    # labels shared by all three.
    idx_fi = y.index.intersection(X_full_lagged.index).intersection(rolling_imp.index)
    y_fi          = y.loc[idx_fi]
    X_full_lagged = X_full_lagged.loc[idx_fi]
    rolling_imp   = rolling_imp.loc[idx_fi]
    print(f"Dynamic FI Modus: {X_full_lagged.shape[1]} Features geladen.")
else:
    print(f"Full FE Modus (Setup I/II): {X_ifo.shape[1]} Basis-Features.")

# --- 5) Config Defaults (Thesis Policy) ---
def get_thesis_cfg() -> GlobalConfig:
    """Create a ``GlobalConfig`` from the "thesis" preset and apply the policy overrides.

    Returns:
        The configured ``GlobalConfig`` instance with ``policy_window``,
        ``policy_decay`` and ``selection_mode`` set explicitly.
    """
    policy_overrides = {
        "policy_window": 24,
        "policy_decay": 0.97,
        "selection_mode": "decayed_best",
    }
    config = GlobalConfig(preset="thesis")
    for attr_name, attr_value in policy_overrides.items():
        setattr(config, attr_name, attr_value)
    return config

# Single config instance; passed as `cfg` to both run_stageA and run_stageB below.
cfg_obj = get_thesis_cfg()

# --- 6) Grid Definition ---------------------------------------

def build_grid_full_fe():
    """Build the full feature-engineering hyperparameter grid (Setup I / II).

    The grid is the Cartesian product of the feature-engineering options,
    the target-block sets, the sample-weight variants and the LightGBM
    hyperparameters, enumerated in exactly that nesting order (the
    right-most axis varies fastest).

    Returns:
        A list of dicts, one per configuration. Every dict carries the
        feature-engineering keys, ``target_block_set``, the sample-weight
        key, the LightGBM hyperparameters, the early-stopping settings and
        the fixed parameters (including ``n_jobs`` and ``seed`` taken from
        the module-level ``N_JOBS`` / ``SEED``).
    """
    # A) Feature engineering and dimensionality reduction.
    lag_sets = [tuple(range(7))]
    # Each spec is copied once here; all configs using a spec share that copy.
    corr_variants = [
        {"corr_spec": dict(DEFAULT_CORR_SPEC)},
        {"corr_spec": dict(EWMA_CORR_SPEC)},
    ]
    top_k_values = [700]
    redundancy_values = [0.9, 1.0]
    reducer_variants = [
        {"dr_method": "none"},
        {"dr_method": "pca", "pca_kmax": 30, "pca_var_target": 0.99},
        {"dr_method": "pls", "pls_components": 30},
    ]

    # B) Target blocks. Setup I would be the entry `None`; it is currently
    #    left out, so only the Setup II block set is searched.
    target_block_sets = [
        ["AR1", "Chronos", "TSFresh"],
    ]

    # C) Sample weighting: without decay and with decay 0.99.
    decay_variants = [
        {"sample_weight_decay": None},
        {"sample_weight_decay": 0.99},
    ]

    # D) LightGBM hyperparameters. n_estimators is deliberately large; per
    #    the original notes the effective number of trees is governed by
    #    early stopping (patience 50, val_tail 24).
    learning_rates = [0.02, 0.05]
    leaf_counts = [31, 63, 127]
    depth_limits = [4, 7, -1]
    min_child_counts = [50, 150]
    column_fractions = [0.8]
    l2_penalties = [0, 1, 10]
    tree_budgets = [5000]
    patience_values = [50]
    val_tail_values = [24]

    # Parameters that are identical for every configuration.
    fixed_params = {
        "min_child_weight": 1e-3,
        "importance_type": "gain",
        "n_jobs": N_JOBS,
        "seed": SEED,
    }

    # One flat product reproduces the nested-loop order: FE options first,
    # then blocks/weights, then the model hyperparameters.
    return [
        {
            "lag_candidates": lags,
            "k1_topk": top_k,
            "redundancy_param": redundancy,
            **reducer,
            **corr,
            "target_block_set": blocks,
            **decay,
            "n_estimators": n_trees,
            "learning_rate": rate,
            "num_leaves": leaves,
            "max_depth": depth,
            "min_child_samples": min_child,
            "colsample_bytree": col_frac,
            "reg_lambda": l2,
            "early_stopping_rounds": patience,
            "val_tail": tail,
            **fixed_params,
        }
        for (
            lags, corr, top_k, redundancy, reducer,
            blocks, decay,
            rate, leaves, depth, min_child, col_frac, l2,
            n_trees, patience, tail,
        ) in product(
            lag_sets, corr_variants, top_k_values, redundancy_values, reducer_variants,
            target_block_sets, decay_variants,
            learning_rates, leaf_counts, depth_limits,
            min_child_counts, column_fractions, l2_penalties,
            tree_budgets, patience_values, val_tail_values,
        )
    ]

def build_grid_dynamic_fi():
    """Build the hyperparameter grid for Setup III (dynamic feature importance, strict top-N).

    The grid is the Cartesian product of the feature counts, the
    sample-weight variants and the LightGBM hyperparameters, enumerated in
    that nesting order (the right-most axis varies fastest).

    Returns:
        A list of dicts, one per configuration. Every dict carries
        ``n_features_to_use``, the sample-weight key, the LightGBM
        hyperparameters, the early-stopping settings and the fixed
        parameters (including ``n_jobs`` and ``seed`` taken from the
        module-level ``N_JOBS`` / ``SEED``).
    """
    # Number of top-ranked features to keep; fixed at 50.
    feature_counts = [50]

    # Sample weighting: without decay and with decay 0.99.
    decay_variants = [{"sample_weight_decay": None}, {"sample_weight_decay": 0.99}]

    # LightGBM hyperparameters. n_estimators is large; the early-stopping
    # settings (patience 50, val_tail 24) are passed along in every config.
    learning_rates = [0.02, 0.05]
    leaf_counts = [31, 63, 127]
    depth_limits = [4, 7, -1]
    min_child_counts = [50, 150]
    column_fractions = [0.6, 0.9]
    l2_penalties = [1, 10]
    tree_budgets = [5000]
    patience_values = [50]
    val_tail_values = [24]

    # Parameters that are identical for every configuration.
    fixed_params = {
        "min_child_weight": 1e-3,
        "importance_type": "gain",
        "n_jobs": N_JOBS,
        "seed": SEED,
    }

    # One flat product reproduces the nested-loop order of the grid.
    return [
        {
            "n_features_to_use": n_feat,
            **decay,
            "n_estimators": n_trees,
            "learning_rate": rate,
            "num_leaves": leaves,
            "max_depth": depth,
            "min_child_samples": min_child,
            "colsample_bytree": col_frac,
            "reg_lambda": l2,
            "early_stopping_rounds": patience,
            "val_tail": tail,
            **fixed_params,
        }
        for (
            n_feat, decay,
            rate, leaves, depth, min_child, col_frac, l2,
            n_trees, patience, tail,
        ) in product(
            feature_counts, decay_variants,
            learning_rates, leaf_counts, depth_limits,
            min_child_counts, column_fractions, l2_penalties,
            tree_budgets, patience_values, val_tail_values,
        )
    ]

# --- 7) Ausführung --------------------------------------------

# Pick the grid and the data arguments for the active pipeline. The stage
# calls further down are shared between both modes.
if USE_DYNAMIC_FI_PIPELINE:
    grid = build_grid_dynamic_fi()
    print(f"Dynamic FI Grid Größe (Setup III): {len(grid)} Konfigurationen.")
    _stage_inputs = {
        "X": X_ifo,  # marked as a dummy by the author in this mode
        "y": y_fi,
        "cfg": cfg_obj,
        "X_full_lagged": X_full_lagged,
        "rolling_imp": rolling_imp,
    }
else:
    grid = build_grid_full_fe()
    print(f"Full FE Grid Größe (Setup I & II): {len(grid)} Konfigurationen.")
    # Author's warning: this grid is huge, so the Stage A filtering is critical.
    _stage_inputs = {"X": X_ifo, "y": y, "cfg": cfg_obj}

# Stage A: evaluate the whole grid and obtain a shortlist.
shortlist = run_stageA(
    model_name=MODEL_NAME,
    model_ctor=lambda hp: ForecastModel(hp),
    model_grid=grid,
    keep_top_k_final=5,
    min_survivors_per_block=5,
    **_stage_inputs,
)

# Stage B: continue with the shortlist on the same data arguments.
run_stageB(
    model_name=MODEL_NAME,
    model_ctor=lambda hp: ForecastModel(hp),
    shortlist=shortlist,
    **_stage_inputs,
)

print("\nTuning abgeschlossen.")

--- Starte Tuning für: lightgbm_dynamic_fi ---
INFO in load_ifo_features: Renaming columns to ensure validity.
Dynamic FI Modus: 4320 Features geladen.
Dynamic FI Grid Größe (Setup III): 288 Konfigurationen.
[Stage A] Using DYNAMIC FI (Gleis 3) pipeline.
[Stage A][Block 1] train_end=180, OOS=181-200 | configs=288
  - Config 1/288
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[217]	valid_0's l2: 1.4601
    · Month 5/20 processed | running...RMSE=1.6798
    · Month 10/20 processed | running...RMSE=1.4289
    · Month 15/20 processed | running...RMSE=1.3638
    · Month 20/20 processed | running...RMSE=1.2456
  - Config 2/288
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[214]	valid_0's l2: 1.46151
    · Month 5/20 processed | running...RMSE=1.6900
    · Month 10/20 processed | running...RMSE=1.4203
    · Month 15/20 processed | running...RMSE=1.3494
    · Month 20/20 processed | running...RM