In [3]:
# --- Setup & Imports ---
import os, sys
from pathlib import Path
import pandas as pd
import numpy as np
from itertools import product  # NEU: für flachere Grid-Erzeugung

# --- 1. Pfad-Setup (wie in ET.ipynb) ---
def _locate_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that contains ``src``.

    The resolved start directory and its closest ancestors are inspected,
    five candidates in total at most. If none of them contains an entry
    named ``src``, the resolved start directory itself is returned.
    """
    origin = start.resolve()
    # origin first, then parent, grandparent, ... (the sequence ends at the
    # filesystem root), capped at five candidates overall.
    for candidate in (origin, *origin.parents)[:5]:
        if (candidate / 'src').exists():
            return candidate
    return origin

# The notebook's working directory is the starting point for the repo search.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = _locate_repo_root(NOTEBOOK_DIR)
# Publish the root through the environment as well.
# NOTE(review): presumably read by the src.* modules - confirm.
os.environ['PROJECT_ROOT'] = str(PROJECT_ROOT)
# Put the root first on sys.path (only once) so that the `src` package
# imported below resolves against this checkout.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import GlobalConfig, DEFAULT_CORR_SPEC, EWMA_CORR_SPEC, outputs_for_model
from src.tuning import run_stageA, run_stageB
from src.io_timesplits import (
    load_target, load_ifo_features,
    load_full_lagged_features, load_rolling_importance
)
# --- MODELL-IMPORT (LGBM-Wrapper analog ET/EN API) ---
from src.models.LGBM import ForecastModel  # nutzt LightGBM Regressor-Wrapper

print('PROJECT_ROOT =', PROJECT_ROOT)

# --- Path to the feature-importance outputs (same as ET) ---
FI_PATH = PROJECT_ROOT / "outputs" / "feature_importance" / "outputs_no_missing"

# --- MASTER SWITCH ---
# False => "Full FE" (regular FE pipeline with ifo + optional target blocks)
# True  => "Dynamic FI" (rolling feature importance, top-N)
USE_DYNAMIC_FI_PIPELINE = True

# --- MODEL NAME ---
# The name is passed to outputs_for_model() and to both tuning stages below.
if USE_DYNAMIC_FI_PIPELINE:
    MODEL_NAME = "lightgbm_dynamic_fi"  # separate output folder
else:
    MODEL_NAME = "lightgbm"             # original folder

# Optional: resources / seeds (copied into every hyperparameter dict)
SEED   = 42
N_JOBS = 1

# NOTE(review): presumably prepares the output folders for this model name -
# confirm in src.config; the return value is not used here.
outputs_for_model(MODEL_NAME)
print(f'Modell {MODEL_NAME} wird getunt.')

# --- 2. Load data (same as ET) ---
y = load_target()             # target series (ΔIP with a DatetimeIndex, per the original note)
X_ifo = load_ifo_features()   # ifo features (used by the Full FE pipeline)

if USE_DYNAMIC_FI_PIPELINE:
    # Load the Dynamic-FI artefacts from FI_PATH.
    try:
        X_full_lagged = load_full_lagged_features(base_dir=FI_PATH)
        rolling_imp   = load_rolling_importance(base_dir=FI_PATH)
        # Restrict all three objects to the index labels they have in common.
        # The aligned target is stored as y_fi; y itself stays untouched.
        idx_fi = y.index.intersection(X_full_lagged.index).intersection(rolling_imp.index)
        y_fi, X_full_lagged, rolling_imp = y.loc[idx_fi], X_full_lagged.loc[idx_fi], rolling_imp.loc[idx_fi]
        print('Dynamic-FI Daten geladen. Shapes:', X_full_lagged.shape, rolling_imp.shape)
    except FileNotFoundError as e:
        # Print a hint about the missing prerequisite, then re-raise so the
        # cell still fails.
        print(f"FEHLER: {e}")
        print("Stelle sicher, dass feature_importance.ipynb (entspr. Jobs) vorher gelaufen ist.")
        raise
else:
    # Full FE: no Dynamic-FI inputs; align target and ifo features on their
    # shared index labels (here y and X_ifo are overwritten in place).
    X_full_lagged, rolling_imp = None, None
    idx = y.index.intersection(X_ifo.index)
    y, X_ifo = y.loc[idx], X_ifo.loc[idx]
    print('Full-FE Daten geladen. Shapes:', X_ifo.shape, y.shape)

# --- 3. Base config (Splits & Policy wie in der Thesis) ---
def base_cfg() -> GlobalConfig:
    """Create the base configuration used by both tuning stages.

    Starts from ``GlobalConfig(preset="thesis")`` (per the original note this
    preset loads the thesis splits: 180, 240, ...) and then overrides the
    four policy attributes listed below.
    """
    policy_overrides = {
        'policy_window': 24,
        'policy_decay': 0.95,
        'policy_gain_min': 0.03,
        'policy_cooldown': 3,
    }
    config = GlobalConfig(preset="thesis")
    for attr_name, attr_value in policy_overrides.items():
        setattr(config, attr_name, attr_value)
    return config

cfg0 = base_cfg()

# --- 4. Helper für Korrelations-Spezifikation (gleich wie ET) ---
def make_corr_spec(kind: str) -> dict:
    """Return a fresh (shallow) copy of the correlation spec for *kind*.

    Args:
        kind: ``'expanding'`` selects ``DEFAULT_CORR_SPEC``, ``'ewm'``
            selects ``EWMA_CORR_SPEC``.

    Raises:
        ValueError: if *kind* is neither of the two supported values.
    """
    if kind not in ('expanding', 'ewm'):
        raise ValueError("kind must be 'expanding' oder 'ewm'")
    template = DEFAULT_CORR_SPEC if kind == 'expanding' else EWMA_CORR_SPEC
    # Copy so that callers cannot modify the shared module-level spec.
    return dict(template)

# --- 5. TUNING-GRID (Logik-Switch) ---
if USE_DYNAMIC_FI_PIPELINE:
    # --- GRID FOR Dynamic FI (model HPs + n_features_to_use only) ---
    print("Erstelle HP-Grid für 'Dynamic FI' ...")

    N_FEATURES_TO_USE = 20  # top-N features taken from the rolling FI

    # LGBM hyperparameter candidate lists (compact). The grid builder below
    # takes the cross product of all of them, so every extra value in a list
    # multiplies the number of configurations.
    n_estimators_list       = [1200]
    learning_rate_list      = [0.02, 0.05]
    num_leaves_list         = [64, 128]
    max_depth_list          = [-1, 6]
    min_child_samples_list  = [20, 50]
    colsample_bytree_list   = [0.8, 1.0]
    subsample_list          = [0.8, 1.0]
    bagging_freq_list       = [0]
    max_bin_list            = [255]
    reg_alpha_list          = [0, 1]
    reg_lambda_list         = [0, 1]
    min_split_gain_list     = [0.0]
    min_child_weight_list   = [0.01]  # alternative candidates: [1e-3, 1.0]
    # Alternative candidates: [None, 100]; per the original note val_tail has
    # to be set whenever early stopping is enabled.
    early_stopping_rounds_list =[None]
    val_tail_list           = [None]  # alternative candidates: [None, 24]
    # Sample-weighting variants; each grid point is emitted once per entry.
    weighting_options = [
        {"sample_weight_decay": None},
        {"sample_weight_decay": 0.98},
    ]

    def build_model_grid_dynamic_fi():
        """Return the 'Dynamic FI' hyperparameter grid as a list of dicts.

        Every combination of the module-level LGBM candidate lists is merged
        with the fixed settings (top-N feature count, importance type, seed,
        n_jobs) and emitted once per entry of ``weighting_options``.
        """
        # Name -> candidate values; insertion order fixes both the key order
        # of each dict and the iteration order of the cross product.
        hp_space = {
            'n_estimators': n_estimators_list,
            'learning_rate': learning_rate_list,
            'num_leaves': num_leaves_list,
            'max_depth': max_depth_list,
            'min_child_samples': min_child_samples_list,
            'colsample_bytree': colsample_bytree_list,
            'subsample': subsample_list,
            'bagging_freq': bagging_freq_list,
            'max_bin': max_bin_list,
            'reg_alpha': reg_alpha_list,
            'reg_lambda': reg_lambda_list,
            'min_split_gain': min_split_gain_list,
            'min_child_weight': min_child_weight_list,
            'early_stopping_rounds': early_stopping_rounds_list,
            'val_tail': val_tail_list,
        }
        # Settings shared by every configuration.
        shared = {
            'n_features_to_use': N_FEATURES_TO_USE,
            'importance_type': 'gain',
            'seed': SEED,
            'n_jobs': N_JOBS,
        }
        return [
            {**dict(zip(hp_space, combo)), **shared, **weighting}
            for combo in product(*hp_space.values())
            for weighting in weighting_options
        ]

    model_grid = build_model_grid_dynamic_fi()

else:
    # --- GRID FOR Full FE (ET logic, but LGBM hyperparameters) ---
    print("Erstelle HP-Grid für 'Full FE' ...")

    # A) Feature-engineering parameters (reduced, as in ET.ipynb)
    # Each entry is a (tag, spec) pair; the tag is stored as 'corr_tag'.
    corr_options = [
        # disabled: ("expanding", make_corr_spec("expanding")),
        ("ewm",       make_corr_spec("ewm")),
    ]
    lag_candidates_list = [(0, 1, 2, 3, 6,7, 8, 9, 10, 11)]
    top_k_lags_list     = [1]      # reduced
    use_rm3_list        = [True]   # reduced
    k1_topk_list        = [200, 1000]
    redundancy_param_list = [0.90] # reduced
    # Dimensionality-reduction variants; each dict is merged into the HP set.
    dr_options_list     = [
        {'dr_method': 'none'},
        {'dr_method': 'pca', 'pca_var_target': 0.95, 'pca_kmax': 50},
        # disabled: {'dr_method': 'pls', 'pls_components': 8},
    ]

    # B) LGBM hyperparameters (lean, ET style)
    n_estimators_list       = [600]
    learning_rate_list      = [0.02]
    num_leaves_list         = [31, 64]
    max_depth_list          = [-1, 6]
    min_child_samples_list  = [300]
    colsample_bytree_list   = [0.8, 1.0]
    subsample_list          = [0.8, 1.0]
    bagging_freq_list       = [0]
    max_bin_list            = [255]
    reg_alpha_list          = [0, 1]
    reg_lambda_list         = [0, 1]
    min_split_gain_list     = [0.0]
    min_child_weight_list   = [1e-3, 1.0]
    # Per the original note, val_tail=24 is the sensible partner of
    # early_stopping_rounds=100.
    # NOTE(review): a plain cross product of the two lists below also
    # contains early_stopping_rounds=100 together with val_tail=None.
    early_stopping_rounds_list = [None, 100]
    val_tail_list           = [None, 24]

    # C) Target blocks & weighting
    target_block_options = [None, ["AR1", "Chronos"]]
    weighting_options    = [{"sample_weight_decay": None}]  # reduced
    # D) Grid zusammensetzen
    def build_model_grid_full_fe():
        """Return the 'Full FE' hyperparameter grid as a list of dicts.

        The grid is the cross product of the correlation options, the
        feature-engineering / dimensionality-reduction lists, the LGBM
        hyperparameter lists, the target-block options and the weighting
        options, all read from the module-level candidate lists.

        LGBM combinations that enable early stopping
        (``early_stopping_rounds`` is not None) without a ``val_tail`` are
        skipped: the notes next to the candidate lists say ``val_tail`` must
        be set in that case, and LightGBM's early stopping needs validation
        data. NOTE(review): this assumes ``val_tail`` is what provides the
        validation slice inside ``ForecastModel`` - confirm in
        src.models.LGBM.

        Returns:
            list[dict]: one hyperparameter dict per configuration, in the
            nesting order corr -> FE/DR -> LGBM -> target block -> weighting.
        """
        # FE/DR candidate lists, in the order they are unpacked below.
        fe_lists = [
            lag_candidates_list,      # lags
            top_k_lags_list,          # k_lags
            use_rm3_list,             # rm3
            k1_topk_list,             # k1
            redundancy_param_list,    # red
            dr_options_list,          # dr_opt (dict)
        ]

        lgbm_names = [
            'n_estimators','learning_rate','num_leaves','max_depth','min_child_samples',
            'colsample_bytree','subsample','bagging_freq','max_bin','reg_alpha',
            'reg_lambda','min_split_gain','min_child_weight',
            'early_stopping_rounds','val_tail'
        ]
        lgbm_lists = [
            n_estimators_list, learning_rate_list, num_leaves_list, max_depth_list, min_child_samples_list,
            colsample_bytree_list, subsample_list, bagging_freq_list, max_bin_list, reg_alpha_list,
            reg_lambda_list, min_split_gain_list, min_child_weight_list,
            early_stopping_rounds_list, val_tail_list
        ]

        # The LGBM combinations do not depend on the FE settings, so they are
        # enumerated (and validated) once instead of once per FE combination.
        lgbm_combos = []
        for lvals in product(*lgbm_lists):
            lgbm_hp = dict(zip(lgbm_names, lvals))
            if lgbm_hp['early_stopping_rounds'] is not None and lgbm_hp['val_tail'] is None:
                # Early stopping without a validation tail: skip.
                continue
            lgbm_combos.append(lgbm_hp)

        hp_grid = []
        for corr_tag, corr_spec in corr_options:
            for lags, k_lags, rm3, k1, red, dr_opt in product(*fe_lists):
                # Restriction carried over from the original notebook: with
                # k1 == 100 only the variant without dimensionality reduction
                # is kept (only relevant if 100 is part of k1_topk_list).
                if k1 == 100 and dr_opt['dr_method'] != 'none':
                    continue

                # FE/DR part shared by all LGBM combinations of this FE setup.
                base = {
                    'lag_candidates': lags,
                    'top_k_lags_per_feature': k_lags,
                    'use_rm3': rm3,
                    'k1_topk': k1,
                    'redundancy_param': red,
                    **dr_opt,
                    'corr_tag': corr_tag,
                    'corr_spec': corr_spec,
                }

                for lgbm_hp in lgbm_combos:
                    for block_set in target_block_options:
                        for w in weighting_options:
                            hp_grid.append({
                                **base,
                                **lgbm_hp,
                                'target_block_set': block_set,
                                'importance_type': 'gain',
                                'seed': SEED,
                                'n_jobs': N_JOBS,
                                **w,
                            })

        return hp_grid

    model_grid = build_model_grid_full_fe()

print("Optimierte HP-Kombinationen:", len(model_grid))
print("Erstes HP-Set:", model_grid[0] if model_grid else "Grid ist leer")

# --- 6. Stage A/B Lauf (Logik-Switch wie ET) ---
if not model_grid:
    print("Keine gültigen HP-Kombinationen gefunden, Stages übersprungen.")
else:
    # Both pipelines run the same two stages; they differ only in the target
    # series and in the extra Dynamic-FI inputs handed to the stage functions.
    if USE_DYNAMIC_FI_PIPELINE:
        # Dynamic FI: X_ifo is passed only as a placeholder (per the original
        # note it is not used in this pipeline).
        stage_target = y_fi
        stage_extras = {'X_full_lagged': X_full_lagged, 'rolling_imp': rolling_imp}
    else:
        # Full FE: plain ifo features and target, no extra inputs.
        stage_target = y
        stage_extras = {}

    # Stage A: evaluate the whole grid and keep a shortlist.
    shortlist = run_stageA(
        model_name=MODEL_NAME,
        model_ctor=lambda hp: ForecastModel(hp),
        model_grid=model_grid,
        X=X_ifo,
        y=stage_target,
        cfg=cfg0,
        keep_top_k_final=min(5, len(model_grid)),
        min_survivors_per_block=max(1, len(model_grid)//4),
        **stage_extras,
    )

    # Stage B: continue with the shortlisted configurations only.
    run_stageB(
        model_name=MODEL_NAME,
        model_ctor=lambda hp: ForecastModel(hp),
        shortlist=shortlist,
        X=X_ifo,
        y=stage_target,
        cfg=cfg0,
        **stage_extras,
    )

print(f"\nDone. Check outputs/stageA|stageB/{MODEL_NAME} for results.")



PROJECT_ROOT = /Users/jonasschernich/Documents/Masterarbeit/Code
Modell lightgbm_dynamic_fi wird getunt.
INFO in load_ifo_features: Renaming columns to ensure validity.
Dynamic-FI Daten geladen. Shapes: (407, 2160) (407, 2160)
Erstelle HP-Grid für 'Dynamic FI' ...
Optimierte HP-Kombinationen: 512
Erstes HP-Set: {'n_estimators': 1200, 'learning_rate': 0.02, 'num_leaves': 64, 'max_depth': -1, 'min_child_samples': 20, 'colsample_bytree': 0.8, 'subsample': 0.8, 'bagging_freq': 0, 'max_bin': 255, 'reg_alpha': 0, 'reg_lambda': 0, 'min_split_gain': 0.0, 'min_child_weight': 0.01, 'early_stopping_rounds': None, 'val_tail': None, 'n_features_to_use': 20, 'importance_type': 'gain', 'seed': 42, 'n_jobs': 1, 'sample_weight_decay': None}
[Stage A] Using DYNAMIC FI (Gleis 3) pipeline.
[Stage A][Block 1] train_end=180, OOS=181-200 | configs=512
  - Config 1/512
    · Month 5/20 processed | running...RMSE=1.2558
    · Month 10/20 processed | running...RMSE=1.0078
    · Month 15/20 processed | running..