In [1]:
# ==============================================================================
# Extra Trees – Thesis Tuning Pipeline
# ==============================================================================
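# Specification per the master's thesis:
# - Setup I (ifo only) vs. Setup II (ifo + target blocks) integrated in the grid.
# - FE: lags 0-6 fixed, SIS {300, 5k, 50k}, redundancy {0.9, 1.0}, DR {None, PCA, PLS}.
# - Model: n_est {500,1k,2k}, depth {None,4,8}, leaf {1,5,10}, feat {sqrt,0.1,0.25}.
# - Stage A shortlist: top 10 frozen.
# - Dynamic FI (Setup III): strictly the top 50 features.
# NOTE(review): the code below currently uses k1_topk=[700], keep_top_k_final=5
# and only target_block_set=None (Setup II list commented out) - confirm which
# values are the intended ones.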
# ==============================================================================

import os, sys
from pathlib import Path
import numpy as np
import pandas as pd
from itertools import product

# --- 1) Pfad-Setup ---
def _locate_repo_root(start: Path) -> Path:
    """Find the nearest directory at or above *start* that contains ``src``.

    The resolved *start* and at most its five closest ancestors are inspected,
    nearest first.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first inspected directory that has an entry named ``src``, or the
        resolved *start* itself when none of the inspected directories does.
    """
    origin = start.resolve()
    # Six candidates in total: the start directory plus up to five parents.
    candidates = (origin, *origin.parents)[:6]
    for candidate in candidates:
        if (candidate / "src").exists():
            return candidate
    return origin

# Derive the project root from the current working directory, publish it via
# the PROJECT_ROOT environment variable and put it first on sys.path (the
# ``src`` imports below rely on it being importable).
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = _locate_repo_root(NOTEBOOK_DIR)
_project_root_entry = str(PROJECT_ROOT)
os.environ["PROJECT_ROOT"] = _project_root_entry
if _project_root_entry not in sys.path:
    sys.path.insert(0, _project_root_entry)

# --- 2) Imports ---
from src.config import (
    GlobalConfig,
    DEFAULT_CORR_SPEC,
    EWMA_CORR_SPEC,
    outputs_for_model,
)
from src.tuning import run_stageA, run_stageB
from src.io_timesplits import (
    load_target,
    load_ifo_features,
    load_full_lagged_features,
    load_rolling_importance,
)
# WICHTIG: Modell-Import für Extra Trees
from src.models.ET import ForecastModel

# --- 3) Konfiguration ---
# Pipeline switch: True selects the dynamic feature-importance pipeline,
# False the standard setup (I & II).
USE_DYNAMIC_FI_PIPELINE = True
SEED = 42

# The model name follows the selected pipeline.
MODEL_NAME = "extra_trees_dynamic_fi" if USE_DYNAMIC_FI_PIPELINE else "extra_trees"

# NOTE(review): presumably prepares the output location for this model name -
# confirm in src.config.
outputs_for_model(MODEL_NAME)
print(f"--- Starte Tuning für: {MODEL_NAME} ---")

# --- 4) Daten laden ---
y = load_target()
X_ifo = load_ifo_features()

# Restrict target and ifo features to the index labels both of them share.
idx_common = y.index.intersection(X_ifo.index)
y = y.loc[idx_common]
X_ifo = X_ifo.loc[idx_common]

# Only populated when the dynamic-FI pipeline is active; None otherwise.
X_full_lagged = None
rolling_imp = None
y_fi = None

if USE_DYNAMIC_FI_PIPELINE:
    FI_BASE_DIR = PROJECT_ROOT / "outputs" / "feature_importance" / "outputs_no_missing"
    # The try body covers only the two loaders, so the "artifacts not found"
    # message is tied to the calls that read from FI_BASE_DIR.
    try:
        X_full_lagged = load_full_lagged_features(base_dir=FI_BASE_DIR)
        rolling_imp   = load_rolling_importance(base_dir=FI_BASE_DIR)
    except FileNotFoundError as err:
        print("FEHLER: Dynamic FI Artefakte nicht gefunden.")
        # Report what was missing and where it was expected instead of
        # discarding the exception.
        print(f"  Ursache: {err}")
        print(f"  Erwartetes Verzeichnis: {FI_BASE_DIR}")
        sys.exit(1)

    # Keep only the index labels present in the target, the lagged feature
    # matrix and the rolling importances.
    idx_fi = y.index.intersection(X_full_lagged.index).intersection(rolling_imp.index)
    y_fi          = y.loc[idx_fi]
    X_full_lagged = X_full_lagged.loc[idx_fi]
    rolling_imp   = rolling_imp.loc[idx_fi]
    # NOTE(review): X_ifo stays aligned to idx_common, not idx_fi, yet it is
    # later passed together with y_fi - confirm the tuning stages do not rely
    # on X and y sharing the same index in dynamic-FI mode.
    print(f"Dynamic FI Modus: {X_full_lagged.shape[1]} Features geladen.")
else:
    print(f"Full FE Modus (Setup I/II): {X_ifo.shape[1]} Basis-Features.")

# --- 5) Config Defaults (Thesis Policy) ---
def get_thesis_cfg() -> GlobalConfig:
    """Create the configuration object used for the thesis tuning runs.

    Starts from ``GlobalConfig(preset="thesis")`` and then overrides three
    attributes on it.

    Returns:
        The ``GlobalConfig`` instance with ``policy_window=24``,
        ``policy_decay=0.97`` and ``selection_mode="decayed_best"`` set.
    """
    thesis_cfg = GlobalConfig(preset="thesis")
    policy_overrides = {
        "policy_window": 24,
        "policy_decay": 0.97,
        "selection_mode": "decayed_best",
    }
    # Applied in declaration order, one attribute assignment per entry.
    for attr_name, attr_value in policy_overrides.items():
        setattr(thesis_cfg, attr_name, attr_value)
    return thesis_cfg

# Single configuration instance, passed as ``cfg`` to the Stage A and Stage B calls below.
cfg_obj = get_thesis_cfg()

# --- 6) Grid Definition ---------------------------------------

def build_grid_full_fe():
    """Build the hyperparameter grid for the full feature-engineering pipeline.

    Every combination of the feature-engineering, target-block, sample-weight
    and Extra Trees options defined below becomes one flat configuration dict.

    Only Setup I (``target_block_set=None``) is currently generated; the
    Setup II block list ``["AR1", "Chronos", "TSFresh"]`` is disabled.

    Returns:
        list[dict]: One dict per combination. Feature-engineering options vary
        slowest, then target blocks and weights, and the model hyperparameters
        vary fastest. Each dict also carries ``"seed": SEED``.
    """
    # A) Feature engineering and dimensionality reduction.
    lag_sets = [tuple(range(7))]  # lags 0..6 as a single fixed candidate
    corr_variants = [
        {"corr_spec": dict(DEFAULT_CORR_SPEC)},
        {"corr_spec": dict(EWMA_CORR_SPEC)},
    ]
    topk_values = [700]
    redundancy_values = [0.9, 1.0]
    dr_variants = [
        {"dr_method": "none"},
        {"dr_method": "pca", "pca_kmax": 30, "pca_var_target": 0.99},
        {"dr_method": "pls", "pls_components": 30},
    ]

    # B) Target blocks: None corresponds to Setup I. The Setup II entry
    #    ["AR1", "Chronos", "TSFresh"] is intentionally not part of the list.
    block_sets = [None]

    # C) Sample weighting.
    weight_variants = [
        {"sample_weight_decay": None},
        {"sample_weight_decay": 0.99},
    ]

    # D) Extra Trees hyperparameters, expanded once into ready-made dicts.
    model_space = {
        "n_estimators": [500, 1000, 2000],
        "max_depth": [None, 4, 8],
        "min_samples_leaf": [1, 5, 10],
        "max_features": ["sqrt", 0.1, 0.25],
    }
    model_keys = tuple(model_space)
    model_variants = [
        dict(zip(model_keys, values)) for values in product(*model_space.values())
    ]

    return [
        {
            "lag_candidates": lags,
            "k1_topk": topk,
            "redundancy_param": redundancy,
            **dr,
            **corr,
            "target_block_set": blocks,
            **weights,
            **model_hp,
            "seed": SEED,
        }
        for lags, corr, topk, redundancy, dr in product(
            lag_sets, corr_variants, topk_values, redundancy_values, dr_variants
        )
        for blocks, weights in product(block_sets, weight_variants)
        for model_hp in model_variants
    ]

def build_grid_dynamic_fi():
    """Build the hyperparameter grid for the dynamic-FI pipeline (Setup III).

    The grid is the full cross product of a fixed feature count (50), the two
    sample-weight settings and the Extra Trees hyperparameter lists.

    Returns:
        list[dict]: One dict per combination with the keys
        ``n_features_to_use``, ``sample_weight_decay``, ``n_estimators``,
        ``max_depth``, ``min_samples_leaf``, ``max_features`` and ``seed``.
        Later keys vary faster than earlier ones.
    """
    # Declaration order fixes both the key order and the iteration order.
    search_space = {
        "n_features_to_use": [50],
        "sample_weight_decay": [None, 0.99],
        "n_estimators": [500, 1000, 2000],
        "max_depth": [None, 4, 8],
        "min_samples_leaf": [1, 5, 10],
        "max_features": ["sqrt", 0.1, 0.25],
    }
    param_names = tuple(search_space)
    return [
        {**dict(zip(param_names, combo)), "seed": SEED}
        for combo in product(*search_space.values())
    ]

# --- 7) Ausführung --------------------------------------------

# Choose the grid, the target series and the extra keyword inputs for the
# active pipeline; both stages are then called once with shared arguments.
if USE_DYNAMIC_FI_PIPELINE:
    grid = build_grid_dynamic_fi()
    print(f"Dynamic FI Grid Größe (Setup III): {len(grid)} Konfigurationen.")
    stage_target = y_fi
    fi_inputs = {"X_full_lagged": X_full_lagged, "rolling_imp": rolling_imp}
else:
    grid = build_grid_full_fe()
    print(f"Full FE Grid Größe (Setup I & II): {len(grid)} Konfigurationen.")
    # With the option lists currently in build_grid_full_fe this is 1,944
    # configurations (12 FE x 2 weight x 81 model combinations).
    # NOTE(review): Stage A is expected to prune these aggressively (ASHA) -
    # confirm against src.tuning.
    stage_target = y
    fi_inputs = {}

# Stage A: evaluate the grid and return the shortlist.
shortlist = run_stageA(
    model_name=MODEL_NAME,
    model_ctor=lambda hp: ForecastModel(hp),
    model_grid=grid,
    X=X_ifo,
    y=stage_target,
    cfg=cfg_obj,
    **fi_inputs,
    keep_top_k_final=5,
    min_survivors_per_block=5,
)

# Stage B: continue with the shortlisted configurations only.
run_stageB(
    model_name=MODEL_NAME,
    model_ctor=lambda hp: ForecastModel(hp),
    shortlist=shortlist,
    X=X_ifo,
    y=stage_target,
    cfg=cfg_obj,
    **fi_inputs,
)

print("\nTuning abgeschlossen.")

--- Starte Tuning für: extra_trees_dynamic_fi ---
INFO in load_ifo_features: Renaming columns to ensure validity.
Dynamic FI Modus: 4320 Features geladen.
Dynamic FI Grid Größe (Setup III): 162 Konfigurationen.
[Stage A] Using DYNAMIC FI (Gleis 3) pipeline.
[Stage A][Block 1] train_end=180, OOS=181-200 | configs=162
  - Config 1/162
    · Month 5/20 processed | running...RMSE=1.7171
    · Month 10/20 processed | running...RMSE=1.3879
    · Month 15/20 processed | running...RMSE=1.2999
    · Month 20/20 processed | running...RMSE=1.1690
  - Config 2/162
    · Month 5/20 processed | running...RMSE=1.6805
    · Month 10/20 processed | running...RMSE=1.3701
    · Month 15/20 processed | running...RMSE=1.2859
    · Month 20/20 processed | running...RMSE=1.1556
  - Config 3/162
    · Month 5/20 processed | running...RMSE=1.7602
    · Month 10/20 processed | running...RMSE=1.4236
    · Month 15/20 processed | running...RMSE=1.3262
    · Month 20/20 processed | running...RMSE=1.1936
  - Config