# LightGBM – Real Data (v2) — robustere Defaults

**Ziel:** bessere Robustheit für monatliche ifo-Paneldaten mit \(h=1\), ohne Target-only Blöcke.


In [1]:

import os, sys
from pathlib import Path
import numpy as np, pandas as pd

PROJECT_ROOT = Path.cwd().resolve().parent
os.environ["PROJECT_ROOT"] = str(PROJECT_ROOT)
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import GlobalConfig, DEFAULT_CORR_SPEC, EWMA_CORR_SPEC, outputs_for_model, STAGEA_DIR, STAGEB_DIR
from src.tuning import run_stageA, run_stageB
from src.io_timesplits import load_target, load_ifo_features
import src.features as F
from src.models.lgbm import ForecastModel

print("PROJECT_ROOT:", PROJECT_ROOT)


PROJECT_ROOT: /Users/jonasschernich/Documents/Masterarbeit/Code


In [2]:

# 1) Daten laden
y = load_target()
X = load_ifo_features()
idx = y.index.intersection(X.index)
y, X = y.loc[idx], X.loc[idx]
print("Shapes:", X.shape, y.shape, "| Dates:", y.index.min(), "→", y.index.max())


Shapes: (407, 2160) (407,) | Dates: 1991-02-01 00:00:00 → 2024-12-01 00:00:00


In [3]:
print("y_train.std():", y.std())

y_train.std(): 1.9053044058244144


In [12]:
y[1:240].std()

1.5139174564356337

In [3]:

# 2) Target-only aus
def _empty_block(y_series, I_t: int, W: int=12):
    import pandas as pd
    return pd.DataFrame(index=[y_series.index[I_t-1]])
F.tsfresh_block = _empty_block
F.chronos_block = _empty_block


In [4]:
# =============== Quick Checks ===============
from pathlib import Path
import numpy as np, pandas as pd
from src.models.lgbm import ForecastModel
from src.config import GlobalConfig, DEFAULT_CORR_SPEC

print("---- BACKEND CHECK ----")
print("ForecastModel.get_name():", ForecastModel({}).get_name())
print("(Erwartet: 'lgbm[lightgbm]'; wenn 'hgb_fallback', ist LightGBM nicht installiert.)")

print("\n---- TARGET CHECK ----")
print("y.head():")
display(y.head())
print("y.describe():")
display(y.describe())
print("y |abs|-Quantile (0.5/0.9/0.99):", y.abs().quantile([0.5,0.9,0.99]).to_dict())
# Wenn hier Mittelwerte/Streuung absurd groß sind, liest du vermutlich Levels statt ΔIP ein.

print("\n---- RAW DESIGN (vor FE) – Basisdiagnose im ersten Trainfenster ----")
cfg_tmp = GlobalConfig()
cfg_tmp.corr_spec = dict(DEFAULT_CORR_SPEC)
W0 = getattr(cfg, "W0_A", 180)  # nimm deine echte Config, falls vorhanden
train_idx = y.index[:W0]
X0 = X.loc[train_idx]
y0 = y.loc[train_idx]

const_cols = (X0.std(ddof=1) == 0).sum()
nan_cols = X0.isna().any(axis=0).sum()
print(f"N={len(y0)}, P={X0.shape[1]}, const_cols={int(const_cols)}, nan_cols={int(nan_cols)}, y_std={y0.std(ddof=1):.4f}")

# Optional: Top-10 Features mit der höchsten (naiven) |corr| zu y_{t+1}
try:
    y_lead = y.shift(-1).loc[train_idx]
    corr = X0.corrwith(y_lead).abs().sort_values(ascending=False).head(10)
    print("\nTop-10 |corr|(roh, expanding) mit y_{t+1} im ersten Trainfenster:")
    display(corr)
except Exception as e:
    print("Konnte Corr-Schnelltest nicht berechnen:", e)

print("\nHinweis:")
print("- Wenn const_cols hoch ist, dropp im FE Schritt konstante/low-unique Spalten.")
print("- Wenn y_std ~ 0, stimmt der Target-Import (ΔIP) vermutlich nicht.")
print("- Wenn Backend nicht 'lgbm[lightgbm]', bitte LightGBM installieren.")
# ===========================================


---- BACKEND CHECK ----
ForecastModel.get_name(): lgbm[lightgbm]
(Erwartet: 'lgbm[lightgbm]'; wenn 'hgb_fallback', ist LightGBM nicht installiert.)

---- TARGET CHECK ----
y.head():


date
1991-02-01   -2.085890
1991-03-01    0.000000
1991-04-01   -0.877193
1991-05-01   -1.137800
1991-06-01    3.069054
Name: IP_change, dtype: float64

y.describe():


count    407.000000
mean       0.042741
std        1.905304
min      -18.219895
25%       -0.822431
50%        0.105485
75%        1.063048
max       10.000000
Name: IP_change, dtype: float64

y |abs|-Quantile (0.5/0.9/0.99): {0.5: 0.9193054136874274, 0.9: 2.4252741856277606, 0.99: 6.768464285714269}

---- RAW DESIGN (vor FE) – Basisdiagnose im ersten Trainfenster ----


NameError: name 'cfg' is not defined

In [5]:

# 3) FE & Policy
cfg = GlobalConfig()
cfg.corr_spec = dict(DEFAULT_CORR_SPEC)   # stabiler Start; EWMA optional testen
cfg.nuisance_seasonal = "auto"
cfg.lag_candidates = tuple(range(1, 12+1))
cfg.top_k_lags_per_feature = 1
cfg.use_rm3 = False                       # Trees: oft besser ohne zu glätten
cfg.k1_topk = 600
cfg.redundancy_method = "greedy"
cfg.redundancy_param = 0.9
cfg.dr_method = "none"
cfg.W0_A = 180; cfg.BLOCKS_A = [(181,200),(201,220),(221,240)]
cfg.W0_B = 240
cfg.policy_window = 12; cfg.policy_gain_min = 0.05; cfg.policy_cooldown = 6
cfg.refresh_cadence_months     = 6
cfg.to_dict()



{'seed': 123,
 'refresh_cadence_months': 6,
 'nuisance_seasonal': 'auto',
 'corr_spec': {'mode': 'expanding', 'window': None, 'lam': None},
 'lag_candidates': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
 'top_k_lags_per_feature': 1,
 'use_rm3': False,
 'k1_topk': 600,
 'screen_threshold': None,
 'redundancy_method': 'greedy',
 'redundancy_param': 0.9,
 'dr_method': 'none',
 'pca_var_target': 0.95,
 'pca_kmax': 25,
 'pls_components': 2,
 'W0_A': 180,
 'BLOCKS_A': [(181, 200), (201, 220), (221, 240)],
 'W0_B': 240,
 'policy_window': 12,
 'policy_gain_min': 0.05,
 'policy_cooldown': 6}

In [6]:
model_grid = [
    {"learning_rate": 0.05, "n_estimators": 500, "num_leaves": 31, "min_child_samples": 15,
     "subsample": 0.8, "colsample_bytree": 0.8, "reg_lambda": 5.0, "reg_alpha": 0.5,
     "max_depth": 4},
    {"learning_rate": 0.03, "n_estimators": 600, "num_leaves": 15, "min_child_samples": 20,
     "subsample": 0.9, "colsample_bytree": 0.9, "reg_lambda": 10.0, "reg_alpha": 1.0,
     "max_depth": 3},
    {"learning_rate": 0.10, "n_estimators": 300, "num_leaves": 31, "min_child_samples": 20,
     "subsample": 0.7, "colsample_bytree": 0.7, "reg_lambda": 5.0, "reg_alpha": 0.0,
     "max_depth": 3},
    {"learning_rate": 0.05, "n_estimators": 600, "num_leaves": 63, "min_child_samples": 20,
     "subsample": 0.8, "colsample_bytree": 0.8, "reg_lambda": 15.0, "reg_alpha": 1.0,
     "max_depth": 4},
    {"learning_rate": 0.02, "n_estimators": 800, "num_leaves": 7,  "min_child_samples": 10,
     "subsample": 0.9, "colsample_bytree": 0.9, "reg_lambda": 20.0, "reg_alpha": 2.0,
     "max_depth": 2},
]
robust_defaults = {
    "max_bin": 32,
    "min_data_in_bin": 1,
    "feature_fraction_bynode": 0.8,
    "min_split_gain": 0.0,
    "enable_bundle": False,
    "min_sum_hessian_in_leaf": 1e-3,
}
for d in model_grid:
    d.update(robust_defaults)

print("Grid size:", len(model_grid))


Grid size: 5


In [7]:

# 5) Stage A
model_name = "lgbm_real_v2"
outputs_for_model(model_name)
shortlist = run_stageA(
    model_name=model_name,
    model_ctor=lambda hp: ForecastModel(hp),
    model_grid=model_grid,
    X=X, y=y, cfg=cfg
)
shortlist


[Stage A][Block 1] train_end=180, OOS=181-200 | configs=5
  - Config 1/5: {'learning_rate': 0.05, 'n_estimators': 500, 'num_leaves': 31, 'min_child_samples': 15, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 5.0, 'reg_alpha': 0.5, 'max_depth': 4, 'max_bin': 32, 'min_data_in_bin': 1, 'feature_fraction_bynode': 0.8, 'min_split_gain': 0.0, 'enable_bundle': False, 'min_sum_hessian_in_leaf': 0.001}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000690 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15648
[LightGBM] [Info] Number of data points in the train set: 152, number of used features: 489
[LightGBM] [Info] Start training from score 0.044154
    · Month 1/20 processed | running RMSE=1.8178
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000814 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15936
[Lig

[{'learning_rate': 0.03,
  'n_estimators': 600,
  'num_leaves': 15,
  'min_child_samples': 20,
  'subsample': 0.9,
  'colsample_bytree': 0.9,
  'reg_lambda': 10.0,
  'reg_alpha': 1.0,
  'max_depth': 3,
  'max_bin': 32,
  'min_data_in_bin': 1,
  'feature_fraction_bynode': 0.8,
  'min_split_gain': 0.0,
  'enable_bundle': False,
  'min_sum_hessian_in_leaf': 0.001},
 {'learning_rate': 0.05,
  'n_estimators': 600,
  'num_leaves': 63,
  'min_child_samples': 20,
  'subsample': 0.8,
  'colsample_bytree': 0.8,
  'reg_lambda': 15.0,
  'reg_alpha': 1.0,
  'max_depth': 4,
  'max_bin': 32,
  'min_data_in_bin': 1,
  'feature_fraction_bynode': 0.8,
  'min_split_gain': 0.0,
  'enable_bundle': False,
  'min_sum_hessian_in_leaf': 0.001}]

In [8]:

# 6) Stage B
run_stageB(
    model_name=model_name,
    model_ctor=lambda hp: ForecastModel(hp),
    shortlist=shortlist,
    X=X, y=y, cfg=cfg,
    max_months=None
)
print("Stage B done →", STAGEB_DIR / model_name)


[Stage B] Month origin t=240 | evaluating 2 configs | active=1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000695 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13152
[LightGBM] [Info] Number of data points in the train set: 206, number of used features: 411
[LightGBM] [Info] Start training from score -0.006590
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000746 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13152
[LightGBM] [Info] Number of data points in the train set: 206, number of used features: 411
[LightGBM] [Info] Start training from score -0.006590
[Stage B] Month origin t=241 | evaluating 2 configs | active=2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000763 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bi