# Demo Notebook: LightGBM pipeline (dummy data)
Dieses Notebook generiert Dummy-Daten, lädt die Pipeline-Module und führt eine Mini-Stage-A sowie eine kurze Stage-B aus, inkl. CSV-Exports und Progress-Prints. In echten Runs werden die Daten aus `repo/data/processed/` geladen.

In [1]:
import os, sys
from pathlib import Path

# Project root = parent of the current working directory
# (assumes the notebook is run from Code/notebooks).
PROJECT_ROOT = Path.cwd().resolve().parent
project_root_str = str(PROJECT_ROOT)

# Export the root via the environment and put it first on sys.path (only once),
# so that the `src` package imported below can be found.
os.environ["PROJECT_ROOT"] = project_root_str
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

from src.config import GlobalConfig, DEFAULT_CORR_SPEC, EWMA_CORR_SPEC, outputs_for_model

# NOTE(review): the original comment said "if available", but this import is
# unconditional - it raises ImportError if src.io_timesplits has no ensure_outputs.
from src.io_timesplits import ensure_outputs
from src.tuning import run_stageA, run_stageB
from src.models.lgbm import ForecastModel

import numpy as np, pandas as pd

# Make sure the output locations for the "lgbm" model are set up.
# The bare call is the cell's last expression, so its return value is displayed.
outputs_for_model("lgbm")  # sufficient on its own; ensure_outputs is optional



{'stageA': PosixPath('/Users/jonasschernich/Documents/Masterarbeit/Code/outputs/stageA/lgbm'),
 'stageB': PosixPath('/Users/jonasschernich/Documents/Masterarbeit/Code/outputs/stageB/lgbm'),
 'logs': PosixPath('/Users/jonasschernich/Documents/Masterarbeit/Code/outputs/logs/lgbm.log')}

In [2]:

# --- Generate dummy monthly data ---
rng = np.random.default_rng(42)  # fixed seed so the demo is reproducible
T = 160  # number of monthly observations; kept small so the demo runs fast
p = 20   # number of feature columns

dates = pd.date_range("2000-01-01", periods=T, freq="MS")

# Target: white noise plus 0.5 x its own one-month lag, plus a second, smaller
# noise term. (This is a moving-average-style series, not a recursive AR(1).)
shock = pd.Series(rng.normal(size=T), index=dates, name="delta_ip")
lagged_shock = shock.shift(1).fillna(0)
y = 0.5 * lagged_shock + shock + rng.normal(scale=0.3, size=T)

# Features: start with p columns of pure noise ...
feature_names = [f"f{i:03d}" for i in range(p)]
X = pd.DataFrame(rng.normal(size=(T, p)), index=dates, columns=feature_names)

# ... then make a few features predictive: the first min(5, p) columns become
# the one-month-lagged target plus fresh noise.
y_lag1 = y.shift(1).fillna(0).values
for j in range(min(5, p)):
    X.iloc[:, j] = y_lag1 + rng.normal(scale=0.5, size=T)

display(y.head(), X.head())


2000-01-01    0.015182
2000-02-01   -1.105193
2000-03-01    0.869000
2000-04-01    1.069374
2000-05-01   -1.229206
Freq: MS, Name: delta_ip, dtype: float64

Unnamed: 0,f000,f001,f002,f003,f004,f005,f006,f007,f008,f009,f010,f011,f012,f013,f014,f015,f016,f017,f018,f019
2000-01-01,-0.363684,-0.316627,-0.691369,0.126602,-0.665822,0.730775,-1.57204,-0.066953,-1.172007,-0.51828,1.511228,0.637534,-0.69893,-1.013717,0.032782,-1.21656,-0.67114,0.312009,1.155312,0.608761
2000-02-01,0.03535,0.295953,-0.028782,0.462791,-0.648784,-2.063238,-0.591103,0.590906,-1.581594,1.475949,0.368357,0.846584,-0.570944,0.813764,1.068472,0.232878,0.234401,0.270343,-0.863345,-0.147529
2000-03-01,-1.261327,-1.036806,-0.957177,-0.968533,-0.838567,1.481456,-0.743588,-0.82225,0.202306,0.844385,0.011426,1.328961,0.856794,0.84182,0.554117,2.327653,-0.205162,-2.003522,1.604254,-0.457699
2000-04-01,0.076198,0.644855,1.794232,1.988416,0.196265,-0.794136,0.439637,0.524188,0.276274,-1.412766,-2.310103,0.054354,-0.471776,0.459386,0.701954,0.138241,0.760133,0.229211,0.530065,-0.704673
2000-05-01,0.609146,0.544288,1.112374,1.784268,0.956591,-0.265839,-0.117542,0.829519,-1.99306,-1.296472,-1.482185,-2.333616,-0.678264,0.749434,-0.284884,0.19779,1.089217,1.327686,-0.069138,1.353586


In [3]:

# --- Config (use tiny blocks for demo) ---
cfg = GlobalConfig()

# Demo overrides, applied to cfg in the order listed (plain attribute assignment).
demo_overrides = {
    # Correlation spec: a copy of the EWMA preset (alternative: 'expanding')
    "corr_spec": dict(EWMA_CORR_SPEC),
    "lag_candidates": tuple(range(1, 4)),  # lags 1, 2, 3
    "top_k_lags_per_feature": 1,
    "k1_topk": 20,
    "redundancy_method": "greedy",
    "redundancy_param": 0.9,
    "dr_method": "none",  # keep fast for demo
    "nuisance_seasonal": "off",
    # Tiny Stage A setup for speed: W0_A = 60, followed by three 10-month
    # OOS blocks (61-70, 71-80, 81-90).
    "W0_A": 60,
    "BLOCKS_A": [(61, 70), (71, 80), (81, 90)],
    "W0_B": 100,  # Stage B starts here
    "policy_window": 6,
    "policy_gain_min": 0.05,
    "policy_cooldown": 2,
}
for attr_name, attr_value in demo_overrides.items():
    setattr(cfg, attr_name, attr_value)

# Hyperparameter candidates handed to Stage A (one dict per configuration).
# NOTE(review): in LightGBM, `subsample` only takes effect when the bagging
# frequency (`subsample_freq`) is > 0 - confirm ForecastModel sets it.
model_grid = [
    dict(learning_rate=0.05, n_estimators=150, num_leaves=15, min_child_samples=10),
    dict(learning_rate=0.10, n_estimators=200, num_leaves=31, min_child_samples=20),
    dict(learning_rate=0.03, n_estimators=250, num_leaves=31, min_child_samples=20, subsample=0.8),
    dict(learning_rate=0.05, n_estimators=100, num_leaves=7, min_child_samples=5),
]


In [4]:

# --- Stage A: run with halving ---
# Collect the keyword arguments first, then make the call; the returned
# shortlist is reused by the Stage B cell.
stage_a_kwargs = dict(
    model_name="lgbm",
    model_ctor=lambda hp: ForecastModel(hp),  # builds one model per hyperparameter dict
    model_grid=model_grid,
    X=X,
    y=y,
    cfg=cfg,
)
shortlist = run_stageA(**stage_a_kwargs)
print("Shortlist:", shortlist)


[Stage A][Block 1] train_end=60, OOS=61-70 | configs=4
  - Config 1/4: {'learning_rate': 0.05, 'n_estimators': 150, 'num_leaves': 15, 'min_child_samples': 10}
    · Month 1/10 processed | running RMSE=0.8861
    · Month 2/10 processed | running RMSE=0.7343
    · Month 3/10 processed | running RMSE=0.6278
    · Month 4/10 processed | running RMSE=0.6188
    · Month 5/10 processed | running RMSE=0.5726
    · Month 6/10 processed | running RMSE=0.5439
    · Month 7/10 processed | running RMSE=0.5675
    · Month 8/10 processed | running RMSE=0.5984
    · Month 9/10 processed | running RMSE=0.6024
    · Month 10/10 processed | running RMSE=0.9081
  - Config 2/4: {'learning_rate': 0.1, 'n_estimators': 200, 'num_leaves': 31, 'min_child_samples': 20}
    · Month 1/10 processed | running RMSE=0.9305
    · Month 2/10 processed | running RMSE=0.8464
    · Month 3/10 processed | running RMSE=0.7038
    · Month 4/10 processed | running RMSE=0.6212
    · Month 5/10 processed | running RMSE=0.5964
  

In [5]:

# --- Stage B: quick run over a few months ---
# Uses the shortlist produced by the Stage A cell.
run_stageB(
    model_name="lgbm",
    model_ctor=lambda hp: ForecastModel(hp),
    shortlist=shortlist,
    X=X, y=y, cfg=cfg,
    max_months=6  # keep short for demo
)
# Report the locations this project actually uses instead of a hardcoded path:
# outputs_for_model returns a dict with 'stageA' and 'stageB' entries.
out_paths = outputs_for_model("lgbm")
print(f"Stage B done. Check outputs in {out_paths['stageA']} and {out_paths['stageB']}.")


[Stage B] Month origin t=100 | evaluating 1 configs | active=1
[Stage B] Month origin t=101 | evaluating 1 configs | active=1
[Stage B] Month origin t=102 | evaluating 1 configs | active=1
[Stage B] Month origin t=103 | evaluating 1 configs | active=1
[Stage B] Month origin t=104 | evaluating 1 configs | active=1
[Stage B] Month origin t=105 | evaluating 1 configs | active=1
Stage B done. Check outputs in /mnt/data/repo/outputs/stageA and stageB.
