# Forecast Workflow Notebook
Load prepared CSVs, configure the run, and execute rolling one-step-ahead forecasting.

In [1]:
# CHUNK 1 — Imports & Data
import sys, os
sys.path.append(os.path.abspath("../src"))
import pandas as pd
from tsforecast.types import FeatureSelectCfg, FeEngCfg
from tsforecast.evaluation.metrics import get_metric
from tsforecast.rolling.online import online_rolling_forecast


# Read the prepared feature matrix and the target, both indexed by the parsed
# "date" column. The first row of each file is skipped, and the target is the
# column at position 1 of the target file (counted after "date" became the index).
X = pd.read_csv(
    '../data/processed/cleaned_features.csv',
    parse_dates=["date"],
    index_col="date",
).iloc[1:]
y = pd.read_csv(
    '../data/processed/target.csv',
    parse_dates=["date"],
    index_col="date",
).iloc[1:, 1]

# Keep only the dates present in both objects so they cover the same index.
common_idx = X.index.intersection(y.index)
X, y = X.loc[common_idx], y.loc[common_idx]
print(X.shape, y.shape)

(407, 2160) (407,)


In [2]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the target; missing values are dropped first.
result = adfuller(y.dropna())
adf_stat, p_value = result[0], result[1]

print("ADF Statistic:", adf_stat)
print("p-value:", p_value)

# The (German) messages read "y is stationary" / "y is NOT stationary";
# the decision threshold is p < 0.05.
print(
    "✅ y ist stationär (p < 0.05)"
    if p_value < 0.05
    else "❌ y ist NICHT stationär (p >= 0.05)"
)

ADF Statistic: -17.205325106153715
p-value: 6.420299971246067e-30
✅ y ist stationär (p < 0.05)


# Models

## Overview of Config Options

In [None]:
# CHUNK 2 — Configuration (reference cell listing every available option)

# Metric used to score the candidate configurations.
metric_fn = get_metric('rmse')

# --- Feature selection -------------------------------------------------------
# mode:            "manual" = fixed list, "auto_topk" = ranked by correlation,
#                  "auto_threshold" = minimum correlation
# topk:            keep the 200 features with the highest |correlation|
# variance_thresh: optional filter that drops columns with very small variance
fs_cfg = FeatureSelectCfg(mode='auto_topk', topk=200, variance_thresh=0.0)

# --- Feature engineering -----------------------------------------------------
fe_cfg = FeEngCfg(
    # Global lags, applied identically to every feature:
    #   (1, 3, 6, 12) -> short-term plus seasonal
    #   (1, 3, 6)     -> short- and medium-term only
    #   (1,)          -> AR(1)-like only
    candidate_lag_sets=(
        (1, 3, 6, 12),
        (1, 3, 6),
        (1,),
    ),

    # Rolling means (original note: built with shift(1), so no future data leaks in):
    #   ()     -> none
    #   (3,)   -> 3-month average
    #   (3, 6) -> 3- and 6-month averages
    candidate_rm_sets=(
        (),
        (3,),
        (3, 6),
    ),

    # Exponentially weighted means (original note: react faster than rolling means):
    #   ()     -> none
    #   (6,)   -> 6-month EMA
    #   (3, 6) -> 3- and 6-month EMA
    candidate_ema_sets=(
        (),
        (6,),
        (3, 6),
    ),

    # PCA for dimensionality reduction after standardisation:
    #   (None, None) -> no PCA
    #   (50, None)   -> fixed 50 principal components
    #   (None, 0.95) -> as many components as needed to explain 95 % of the variance
    candidate_pca=(
        (None, None),
        (50, None),
        (None, 0.95),
    ),

    # Per-feature lags: True = pick individual lags for each feature by correlation.
    per_feature_lags=False,
    # Only relevant when per_feature_lags is True.
    per_feature_candidates=(1, 3, 6, 12),
    # How many lags to keep per feature.
    per_feature_topk=1,

    # Should the feature-engineering options be re-tested for *every* model
    # hyperparameter combination? False = only once per window, which is faster.
    optimize_fe_for_all_hp=False,
)

# --- Model ---------------------------------------------------------------------
# Alternatives: "rf", "xgb", "lgbm", "mean", "rw", "ar1"
model_name = 'elasticnet'

# ElasticNet parameters. RF/XGB/LGBM use different parameters; see their
# dedicated set-up cells further down in this notebook.
model_grid = dict(
    alpha=[0.05],          # regularisation strength
    l1_ratio=[0.5],        # 0 = Ridge, 1 = Lasso, mixtures in between
    max_iter=[10000],      # solver iterations
    fit_intercept=[True],
    random_state=[42],
    standardize=[True],    # original note: a StandardScaler is applied beforehand
)

# --- Rolling-window settings -----------------------------------------------------
# 108 months = the first 9 years (~1991–1999) for the initial training window.
initial_window = 108
# Step size of one month.
step = 1
# One-step-ahead forecast.
horizon = 1


## Elastic Net Set Up

In [2]:

# Elastic Net configuration.
# The module itself is imported so that its PCA stage default can be overridden below.
from tsforecast.rolling import online

metric_fn = get_metric('rmse')

# Feature engineering candidates: a single lag (3 or 6), no rolling means,
# no EMAs, and PCA either off or keeping 95 % of the variance.
fe_cfg = FeEngCfg(
    candidate_lag_sets=((3,), (6,)),
    candidate_rm_sets=((),),
    candidate_ema_sets=((),),
    candidate_pca=((None, None), (None, 0.95)),
    per_feature_lags=False,
    optimize_fe_for_all_hp=False,
)

# >>> Choose the PCA stage HERE: "pre" OR "post"
# NOTE(review): the run output stored in this notebook reports
# 'pca_stage': 'post' — confirm that this override is picked up by the
# forecast (or that the output is simply stale) before relying on it.
online.PCA_STAGE_DEFAULT = "pre"
# online.PCA_STAGE_DEFAULT = "post"

fs_cfg = FeatureSelectCfg(mode='auto_topk', topk=200, variance_thresh=0.0)

model_name = "elasticnet"
model_grid = {'alpha':[0.05], 'l1_ratio':[0.5], 'max_iter':[10000],
              'fit_intercept':[True], 'random_state':[42], 'standardize':[True]}

# Rolling-window settings, stated explicitly like in the tree-model
# configurations so that a value left over from a previously executed
# configuration cell cannot linger in the kernel.
initial_window = 108
step = 1
horizon = 1




## Baseline Set Up
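Important: this cell only overrides `model_name` and `model_grid`. `metric_fn`, `fs_cfg` and `fe_cfg` must already be defined, so run one of the other configuration cells first.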

In [9]:


# Baseline models. Exactly one variant is active; the others stay commented
# out so they can be switched on by hand.

# Mean — no hyperparameters needed
# model_name = 'mean'
# model_grid = {}

# Random walk — 'randomwalk' and 'naive' are noted as alternative names
# model_name = 'rw'
# model_grid = {}

# AR(1) — the active baseline
model_name = 'ar1'
model_grid = dict(fit_intercept=[True])  # [False] is the alternative setting


## Random Forest Set Up

In [13]:
# CHUNK 2 — Configuration (Random Forest)

metric_fn = get_metric('rmse')

fs_cfg = FeatureSelectCfg(
    mode='auto_topk',
    topk=200,
    variance_thresh=0.0,
)

fe_cfg = FeEngCfg(
    # Two typical lag combinations.
    candidate_lag_sets=((1, 3, 6, 12), (1, 3, 6)),
    # Rolling means: none, or 3- and 6-month averages.
    candidate_rm_sets=((), (3, 6)),
    # EMA: none, or a 6-month EMA.
    candidate_ema_sets=((), (6,)),
    # No PCA in this configuration.
    candidate_pca=((None, None),),
    per_feature_lags=False,
    optimize_fe_for_all_hp=False,
)

model_name = 'rf'
model_grid = dict(
    n_estimators=[300, 600],     # number of trees
    max_depth=[None, 8, 15],     # maximum depth; None = no depth limit
    min_samples_leaf=[1, 5],     # minimum number of observations per leaf
    n_jobs=[-1],                 # use all cores
    random_state=[42],
)

# Rolling-window settings
initial_window, step, horizon = 108, 1, 1


## XGBoost Set Up

In [10]:
# CHUNK 2 — Configuration (XGBoost)

metric_fn = get_metric('rmse')

fs_cfg = FeatureSelectCfg(
    mode='auto_topk',
    topk=200,
    variance_thresh=0.0,
)

fe_cfg = FeEngCfg(
    # Either a single lag of 3 or a single lag of 6.
    candidate_lag_sets=((3,), (6,)),
    # Rolling means: none, or a 3-month average.
    candidate_rm_sets=((), (3,)),
    # No EMA features: the only candidate is the empty set.
    candidate_ema_sets=((),),
    # Without PCA, or PCA keeping 95 % of the variance.
    candidate_pca=((None, None), (None, 0.95)),
    per_feature_lags=False,
    # Full search: feature-engineering options are enabled for every
    # hyperparameter combination.
    optimize_fe_for_all_hp=True,
)

model_name = 'xgb'
model_grid = dict(
    n_estimators=[400],        # trees
    learning_rate=[0.05],      # shrinkage
    max_depth=[3, 5],          # maximum tree depth
    subsample=[0.8],           # row sampling
    colsample_bytree=[0.8],    # column sampling
    reg_lambda=[1.0, 3.0],     # L2 regularisation
    random_state=[42],
)

# Rolling-window settings
initial_window, step, horizon = 108, 1, 1


## LightGBM Set Up

In [6]:
# CHUNK 2 — Configuration (LightGBM)
# Imported but not referenced anywhere in this cell.
from lightgbm import LGBMRegressor

metric_fn = get_metric('rmse')

fs_cfg = FeatureSelectCfg(
    mode='auto_topk',
    topk=50,
    variance_thresh=0.0,
)

fe_cfg = FeEngCfg(
    # Either a single lag of 3 or a single lag of 6.
    candidate_lag_sets=((3,), (6,)),
    # Exactly one candidate: a 6-month rolling mean.
    candidate_rm_sets=((6,),),
    # No EMA features: the only candidate is the empty set.
    candidate_ema_sets=((),),
    # No PCA.
    candidate_pca=((None, None),),
    per_feature_lags=False,
    optimize_fe_for_all_hp=False,
)

model_name = 'lgbm'
model_grid = dict(
    n_estimators=[200, 500],
    learning_rate=[0.05],
    num_leaves=[15],           # larger values = more flexible models
    subsample=[0.8],
    colsample_bytree=[0.8],
    reg_lambda=[0.0, 1.0],     # L2 regularisation
    random_state=[42],
)

# Rolling-window settings
initial_window, step, horizon = 108, 1, 1


## Run Rolling Forecast

In [3]:
# CHUNK 3 — Run rolling forecast
# Honour the rolling-window settings of whichever configuration cell was run.
# Configuration cells that do not define them fall back to the values that
# were previously hard-coded in this call (108 / 1 / 1).
initial_window = globals().get('initial_window', 108)
step = globals().get('step', 1)
horizon = globals().get('horizon', 1)

# Returns the predictions, the matching true values and a per-step log of
# the configuration that was used.
preds, truths, cfglog = online_rolling_forecast(
    X, y,
    initial_window=initial_window, step=step, horizon=horizon,
    fs_cfg=fs_cfg, fe_cfg=fe_cfg,
    model_name=model_name, model_grid=model_grid, metric_fn=metric_fn,
)
print(preds.tail())
print(truths.tail())
cfglog.tail()

[init_start] n_hp=1, n_fe=4, expected_evals=4
[init_eval] done=1, total=4, hp_idx=1, fe_idx=1, score=1.79077
[init_eval] done=2, total=4, hp_idx=1, fe_idx=2, score=0.224712
[init_eval] done=3, total=4, hp_idx=1, fe_idx=3, score=1.186921
[init_eval] done=4, total=4, hp_idx=1, fe_idx=4, score=0.863961
[init_done] best_score=0.224712
[step_predict] step=1, of=298, date=2000-03-01
[step_search_start] step=1, hp=1, fe=4, expected_evals=4
[step_eval] step=1, done=1, total=4, hp_idx=1, fe_idx=1, score=0.038114
[step_eval] step=1, done=2, total=4, hp_idx=1, fe_idx=2, score=0.915212
[step_eval] step=1, done=3, total=4, hp_idx=1, fe_idx=3, score=0.52142
[step_eval] step=1, done=4, total=4, hp_idx=1, fe_idx=4, score=0.890011
[step_done] step=1, best_score=0.038114
[step_predict] step=2, of=298, date=2000-04-01
[step_search_start] step=2, hp=1, fe=4, expected_evals=4
[step_eval] step=2, done=1, total=4, hp_idx=1, fe_idx=1, score=1.472495
[step_eval] step=2, done=2, total=4, hp_idx=1, fe_idx=2, sco

Unnamed: 0_level_0,used_model_params,used_fe_spec,n_engineered_cols_used,selected_next_score
time_for_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-08-01,"{'alpha': 0.05, 'l1_ratio': 0.5, 'max_iter': 1...","{'pca_stage': 'post', 'pca_n': None, 'pca_var'...",200,2.585749
2024-09-01,"{'alpha': 0.05, 'l1_ratio': 0.5, 'max_iter': 1...","{'pca_stage': 'post', 'pca_n': None, 'pca_var'...",200,0.109691
2024-10-01,"{'alpha': 0.05, 'l1_ratio': 0.5, 'max_iter': 1...","{'pca_stage': 'post', 'pca_n': None, 'pca_var'...",200,0.166255
2024-11-01,"{'alpha': 0.05, 'l1_ratio': 0.5, 'max_iter': 1...","{'pca_stage': 'post', 'pca_n': None, 'pca_var'...",200,1.677883
2024-12-01,"{'alpha': 0.05, 'l1_ratio': 0.5, 'max_iter': 1...","{'pca_stage': 'post', 'pca_n': None, 'pca_var'...",200,2.274474


In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import json
import os

# `online` is otherwise imported only in the Elastic Net set-up cell; importing
# it here keeps this cell runnable after any other model configuration.
from tsforecast.rolling import online

# Out-of-sample error metrics; RMSE is derived from the MSE computed once.
mse  = mean_squared_error(truths, preds)
rmse = np.sqrt(mse)
mae  = mean_absolute_error(truths, preds)
print(f'RMSE={rmse:.4f}, MAE={mae:.4f}, n={len(truths)}')

# Predictions next to the true values, indexed by date.
results = pd.DataFrame({'y_true': truths, 'y_pred': preds})
results.index.name = 'date'

os.makedirs('../reports', exist_ok=True)
# Optional: save cfglog/results as before
# cfglog.to_csv('../reports/rolling_cfglog_elasticnet.csv')
# results.to_csv('../reports/rolling_predictions_elasticnet.csv')

# Tuning space + final MSE, written as a single-row CSV. Tuples are
# JSON-encoded so every cell of the row holds a plain string or number.
space_row = pd.DataFrame([{
    'model_name': model_name,
    'model_grid': json.dumps(model_grid, ensure_ascii=False),
    'candidate_lag_sets': json.dumps([list(s) for s in fe_cfg.candidate_lag_sets], ensure_ascii=False),
    'candidate_rm_sets':  json.dumps([list(s) for s in fe_cfg.candidate_rm_sets], ensure_ascii=False),
    'candidate_ema_sets': json.dumps([list(s) for s in fe_cfg.candidate_ema_sets], ensure_ascii=False),
    'candidate_pca':      json.dumps(fe_cfg.candidate_pca, ensure_ascii=False),
    # This is the module-level default at the time this cell runs.
    'pca_stage':          online.PCA_STAGE_DEFAULT,
    'final_mse':          float(mse),
    'n_steps':            int(len(truths)),
}])
# The file name is the same for every model, so each run replaces the previous file.
space_row.to_csv('../reports/tuning_space_and_final_mse.csv', index=False)

print('Saved tuning_space_and_final_mse.csv')


RMSE=2.3082, MAE=1.5392, n=298
Saved tuning_space_and_final_mse.csv
