# Forecast Workflow — Multi-Model
Load → optional manual base-feature filter by industry (*Branche*) → feature engineering / feature selection (FE/FS) → model chunks → run → evaluate/save

In [1]:

# CHUNK 1 — Load & optional manual Branchen-Filter
import sys, os, re, json
sys.path.append(os.path.abspath("../src"))
import pandas as pd
from tsforecast.types import FeatureSelectCfg, FeEngCfg
from tsforecast.evaluation.metrics import get_metric
from tsforecast.rolling.online import online_rolling_forecast
from tsforecast.rolling import online

# Features and target are both read with a parsed "date" column as index.
X = pd.read_csv(
    '../data/processed/cleaned_features.csv',
    parse_dates=["date"],
    index_col="date",
)
y_df = pd.read_csv(
    '../data/processed/target.csv',
    parse_dates=["date"],
    index_col="date",
)
# Target = second non-index column when the file has at least two, else the first.
# NOTE(review): why the second column is preferred is not visible here — confirm.
y = y_df.iloc[:, 1 if y_df.shape[1] >= 2 else 0]

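# Optional: manual industry (Branche) selection — leave the list empty to keep all columns.
# The list directly below is currently commented out; the active whitelist follows it.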
# industry_whitelist = [
#     "Herstellung_von_Nahrungs-_und_Futtermitteln",
#     "Getränkeherstellung",
#     "Herstellung_von_Textilien",
#     "Herstellung_von_Bekleidung",
#     "Herstellung_von_Leder_Lederwaren_und_Schuhen",
#     "Holz-_Flecht-_Korb-_und_Korkwarenherstellung_(ohne_Möbel)",
#     "Papiergewerbe",
#     "Herstellung_von_Druckerzeugnissen",
#     "Kokerei_und_Mineralölverarbeitung",
#     "Herstellung_von_chemischen_Erzeugnissen",
#     "Herstellung_von_pharmazeutischen_Erzeugnissen",
#     "Herstellung_von_Gummi-_und_Kunststoffwaren",
#     "Glas-_Keramikgewerbe_Verarbeitung_von_Steinen_und_Erden",
#     "Metallerzeugung_und_-bearbeitung",
#     "Herstellung_von_Metallerzeugnissen",
#     "Datenverarbeitungsgeräte_elektronische_und_optische_Erzeugnisse",
#     "Herstellung_von_elektrischen_Ausrüstungen",
#     "Maschinenbau",
#     "Herstellung_von_Kraftwagen_und_Kraftwagenteilen",
#     "Sonstiger_Fahrzeugbau",
#     "Herstellung_von_Möbeln",
#     "Herstellung_von_sonstigen_Waren"
# ]


# Active whitelist of column-name prefixes (the text before the first '.').
# The filter further down keeps only columns whose prefix is in this list;
# an empty list disables the filter.
industry_whitelist = [
    "Verarbeitendes_Gewerbe",
    "Verarbeitendes_Gewerbe_(ohne_Ernährungsgewerbe)",
    "Herstellung_von_Vorleistungsgütern",
    "Herstellung_von_Investitionsgütern",
    "Herstellung_von_Konsumgütern_(Ge-_und_Verbrauchsgüter)",
    "Herstellung_von_Gebrauchsgütern",
    "Herstellung_von_Verbrauchsgütern",
    "Herstellung_von_Konsumgütern_(ohne_Ernährungsgewerbe)",
    "Herstellung_von_Verbrauchsgütern_(ohne_Ernährungsgewerbe)",
    "Ernährungsgewerbe_und_Tabakverarbeitung",
    "Herstellung_von_Nahrungs-_und_Futtermitteln",
    "Schlachten_und_Fleischverarbeitung"
]

# Keep only the columns whose name prefix (text before the first '.') is
# whitelisted. A falsy (empty) whitelist skips the filter entirely.
if industry_whitelist:
    wl = set(industry_whitelist)
    keep = [col for col in X.columns if col.partition('.')[0] in wl]
    X = X[keep]
    print(f"Gefilterte Spalten: {len(keep)}")

# Restrict features and target to the index labels they have in common.
idx = X.index.intersection(y.index)
X, y = X.loc[idx].copy(), y.loc[idx].copy()

metric_fn = get_metric('rmse')
print(X.shape, y.shape)


Gefilterte Spalten: 156
(408, 156) (408,)


In [3]:

# CHUNK 2 — FE/FS (gefixt)
from tsforecast.types import FeatureSelectCfg, FeEngCfg
from tsforecast.rolling import online

# Feature-engineering search space handed to the rolling forecast.
fe_cfg = FeEngCfg(
    candidate_lag_sets=((3,), (6, 3, 2, 1), (1,)),
    # Must be a tuple of tuples: ((3,),). Note that ((3),) would collapse to (3,).
    candidate_rm_sets=((3,),),
    # One empty set => no EMA features.
    candidate_ema_sets=((),),
    candidate_pca=((None, None),),
    per_feature_lags=True,
    optimize_fe_for_all_hp=True,
)

# Module-level PCA stage switch: "pre" or "post".
online.PCA_STAGE_DEFAULT = "pre"

# Optional: external, precomputed feature blocks.
# fe_cfg.tsfresh_path = "../data/processed/tsfresh_w12_slim.parquet"
# Points to a directory holding several parquet files, each containing only
# part of the features.
fe_cfg.tsfresh_path = "../data/processed/tsfresh_w12_full"

# fe_cfg.fm_pred_path = "../data/processed/chronos_one_step.parquet"

# Top-k set very high => effectively "no selection".
fs_cfg = FeatureSelectCfg(
    mode='auto_topk',
    topk=500,
    variance_thresh=0.0,
)




### ElasticNet (linear, stabil, gut mit vielen korrelierten Lags)
- HPO: `alpha` in {1e-3…1}, `l1_ratio` in {0.1…0.9}. 
- FE: Lags 3/6/12 reichen oft; RM/EMA optional.
- FS: `auto_topk` 100–300.
- PCA: optional (post) zur Entkollinearisierung.

In [4]:

# CHUNK 3a — ElasticNet
model_name = "elasticnet"
# Hyper-parameter grid: 3 alpha values x 2 l1_ratio values; every other entry
# is a single fixed value. Key order and value order are kept as-is.
model_grid = dict(
    alpha=[0.01, 0.1, 0.05],
    l1_ratio=[0.3, 0.7],
    max_iter=[10000],
    fit_intercept=[True],
    random_state=[42],
    standardize=[True],
)


### RandomForest (nichtlinear, robust)
- HPO: `n_estimators` 300–800, `max_depth` 6–20.
- FE: viele Lags möglich, aber aufpassen auf Dim.
- FS: `auto_topk` 200–400.
- PCA: eher nein (RF skaliert schlecht mit PCs).

In [None]:

# CHUNK 3b — RandomForest
# model_name = "randomforest"
# model_grid = {'n_estimators':[500], 'max_depth':[12], 'n_jobs':[-1], 'random_state':[42]}


### XGBoost (stark, kann GPU)
- HPO: `max_depth` 3–6, `eta`=learning_rate 0.03–0.1, `subsample` 0.7–0.9.
- FE: Lags 3/6/12 + ggf. EMA(3/6).
- FS: 200–400.
- GPU: `use_gpu=True` (erfordert CUDA).

In [None]:

# CHUNK 3c — XGBoost
# model_name = "xgboost"
# model_grid = {'n_estimators':[300], 'learning_rate':[0.05], 'max_depth':[4],
#               'subsample':[0.8], 'colsample_bytree':[0.8], 'use_gpu':[False], 'random_state':[42]}


### LightGBM (schnell, kann GPU)
- HPO: `num_leaves` 15–63, `n_estimators` 300–1000, `learning_rate` 0.03–0.1.
- FE: ähnlich XGB.
- FS: 200–400.
- GPU: `use_gpu=True` (GPU-Build nötig).

In [None]:

# CHUNK 3d — LightGBM
# model_name = "lightgbm"
# model_grid = {'n_estimators':[500], 'learning_rate':[0.05], 'num_leaves':[31],
#               'min_data_in_leaf':[5], 'subsample':[0.8], 'colsample_bytree':[0.8],
#               'use_gpu':[False], 'random_state':[42]}


### TabPFN (Zero/Low-HPO, kann GPU)
- HPO: meistens none; evtl. `use_gpu=True`.
- FE: starke FS nötig (Top-K 100–300); PCA(post) oft gut.
- FS: 100–300.

In [None]:

# CHUNK 3e — TabPFN
# model_name = "tabpfn"
# model_grid = {'use_gpu':[False]}


### Chronos (Foundation Model, baseline/stack)
- Als Modell: ignoriert X, nutzt nur y.
- Als Stacking-Feature: precompute und via `fm_pred_path` einbinden.
- HPO: `model_id` (tiny/mini/small), `use_gpu`.

In [None]:

# CHUNK 3f — Chronos
# model_name = "chronos"
# model_grid = {"model_id": ["amazon/chronos-t5-tiny"], "use_gpu":[False]}
# from tsforecast.types import FeatureSelectCfg
# fs_cfg = FeatureSelectCfg(mode='auto_topk', topk=0, variance_thresh=0.0)  # ignoriert X


### Baselines (Mean, RW, AR1)
- Keine FE/FS nötig. Dienen als Untergrenze.
- AR1 mit/ohne Intercept probieren.

In [3]:

# CHUNK 3g — Baselines
# Pick one baseline; the alternatives stay commented out.
# NOTE(review): this cell is active, so on a top-to-bottom run it replaces the
# model_name / model_grid assigned in CHUNK 3a — confirm that is intended.
# model_name = "mean";  model_grid = {}
# model_name = "rw";    model_grid = {}
model_name = "ar1"
model_grid = dict(fit_intercept=[True])


In [None]:

# CHUNK 4 — Run
# Run the online rolling forecast with the FE/FS configuration and the model
# name/grid selected in the cells above. Returns the predictions, the matching
# true values and a configuration log.
# NOTE(review): initial_window / step / horizon presumably mean "108 initial
# observations, advance one step at a time, forecast one step ahead" — confirm
# against online_rolling_forecast.
preds, truths, cfglog = online_rolling_forecast(
    X, y,
    initial_window=108, step=1, horizon=1,
    fs_cfg=fs_cfg, fe_cfg=fe_cfg,
    model_name=model_name, model_grid=model_grid,
    # Reuse the metric created in CHUNK 1 instead of building a second one
    # here, so the metric is chosen in exactly one place.
    metric_fn=metric_fn,
    progress=True,
)
print(preds.tail())
print(truths.tail())
cfglog.tail()


[init_start] n_hp=6, n_fe=1, expected_evals=6


  c /= stddev[:, None]
  c /= stddev[None, :]


[init_eval] done=1, total=6, hp_idx=1, fe_idx=1, score=1.568344


  c /= stddev[:, None]
  c /= stddev[None, :]


[init_eval] done=2, total=6, hp_idx=2, fe_idx=1, score=1.637557


  c /= stddev[:, None]
  c /= stddev[None, :]


[init_eval] done=3, total=6, hp_idx=3, fe_idx=1, score=1.023548


  c /= stddev[:, None]
  c /= stddev[None, :]


[init_eval] done=4, total=6, hp_idx=4, fe_idx=1, score=1.078622


  c /= stddev[:, None]
  c /= stddev[None, :]


In [5]:

# CHUNK 5 — Evaluate & Save
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np, os, json

# Error metrics over all rolling steps. RMSE is derived from the MSE so the
# squared error is computed only once.
mse  = mean_squared_error(truths, preds)
rmse = np.sqrt(mse)
mae  = mean_absolute_error(truths, preds)
print(f'RMSE={rmse:.4f}, MAE={mae:.4f}, n={len(truths)}')

results = pd.DataFrame({'y_true': truths, 'y_pred': preds})
results.index.name = 'date'

# Created unconditionally, as before, so the reports directory always exists.
os.makedirs('../reports', exist_ok=True)

# One-row summary of the tuning space together with the final MSE.
row = pd.DataFrame([{
    'model_name': model_name,
    'model_grid': json.dumps(model_grid, ensure_ascii=False),
    'candidate_lag_sets': json.dumps(tuple(map(list, fe_cfg.candidate_lag_sets)), ensure_ascii=False),
    'candidate_rm_sets':  json.dumps(tuple(map(list, fe_cfg.candidate_rm_sets)), ensure_ascii=False),
    'candidate_ema_sets': json.dumps(tuple(map(list, fe_cfg.candidate_ema_sets)), ensure_ascii=False),
    'candidate_pca':      json.dumps(fe_cfg.candidate_pca, ensure_ascii=False),
    'pca_stage':          online.PCA_STAGE_DEFAULT,
    'final_mse':          float(mse),
    'n_steps':            int(len(truths)),
}])

# Writing the reports is opt-in. The default (False) matches the previous
# state, where every to_csv call was commented out; the difference is that the
# cell no longer claims to have saved a file it did not write.
SAVE_REPORTS = False
if SAVE_REPORTS:
    cfglog.to_csv('../reports/rolling_cfglog.csv')
    results.to_csv('../reports/rolling_predictions.csv')
    row.to_csv('../reports/tuning_space_and_final_mse.csv', index=False)
    print('Saved ../reports/tuning_space_and_final_mse.csv')
else:
    print('Not saved (SAVE_REPORTS=False): ../reports/tuning_space_and_final_mse.csv')


RMSE=2.8363, MAE=1.7928, n=300
Saved ../reports/tuning_space_and_final_mse.csv
