In [1]:

# CHUNK 1 — Load & optional manual Branchen-Filter
import sys, os, re, json
sys.path.append(os.path.abspath("../src"))
import pandas as pd
from tsforecast.types import FeatureSelectCfg, FeEngCfg
from tsforecast.evaluation.metrics import get_metric
from tsforecast.rolling.online import online_rolling_forecast
from tsforecast.rolling import online

# Feature matrix and target are both read with their "date" column parsed
# and used as the index.
X = pd.read_csv(
    '../data/processed/cleaned_features.csv',
    parse_dates=["date"],
    index_col="date",
)
y_df = pd.read_csv(
    '../data/processed/target.csv',
    parse_dates=["date"],
    index_col="date",
)
# Use the second data column as target when the file has at least two,
# otherwise fall back to the only column.
if y_df.shape[1] >= 2:
    y = y_df.iloc[:, 1]
else:
    y = y_df.iloc[:, 0]

# Optional: manual industry selection (leave the whitelist empty to keep all columns).
# Alternative whitelist of individual industries, currently disabled:
# industry_whitelist = [
#     "Herstellung_von_Nahrungs-_und_Futtermitteln",
#     "Getränkeherstellung",
#     "Herstellung_von_Textilien",
#     "Herstellung_von_Bekleidung",
#     "Herstellung_von_Leder_Lederwaren_und_Schuhen",
#     "Holz-_Flecht-_Korb-_und_Korkwarenherstellung_(ohne_Möbel)",
#     "Papiergewerbe",
#     "Herstellung_von_Druckerzeugnissen",
#     "Kokerei_und_Mineralölverarbeitung",
#     "Herstellung_von_chemischen_Erzeugnissen",
#     "Herstellung_von_pharmazeutischen_Erzeugnissen",
#     "Herstellung_von_Gummi-_und_Kunststoffwaren",
#     "Glas-_Keramikgewerbe_Verarbeitung_von_Steinen_und_Erden",
#     "Metallerzeugung_und_-bearbeitung",
#     "Herstellung_von_Metallerzeugnissen",
#     "Datenverarbeitungsgeräte_elektronische_und_optische_Erzeugnisse",
#     "Herstellung_von_elektrischen_Ausrüstungen",
#     "Maschinenbau",
#     "Herstellung_von_Kraftwagen_und_Kraftwagenteilen",
#     "Sonstiger_Fahrzeugbau",
#     "Herstellung_von_Möbeln",
#     "Herstellung_von_sonstigen_Waren"
# ]


# Active whitelist: a column of X is kept when the part of its name before
# the first '.' is one of these entries.
industry_whitelist = [
    "Verarbeitendes_Gewerbe",
    "Verarbeitendes_Gewerbe_(ohne_Ernährungsgewerbe)",
    "Herstellung_von_Vorleistungsgütern",
    "Herstellung_von_Investitionsgütern",
    "Herstellung_von_Konsumgütern_(Ge-_und_Verbrauchsgüter)",
    "Herstellung_von_Gebrauchsgütern",
    "Herstellung_von_Verbrauchsgütern",
    "Herstellung_von_Konsumgütern_(ohne_Ernährungsgewerbe)",
    "Herstellung_von_Verbrauchsgütern_(ohne_Ernährungsgewerbe)",
    "Ernährungsgewerbe_und_Tabakverarbeitung",
    "Herstellung_von_Nahrungs-_und_Futtermitteln",
    "Schlachten_und_Fleischverarbeitung"
]

# An empty whitelist leaves X untouched.
if industry_whitelist:
    wl = set(industry_whitelist)
    # str.partition('.')[0] is the text before the first dot (the whole name
    # when there is no dot).
    keep = [name for name in X.columns if name.partition('.')[0] in wl]
    X = X[keep]
    print(f"Gefilterte Spalten: {len(keep)}")

# Restrict features and target to the dates present in both, working on
# copies so later edits do not touch the loaded frames.
idx = X.index.intersection(y.index)
X, y = X.loc[idx].copy(), y.loc[idx].copy()
metric_fn = get_metric('rmse')
print(X.shape, y.shape)

Gefilterte Spalten: 156
(408, 156) (408,)


In [2]:
import time, sys, os, math
import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view
from tsfresh import extract_features

# kleine, robuste Param-Menge für Monatsdaten
def _tsfresh_params(mode: str):
    if mode == "slim":
        return {
            "mean": None,
            "median": None,
            "standard_deviation": None,
            "variance": None,
            "quantile": [{"q": 0.25}, {"q": 0.75}],
            "mean_abs_change": None,
            "absolute_sum_of_changes": None,
            "autocorrelation": [{"lag": 1}, {"lag": 3}, {"lag": 6}, {"lag": 12}],
            "fft_coefficient": [{"coeff": 0, "attr": "abs"}, {"coeff": 1, "attr": "abs"}],
        }
    elif mode == "efficient":
        from tsfresh.feature_extraction.settings import EfficientFCParameters
        return EfficientFCParameters()
    elif mode == "comprehensive":
        from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
        return ComprehensiveFCParameters()
    else:
        from tsfresh.feature_extraction.settings import MinimalFCParameters
        return MinimalFCParameters()

def precompute_tsfresh_rolling(
    X: pd.DataFrame,
    window: int = 12,
    mode: str = "slim",
    n_jobs: int = 0,
    log_every: int = 5,
    y: pd.Series | None = None,
    topk_global: int | None = None,
    initial_window: int | None = None,
):
    params = _tsfresh_params(mode)
    idx = X.index
    cols = list(X.columns)
    recs, t0 = [], time.time()

    for i, col in enumerate(cols, 1):
        s = X[col].to_numpy()
        if len(s) < window:
            continue
        try:
            W = sliding_window_view(s, window_shape=window)
            n_win = W.shape[0]
            df_long = pd.DataFrame({
                "id":   np.repeat(np.arange(window-1, window-1+n_win), window),
                "time": np.tile(np.arange(window), n_win),
                "value": W.reshape(-1),
                "kind": col
            })
            feats = extract_features(
                df_long,
                column_id="id", column_sort="time",
                column_kind="kind", column_value="value",
                default_fc_parameters=params,
                disable_progressbar=True, n_jobs=n_jobs
            )
            feats.index = idx[window-1:window-1+n_win]
            feats.columns = [f"tsf_{col}__{c}" for c in feats.columns]
            recs.append(feats)
        except Exception as e:
            print(f"[warn] {col}: {e}")

        if (i % log_every == 0) or (i == len(cols)):
            dt = time.time() - t0
            avg = dt / max(i, 1)
            eta = avg * (len(cols) - i)
            print(f"{i}/{len(cols)} avg {avg:.2f}s  eta {eta/60:.1f}m")
            sys.stdout.flush()

    if not recs:
        return pd.DataFrame(index=idx)

    F = pd.concat(recs, axis=1).sort_index()
    F = F.loc[:, F.var(axis=0).to_numpy() > 0]  # konstante Features weg

    if (y is not None) and (topk_global is not None):
        # gemeinsame Zeitachse
        tr = F.index.intersection(y.index)
        if initial_window:
            tr = tr[:min(initial_window, len(tr))]
        if len(tr) == 0:
            raise ValueError("Kein gemeinsamer Trainingsindex zwischen F und y.")

        # Korr auf dem gemeinsamen Trainingsfenster
        c = F.loc[tr].corrwith(y.loc[tr]).abs()
        c = c.replace([np.inf, -np.inf], np.nan).fillna(0.0)

        # konstante Spalten raus
        nonconst = F.loc[tr].var(axis=0).to_numpy() > 0
        c = c.loc[F.columns[nonconst]]

        k = int(min(topk_global, c.shape[0]))
        keep = list(c.sort_values(ascending=False).head(k).index)
        if len(keep) == 0:
            raise ValueError("Top-K-Auswahl leer – prüfe window/initial_window/Params.")

        print(f"Top-K tsfresh: keep={len(keep)}/{c.shape[0]}")
        F = F[keep]

    return F


In [4]:
# Slim feature file: keep the top 800 features, ranked on the first 108 rows
# of the index shared by the features and y.
F = precompute_tsfresh_rolling(
    X,
    window=12,
    mode="slim",
    n_jobs=0,
    y=y,
    topk_global=800,
    initial_window=108,
)
F.to_parquet("../data/processed/tsfresh_w12_slim.parquet")


5/156 avg 0.27s  eta 0.7m
10/156 avg 0.21s  eta 0.5m
15/156 avg 0.19s  eta 0.4m
20/156 avg 0.18s  eta 0.4m
25/156 avg 0.18s  eta 0.4m
30/156 avg 0.17s  eta 0.4m
35/156 avg 0.22s  eta 0.4m
40/156 avg 0.24s  eta 0.5m
45/156 avg 0.24s  eta 0.4m
50/156 avg 0.23s  eta 0.4m
55/156 avg 0.23s  eta 0.4m
60/156 avg 0.22s  eta 0.4m
65/156 avg 0.22s  eta 0.3m
70/156 avg 0.21s  eta 0.3m
75/156 avg 0.21s  eta 0.3m
80/156 avg 0.20s  eta 0.3m
85/156 avg 0.20s  eta 0.2m
90/156 avg 0.20s  eta 0.2m
95/156 avg 0.19s  eta 0.2m
100/156 avg 0.19s  eta 0.2m
105/156 avg 0.19s  eta 0.2m
110/156 avg 0.19s  eta 0.1m
115/156 avg 0.18s  eta 0.1m
120/156 avg 0.18s  eta 0.1m
125/156 avg 0.18s  eta 0.1m
130/156 avg 0.18s  eta 0.1m
135/156 avg 0.18s  eta 0.1m
140/156 avg 0.17s  eta 0.0m
145/156 avg 0.17s  eta 0.0m
150/156 avg 0.17s  eta 0.0m
155/156 avg 0.17s  eta 0.0m
156/156 avg 0.17s  eta 0.0m
Top-K tsfresh: keep=800/2028
