# TSFresh Slim Precompute (Windowed, Causal)
Erzeugt eine **schlanke**, kausal korrekte tsfresh-Featurematrix als Parquet.
- nutzt `EfficientFCParameters`
- fensterbasiert via `roll_time_series`
- **kein** shift hier; shift(1) passiert beim Merge.

In [6]:
# --- Imports ---
import pandas as pd
import numpy as np
from pathlib import Path
import os, sys  # Hinzugefügt

from tsfresh import extract_features
from tsfresh.feature_extraction.settings import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series

# --- 1. Robuster Pfad-Setup ---
def _locate_repo_root(start: Path) -> Path:
    """Findet das Projektverzeichnis, indem es nach 'src' oder 'data' sucht."""
    cur = start.resolve()
    for _ in range(5):  # Bis zu 5 Ebenen nach oben suchen
        if (cur / 'src').exists() or (cur / 'data').exists():
            return cur
        if cur.parent == cur:
            break
        cur = cur.parent
    # Fallback: Nimm an, wir sind 2 Ebenen tief (Standard-Notebook-Ordner)
    print("Warnung: Konnte 'src' oder 'data' Ordner nicht finden. Rate Projekt-Root.")
    return start.resolve().parent.parent

NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = _locate_repo_root(NOTEBOOK_DIR)
print(f"PROJECT_ROOT gefunden in: {PROJECT_ROOT}")

# --- Params (Jetzt relativ zum PROJECT_ROOT) ---
TARGET_PATH   = PROJECT_ROOT / "data/processed/target.csv"
INDEX_COL     = "date"
Y_COL         = "IP_change"
OUTPUT_PATH   = PROJECT_ROOT / "data/processed/tsfresh_slim.parquet"

WINDOW_SIZES  = [6, 12]
N_JOBS        = 4

# --- Imports ---
import pandas as pd
import numpy as np
from pathlib import Path

from tsfresh import extract_features
from tsfresh.feature_extraction.settings import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series

# --- Load target & basic checks ---
def load_y(path, y_col, index_col="date"):
    path = str(path)
    if path.endswith(".csv"):
        df = pd.read_csv(path, index_col=index_col, parse_dates=True)
    elif path.endswith(".parquet"):
        df = pd.read_parquet(path)
        if index_col in df.columns:
            df[index_col] = pd.to_datetime(df[index_col])
            df = df.set_index(index_col)
    else:
        raise ValueError("Bitte .csv oder .parquet verwenden.")
    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("Target benötigt DatetimeIndex.")
    s = df[y_col] if (isinstance(df, pd.DataFrame) and y_col in df.columns) else df.iloc[:, 0]
    s = pd.to_numeric(s, errors="coerce")
    return s

y = load_y(TARGET_PATH, Y_COL, INDEX_COL).dropna()
idx = y.index

# --- Wide -> Long (eine Serie), Integer-Zeitachse ---
long = (
    y.to_frame(name="value")
     .reset_index()
     .rename(columns={INDEX_COL: "time"})
)
long["time"] = np.arange(len(long), dtype=np.int64)   # 0..n-1 (Integer-Zeit)
long["id"]   = f"target.{Y_COL}"
long = long[["time", "id", "value"]].sort_values(["id", "time"], kind="mergesort")

# --- Feature-Extraktion je Fenstergröße ---
blocks = []
for W in WINDOW_SIZES:
    print(f"[TSF] window={W}")
    # Rolling-Fenster: erzeugt Panel mit Fenstern der Länge W; Fensterende = jeweiliges t
    rolled = roll_time_series(
        long,
        column_id="id",
        column_sort="time",
        min_timeshift=W - 1,     # erfordert volle W-Länge
        max_timeshift=None       # alle möglichen Fenster
    )

    # Numerik absichern / lückenhafte Fenster verwerfen (kausal unkritisch)
    rolled["value"] = pd.to_numeric(rolled["value"], errors="coerce")
    rolled = rolled.dropna(subset=["value"])

    # TSFresh-Parameter (performant & robust)
    fc = EfficientFCParameters()

    feats = extract_features(
        rolled,
        column_id="id",
        column_sort="time",
        default_fc_parameters=fc,
        n_jobs=N_JOBS,
        disable_progressbar=True,
        pivot=True
    )

    # Fensterende (Integer-Zeit) pro rolled-id ermitteln und auf echten Zeitstempel mappen
    end_map = rolled.groupby("id", sort=False)["time"].max()     # Series: rolled-id -> end_time (int)
    feats["__end_time__"] = feats.index.to_series().map(end_map)

    # Nur gültige Fenster behalten und DatetimeIndex setzen (Index = t)
    feats = feats.dropna(subset=["__end_time__"])
    feats["__time__"] = feats["__end_time__"].astype(int).map(
        lambda t: idx[t] if (0 <= t < len(idx)) else pd.NaT
    )
    feats = (
        feats.drop(columns=["__end_time__"])
             .dropna(subset=["__time__"])
             .set_index("__time__")
             .sort_index()
    )

    # Präfix & Typ; Spalten konsistent sortieren
    feats = feats.add_prefix(f"tsf_w{W}__")
    feats = feats.apply(pd.to_numeric, errors="coerce").astype("float32")
    feats = feats.reindex(sorted(feats.columns), axis=1)

    blocks.append(feats)

# --- Zusammenführen & auf volle Zielachse reindexen ---
if blocks:
    OUT = pd.concat(blocks, axis=1).sort_index()
else:
    OUT = pd.DataFrame(index=idx)

OUT = OUT.reindex(idx)                       # volle Zeitachse, frühe Monate ggf. NaN (erwartet)
OUT = OUT.astype("float32")

# Optional: Spalten alphabetisch sortieren (stabile Ordnung)
OUT = OUT.reindex(sorted(OUT.columns), axis=1)

# --- Save ---
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
OUT.to_parquet(OUTPUT_PATH)

# --- Diagnostics ---
wmax = max(WINDOW_SIZES) if WINDOW_SIZES else 0
valid_slice = OUT.iloc[wmax:] if wmax > 0 else OUT
nnr = float(valid_slice.notna().mean().mean()) if len(valid_slice) else float("nan")
print(f"[TSF] wrote: {OUTPUT_PATH}  shape={OUT.shape}  non-null ratio after max window≈{nnr:.3f}")

# Hinweise:
# - Kein zusätzlicher shift() beim späteren Merge: Index t ist bereits korrekt für Prognose von y_{t+1}.
# - Standardisierung/Screening/DR in der Haupt-Pipeline strikt train-only fitten.
# - Fehlende Werte am Anfang sind durch minimale Fensterlängen bedingt (z. B. W=12 -> erste 11 Zeilen NaN).


PROJECT_ROOT gefunden in: /Users/jonasschernich/Documents/Masterarbeit/Code
[TSF] window=6


Rolling: 100%|██████████| 20/20 [00:02<00:00,  8.37it/s]


[TSF] window=12


Rolling: 100%|██████████| 20/20 [00:02<00:00,  9.79it/s]


[TSF] wrote: /Users/jonasschernich/Documents/Masterarbeit/Code/data/processed/tsfresh_slim_test.parquet  shape=(407, 1554)  non-null ratio after max window≈0.883
