# TSFresh Slim Precompute


In [1]:
# --- Imports ---
from pathlib import Path

import numpy as np
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series


# --- 1) Robust path setup ---
def _locate_repo_root(start: Path) -> Path:
    """Return the closest ancestor of *start* containing a 'src' or 'data' entry.

    The resolved *start* itself and up to four of its parents are inspected
    (five candidates in total), nearest first. When none of them qualifies,
    a warning is printed and the grandparent of the resolved *start* is
    returned as a best guess.
    """
    origin = start.resolve()
    markers = ("src", "data")

    # origin first, then its ancestors; never more than five candidates overall
    for candidate in (origin, *origin.parents)[:5]:
        if any((candidate / marker).exists() for marker in markers):
            return candidate

    # Fallback: assume a layout where the start directory sits two levels deep
    print("Warning: Could not find 'src' or 'data' folder. Guessing project root.")
    return origin.parent.parent


# All paths below are anchored on the current working directory at run time.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = _locate_repo_root(NOTEBOOK_DIR)
print(f"PROJECT_ROOT found at: {PROJECT_ROOT}")

# --- Params (relative to PROJECT_ROOT) ---
TARGET_PATH = PROJECT_ROOT / "data/processed/target.csv"  # input file holding the target series
INDEX_COL = "date"  # name of the date column used as the index when loading
Y_COL = "IP_change"  # target column the features are computed from
OUTPUT_PATH = PROJECT_ROOT / "data/processed/tsfresh_slim.parquet"  # where the feature table is written

WINDOW_SIZES = [6, 12]  # window sizes W iterated by the extraction loop (counted in rows)
N_JOBS = 4  # forwarded to tsfresh's extract_features as n_jobs


# --- Load target & basic checks ---
def load_y(path, y_col: str, index_col: str = "date") -> pd.Series:
    """Load the target series from a .csv or .parquet file.

    Args:
        path: File location (str or path-like). The extension selects the
            reader and is matched case-insensitively.
        y_col: Name of the target column. If it is absent, the first column
            is used instead and a warning is printed.
        index_col: Name of the date column. For CSV it is read as the index
            with date parsing enabled; for parquet it is converted with
            ``pd.to_datetime`` and set as index only when it is present as a
            regular column.

    Returns:
        The selected column passed through ``pd.to_numeric(errors="coerce")``,
        so non-numeric entries become NaN. Rows are neither dropped nor
        reordered here.

    Raises:
        ValueError: If the extension is neither .csv nor .parquet, or the
            loaded frame does not have a DatetimeIndex.
    """
    path = str(path)
    # Compare in lower case so e.g. 'target.CSV' is accepted as well.
    lowered = path.lower()
    if lowered.endswith(".csv"):
        df = pd.read_csv(path, index_col=index_col, parse_dates=True)
    elif lowered.endswith(".parquet"):
        df = pd.read_parquet(path)
        if index_col in df.columns:
            df[index_col] = pd.to_datetime(df[index_col])
            df = df.set_index(index_col)
    else:
        raise ValueError("Please use a .csv or .parquet file.")

    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("Target must have a DatetimeIndex.")

    if y_col in df.columns:
        s = df[y_col]
    else:
        # Best-effort fallback is kept, but made visible: silently picking a
        # different column would yield features for the wrong series.
        s = df.iloc[:, 0]
        print(f"Warning: column {y_col!r} not found. Using first column {s.name!r} instead.")

    return pd.to_numeric(s, errors="coerce")


# Target series without missing observations; its index is the real timeline.
y = load_y(TARGET_PATH, Y_COL, INDEX_COL).dropna()
idx = y.index

# --- Long format for tsfresh: one series id, positional integer time axis ---
long = pd.DataFrame(
    {
        "time": np.arange(len(y), dtype=np.int64),  # 0..n-1, position i <-> idx[i]
        "id": f"target.{Y_COL}",
        "value": y.reset_index(drop=True),
    }
)
long = long.sort_values(["id", "time"], kind="mergesort")

# --- Feature extraction per window size ---
# For every W in WINDOW_SIZES: cut the series into windows of exactly W rows,
# extract tsfresh features per window, and index the result by the real
# timestamp of each window's last row. One float32 frame per W is collected
# in `blocks`.
blocks = []

# TSFresh parameters (fast & robust defaults). Independent of W, so built once.
fc = EfficientFCParameters()

for W in WINDOW_SIZES:
    print(f"[TSF] window={W}")

    # Build rolling windows of length W (window end = current t).
    # min_timeshift drops the shorter windows at the start of the series;
    # max_timeshift caps the look-back. With max_timeshift=None the windows
    # would keep growing (expanding windows): every W would then describe
    # nearly the same data and the rolled frame would grow quadratically.
    rolled = roll_time_series(
        long,
        column_id="id",
        column_sort="time",
        min_timeshift=W - 1,  # require full window length
        max_timeshift=W - 1,  # ...and never look back further than W rows
    )

    # Ensure numeric values and drop rows whose value could not be parsed
    rolled["value"] = pd.to_numeric(rolled["value"], errors="coerce")
    rolled = rolled.dropna(subset=["value"])

    feats = extract_features(
        rolled,
        column_id="id",
        column_sort="time",
        default_fc_parameters=fc,
        n_jobs=N_JOBS,
        disable_progressbar=True,
        pivot=True,
    )

    # Map each rolled window id to its integer end time, then to the real timestamp
    end_map = rolled.groupby("id", sort=False)["time"].max()  # rolled-id -> end_time (int)
    feats["__end_time__"] = feats.index.to_series().map(end_map)

    # Windows without a known end time cannot be placed on the timeline
    feats = feats.dropna(subset=["__end_time__"])
    feats["__time__"] = feats["__end_time__"].astype(int).map(
        lambda t: idx[t] if (0 <= t < len(idx)) else pd.NaT
    )
    feats = (
        feats.drop(columns=["__end_time__"])
        .dropna(subset=["__time__"])
        .set_index("__time__")
        .sort_index()
    )

    # Prefix with the window size so columns from different W never collide,
    # then enforce float32 and a stable column order
    feats = feats.add_prefix(f"tsf_w{W}__")
    feats = feats.apply(pd.to_numeric, errors="coerce").astype("float32")
    feats = feats.reindex(sorted(feats.columns), axis=1)

    blocks.append(feats)

# --- Assemble the per-window blocks on the full target timeline ---
if blocks:
    OUT = pd.concat(blocks, axis=1).sort_index()
else:
    OUT = pd.DataFrame(index=idx)

# Align to every target timestamp (early rows are NaN by design), force
# float32, and order the columns alphabetically so the schema is stable.
OUT = OUT.reindex(idx).astype("float32")
OUT = OUT.reindex(sorted(OUT.columns), axis=1)

# Lag everything by one row so each row's features stay strictly behind its target
OUT = OUT.shift(1)

# --- Save ---
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
OUT.to_parquet(OUTPUT_PATH)

# --- Diagnostics ---
# Share of non-null cells once the longest window had enough history.
wmax = max(WINDOW_SIZES, default=0)
valid_slice = OUT if wmax <= 0 else OUT.iloc[wmax:]
if len(valid_slice):
    nnr = float(valid_slice.notna().mean().mean())
else:
    nnr = float("nan")
print(f"[TSF] wrote: {OUTPUT_PATH}  shape={OUT.shape}  non-null ratio after max window≈{nnr:.3f}")


PROJECT_ROOT gefunden in: /Users/jonasschernich/Documents/Masterarbeit/Code
[TSF] window=6


Rolling: 100%|██████████| 20/20 [00:01<00:00, 10.31it/s]
Process SpawnPoolWorker-8:
Process SpawnPoolWorker-5:
Process SpawnPoolWorker-6:
Process SpawnPoolWorker-7:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/tsfresh/utilities/distribution.py", line 43, in _function_with_partly_reduce
    results = list(itertools.chain.from_iterable(results))
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "

KeyboardInterrupt: 