In [2]:
# --- Imports ---
import pandas as pd
import numpy as np
from pathlib import Path
import os, sys

from statsmodels.tsa.ar_model import AutoReg


# --- 1) Robust path setup (similar to the TSFresh notebook) ---
def _locate_repo_root(start: Path) -> Path:
    """Find the project root by searching for a 'src' or 'data' folder."""
    cur = start.resolve()
    for _ in range(5):  # Search up to 5 levels up
        if (cur / "src").exists() or (cur / "data").exists():
            return cur
        if cur.parent == cur:
            break
        cur = cur.parent

    # Fallback: assume we are 2 levels deep (common notebook layout)
    print("Warning: Could not find 'src' or 'data' folder. Guessing project root.")
    return start.resolve().parent.parent


NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = _locate_repo_root(NOTEBOOK_DIR)
print(f"PROJECT_ROOT found at: {PROJECT_ROOT}")

# --- Params (relative to PROJECT_ROOT) ---
TARGET_PATH = PROJECT_ROOT / "data/processed/target.csv"
INDEX_COL = "date"
Y_COL = "IP_change"
OUTPUT_PATH = PROJECT_ROOT / "data/processed/AR.parquet"
INITIAL_WINDOW = 6  # first 6 months (incl. NA) for the initial forecast


# --- Load target & basic checks (similar to your load_y logic) ---
def load_y(path, y_col, index_col="date"):
    path = str(path)
    if path.endswith(".csv"):
        df = pd.read_csv(path, index_col=index_col, parse_dates=True)
    elif path.endswith(".parquet"):
        df = pd.read_parquet(path)
        if index_col in df.columns:
            df[index_col] = pd.to_datetime(df[index_col])
            df = df.set_index(index_col)
    else:
        raise ValueError("Please provide a .csv or .parquet file.")

    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("Target requires a DatetimeIndex.")

    s = df[y_col] if (isinstance(df, pd.DataFrame) and y_col in df.columns) else df.iloc[:, 0]
    s = pd.to_numeric(s, errors="coerce")  # keep NA values as NaN
    return s


print(f"Loading target from: {TARGET_PATH}")
y = load_y(TARGET_PATH, Y_COL, INDEX_COL)
idx = y.index
m = len(y)
print(f"Number of observations in target: {m}")

# --- AR(1) rolling / expanding window ---
rows = []

# j is the target month index (0-based). Start at INITIAL_WINDOW, i.e.:
# first forecast is for idx[INITIAL_WINDOW] (month 7 in 0-based counting),
# trained on y[0:INITIAL_WINDOW] (first 6 months; initial NA is dropped).
for j in range(INITIAL_WINDOW, m):
    # Training data: all months before month j
    train = y.iloc[:j].dropna()  # initial NA is removed here

    # Need at least 2 observations for AR(1)
    if len(train) < 2:
        print(f"[AR(1)] Skipping j={j} (too few data points after dropna: {len(train)})")
        continue

    # AR(1) model: y_t = const + phi * y_{t-1} + e_t
    model = AutoReg(train, lags=1, old_names=False)
    res = model.fit()

    # One-step-ahead forecast
    # AutoReg uses an internal 0..len(train)-1 index -> start=end=len(train) is 1-step ahead
    pred = float(res.predict(start=len(train), end=len(train))[0])

    # Forecast is for month j -> date idx[j]
    ts = idx[j]
    rows.append((ts, pred))

    if len(rows) % 50 == 0:
        print(f"[AR(1)] Step {len(rows)} -> forecast for {ts}: {pred:.6f}")

print(f"[AR(1)] Done. Number of forecasts generated: {len(rows)}")

# --- Build DataFrame & reindex to full timeline ---
AR_DF = pd.DataFrame(rows, columns=["date", "AR1_pred"]).set_index("date")

# Align to the full target index; early months without forecasts remain NaN
AR_DF = AR_DF.reindex(idx).astype("float32")

# Format date index as YYYY-MM-DD (human-readable, like in Chronos)
AR_DF.index = AR_DF.index.strftime("%Y-%m-%d")

# --- Save ---
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
AR_DF.to_parquet(OUTPUT_PATH)

print(f"[AR(1)] Wrote: {OUTPUT_PATH}  shape={AR_DF.shape}")
print(AR_DF.head(15))



PROJECT_ROOT gefunden in: /Users/jonasschernich/Documents/Masterarbeit/Code
Lade Target aus: /Users/jonasschernich/Documents/Masterarbeit/Code/data/processed/target.csv
Anzahl Beobachtungen im Target: 408
[AR(1)] Schritt 50 -> Vorhersage für 1995-08-01 00:00:00: -0.036070
[AR(1)] Schritt 100 -> Vorhersage für 1999-10-01 00:00:00: 0.126957


  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, fr

[AR(1)] Schritt 150 -> Vorhersage für 2003-12-01 00:00:00: -0.210098
[AR(1)] Schritt 200 -> Vorhersage für 2008-02-01 00:00:00: -0.330406


  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend

[AR(1)] Schritt 250 -> Vorhersage für 2012-04-01 00:00:00: -0.138557


  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend

[AR(1)] Schritt 300 -> Vorhersage für 2016-06-01 00:00:00: 0.316919


  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend

[AR(1)] Schritt 350 -> Vorhersage für 2020-08-01 00:00:00: 0.039278
[AR(1)] Schritt 400 -> Vorhersage für 2024-10-01 00:00:00: 0.158664
[AR(1)] Fertig. Anzahl erzeugter Vorhersagen: 402
[AR(1)] wrote: /Users/jonasschernich/Documents/Masterarbeit/Code/data/processed/AR.parquet  shape=(408, 1)
            AR1_pred
date                
1991-01-01       NaN
1991-02-01       NaN
1991-03-01       NaN
1991-04-01       NaN
1991-05-01       NaN
1991-06-01       NaN
1991-07-01 -2.353891
1991-08-01  0.813373
1991-09-01 -0.003668
1991-10-01 -0.514414
1991-11-01 -0.975647
1991-12-01 -0.163460
1992-01-01 -0.020763
1992-02-01 -0.083696
1992-03-01  0.173998


  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend