In [2]:
# %% [markdown]
# # AR(1) Rolling Forecast für IP_change
# - Pfad-Logik analog zum TSFresh-Notebook (PROJECT_ROOT, data/processed, ...)
# - lädt `data/processed/target.csv`
# - AR(1) mit expandierendem Fenster:
#   * erste 6 Monate (inkl. erstem NA) -> train (effektiv 5 Werte) -> predict Monat 7
#   * Monate 1..7 -> train -> predict Monat 8
#   * usw. bis zum Ende
# - Vorhersage für Monat t wird in der Zeile von Monat t gespeichert
# - schreibt `data/processed/AR.parquet`
# - Datumsindex im Output ist als menschenlesbarer String `YYYY-MM-DD` gespeichert

# %%
# --- Imports ---
import pandas as pd
import numpy as np
from pathlib import Path
import os, sys

from statsmodels.tsa.ar_model import AutoReg

# --- 1. Robuster Pfad-Setup (analog zu TSFresh-Notebook) ---
def _locate_repo_root(start: Path) -> Path:
    """Findet das Projektverzeichnis, indem es nach 'src' oder 'data' sucht."""
    cur = start.resolve()
    for _ in range(5):  # Bis zu 5 Ebenen nach oben suchen
        if (cur / 'src').exists() or (cur / 'data').exists():
            return cur
        if cur.parent == cur:
            break
        cur = cur.parent
    # Fallback: Nimm an, wir sind 2 Ebenen tief (Standard-Notebook-Ordner)
    print("Warnung: Konnte 'src' oder 'data' Ordner nicht finden. Rate Projekt-Root.")
    return start.resolve().parent.parent

NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = _locate_repo_root(NOTEBOOK_DIR)
print(f"PROJECT_ROOT gefunden in: {PROJECT_ROOT}")

# --- Params (relativ zum PROJECT_ROOT) ---
TARGET_PATH    = PROJECT_ROOT / "data/processed/target.csv"
INDEX_COL      = "date"
Y_COL          = "IP_change"
OUTPUT_PATH    = PROJECT_ROOT / "data/processed/AR.parquet"
INITIAL_WINDOW = 6  # erste 6 Monate (inkl. NA) für die Initial-Prognose

# --- Load target & basic checks (analog zu deiner load_y-Logik) ---
def load_y(path, y_col, index_col="date"):
    path = str(path)
    if path.endswith(".csv"):
        df = pd.read_csv(path, index_col=index_col, parse_dates=True)
    elif path.endswith(".parquet"):
        df = pd.read_parquet(path)
        if index_col in df.columns:
            df[index_col] = pd.to_datetime(df[index_col])
            df = df.set_index(index_col)
    else:
        raise ValueError("Bitte .csv oder .parquet verwenden.")
    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("Target benötigt DatetimeIndex.")
    s = df[y_col] if (isinstance(df, pd.DataFrame) and y_col in df.columns) else df.iloc[:, 0]
    s = pd.to_numeric(s, errors="coerce")  # NA in IP_change bleiben als NaN erhalten
    return s

print(f"Lade Target aus: {TARGET_PATH}")
y = load_y(TARGET_PATH, Y_COL, INDEX_COL)
idx = y.index
m = len(y)
print(f"Anzahl Beobachtungen im Target: {m}")

# --- AR(1) Rolling / Expandierendes Fenster ---
rows = []

# j ist der Index des Ziel-Monats (0-basiert); wir starten ab INITIAL_WINDOW,
# d.h. erste Vorhersage für idx[INITIAL_WINDOW] (Monat 7 bei 0-basierter Zählung),
# trainiert auf y[0:INITIAL_WINDOW] (erste 6 Monate, inkl. erstem NA -> dropna).
for j in range(INITIAL_WINDOW, m):
    # Trainingsdaten: alle Monate bis VOR Monat j
    train = y.iloc[:j].dropna()  # erster Monat (NA) wird hier entfernt

    # Mindestens 2 Beobachtungen für AR(1) nötig
    if len(train) < 2:
        print(f"[AR(1)] Überspringe j={j} (zu wenig Daten nach dropna: {len(train)})")
        continue

    # AR(1)-Modell: y_t = const + phi * y_{t-1} + e_t
    model = AutoReg(train, lags=1, old_names=False)
    res = model.fit()

    # Ein-Schritt-Vorhersage: 1 Schritt nach dem letzten Trainingspunkt
    # AutoReg indexiert intern 0..len(train)-1 -> start=end=len(train) = 1-Step-Ahead
    pred = float(res.predict(start=len(train), end=len(train))[0])

    # Vorhersage gilt für Monat j -> Datum idx[j]
    ts = idx[j]
    rows.append((ts, pred))

    if len(rows) % 50 == 0:
        print(f"[AR(1)] Schritt {len(rows)} -> Vorhersage für {ts}: {pred:.6f}")

print(f"[AR(1)] Fertig. Anzahl erzeugter Vorhersagen: {len(rows)}")

# --- DataFrame bauen & auf komplette Zeitachse reindizieren ---
AR_DF = pd.DataFrame(rows, columns=["date", "AR1_pred"]).set_index("date")

# Alle Monate auf voller Zielachse; frühe Monate ohne Forecast bleiben NaN
AR_DF = AR_DF.reindex(idx).astype("float32")

# Datumsindex in menschenlesbares Format wie bei Chronos (YYYY-MM-DD)
AR_DF.index = AR_DF.index.strftime("%Y-%m-%d")

# --- Save ---
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
AR_DF.to_parquet(OUTPUT_PATH)

print(f"[AR(1)] wrote: {OUTPUT_PATH}  shape={AR_DF.shape}")
print(AR_DF.head(15))


PROJECT_ROOT gefunden in: /Users/jonasschernich/Documents/Masterarbeit/Code
Lade Target aus: /Users/jonasschernich/Documents/Masterarbeit/Code/data/processed/target.csv
Anzahl Beobachtungen im Target: 408
[AR(1)] Schritt 50 -> Vorhersage für 1995-08-01 00:00:00: -0.036070
[AR(1)] Schritt 100 -> Vorhersage für 1999-10-01 00:00:00: 0.126957


  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, fr

[AR(1)] Schritt 150 -> Vorhersage für 2003-12-01 00:00:00: -0.210098
[AR(1)] Schritt 200 -> Vorhersage für 2008-02-01 00:00:00: -0.330406


  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend

[AR(1)] Schritt 250 -> Vorhersage für 2012-04-01 00:00:00: -0.138557


  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend

[AR(1)] Schritt 300 -> Vorhersage für 2016-06-01 00:00:00: 0.316919


  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend

[AR(1)] Schritt 350 -> Vorhersage für 2020-08-01 00:00:00: 0.039278
[AR(1)] Schritt 400 -> Vorhersage für 2024-10-01 00:00:00: 0.158664
[AR(1)] Fertig. Anzahl erzeugter Vorhersagen: 402
[AR(1)] wrote: /Users/jonasschernich/Documents/Masterarbeit/Code/data/processed/AR.parquet  shape=(408, 1)
            AR1_pred
date                
1991-01-01       NaN
1991-02-01       NaN
1991-03-01       NaN
1991-04-01       NaN
1991-05-01       NaN
1991-06-01       NaN
1991-07-01 -2.353891
1991-08-01  0.813373
1991-09-01 -0.003668
1991-10-01 -0.514414
1991-11-01 -0.975647
1991-12-01 -0.163460
1992-01-01 -0.020763
1992-02-01 -0.083696
1992-03-01  0.173998


  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend_index(index, steps, forecast_index)
  pred = float(res.predict(start=len(train), end=len(train))[0])
  self._init_dates(dates, freq)
  fcast_index = self._extend