In [1]:
from pathlib import Path
import sys, yaml
import pandas as pd

# Project root: the notebook must be started from the repository root, which is
# verified by checking for the two directories this notebook relies on.
ROOT = Path.cwd()
assert (ROOT/"src").exists() and (ROOT/"artifacts/data").exists(), "Project structure missing."

# Make the local package importable. Guarded so that re-running this cell does
# not keep appending duplicate entries to sys.path.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# Config: supplies the date window that later sanity checks compare against.
with open(ROOT/"config/data_config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f) or {}

# An empty or incomplete YAML would otherwise fail with a bare KeyError/TypeError
# on the lookup below; name the missing keys explicitly instead.
dataset_cfg = cfg.get("dataset") if isinstance(cfg, dict) else None
if not isinstance(dataset_cfg, dict):
    dataset_cfg = {}
missing_keys = [key for key in ("start_date", "end_date") if key not in dataset_cfg]
if missing_keys:
    raise KeyError(
        f"config/data_config.yaml: missing key(s) under 'dataset': {missing_keys}"
    )

start, end = dataset_cfg["start_date"], dataset_cfg["end_date"]
print("Config window:", start, "→", end)

# Import the feature builder (must happen after ROOT is on sys.path).
from src.features.feature_engineering import FeatureSpec, build_feature_matrix

In [2]:
# Raw dataset written by the upstream notebook; fail early if it is absent.
RAW = ROOT / "artifacts" / "data" / "raw_data.parquet"
if not RAW.exists():
    raise FileNotFoundError(f"Raw parquet not found: {RAW}. Run 00_index first.")

df_raw = pd.read_parquet(RAW)
assert isinstance(df_raw.index, pd.DatetimeIndex), "Index must be DatetimeIndex."

# Ensure chronological order; only sort when the index is not already ascending.
df_raw = df_raw if df_raw.index.is_monotonic_increasing else df_raw.sort_index()

# Short overview of what was loaded (dates truncated to YYYY-MM-DD).
raw_first_day = str(df_raw.index.min())[:10]
raw_last_day = str(df_raw.index.max())[:10]
print(f"Raw shape: {df_raw.shape}")
print(f"Raw range: {raw_first_day} → {raw_last_day}")
print(f"Columns: {list(df_raw.columns)}")

Module erfolgreich importiert


In [None]:
# Mapping of FeatureSpec fields to the column names present in the raw data.
# These must match the names produced by the loader/config.
spec_columns = {
    "market_col": "SP500",  # from data_config.yaml (name for ^GSPC)
    "cpi_col": "CPI",
    "fedfunds_col": "FedFundsRate",
    "unemployment_col": "UnemploymentRate",
    "vix_col": "VIX",
    "epu_col": "EPU_US",
    "fsi_col": "FSI",
    "gold_col": "Gold_USD_oz",
    "wti_col": "WTI_Spot",
    "usdeur_col": "USD_per_EUR",
}
spec = FeatureSpec(**spec_columns)

# Build the feature matrix (intended to be leakage-safe, with next-month targets).
features = build_feature_matrix(df_raw, spec=spec)
print(f"Features shape: {features.shape}")
print(f"Dropped rows due to windows/targets: {features.attrs.get('dropped_rows')}")
features.tail(3)

In [None]:
# Final sanity checks.
# An empty matrix makes index.min()/max() NaT, and any comparison with NaT is
# False, so without this explicit check the window assertions below would fail
# with a misleading (and previously message-less) AssertionError.
assert len(features) > 0, "Feature matrix is empty; nothing to validate or save."

window_start, window_end = pd.to_datetime(start), pd.to_datetime(end)
first_ts, last_ts = features.index.min(), features.index.max()
assert first_ts >= window_start, (
    f"Features start at {first_ts}, before the configured start_date {window_start}."
)
assert last_ts <= window_end, (
    f"Features end at {last_ts}, after the configured end_date {window_end}."
)
assert features.columns.is_unique, "Duplicate columns in features."

# Quick target visibility
print(features[[spec.y_ret_next, spec.y_dir_next]].tail(5))

# Persist (parquet + optional CSV for quick inspection)
OUT_PQ = ROOT/"artifacts/data/features_monthly.parquet"
OUT_CSV = ROOT/"artifacts/data/features_monthly.csv"

features.to_parquet(OUT_PQ)
features.to_csv(OUT_CSV, index=True)
print("Saved features →", OUT_PQ, "and", OUT_CSV)

In [None]:
# Everything that is not one of the two target columns counts as a feature.
target_cols = (spec.y_ret_next, spec.y_dir_next)
feature_cols = [col for col in features.columns if col not in target_cols]
print("Feature columns:", feature_cols)

# Covered period, truncated to YYYY-MM-DD.
period_start = str(features.index.min())[:10]
period_end = str(features.index.max())[:10]
print(f"Period: {period_start} → {period_end}")

# Distribution summary of the next-period return target.
ret_summary = features[spec.y_ret_next].describe(percentiles=[.05, .25, .5, .75, .95])
print("Target stats:", ret_summary.to_string())

In [None]:
# Render a short, human-readable project checklist inline so the next steps are
# visible without leaving the notebook.
from IPython.display import Markdown, display

checklist_md = """
**Project Checklist**
- [x] Data loaded (`artifacts/data/raw_data.parquet`)
- [x] Raw audit (`01_data_raw_audit.ipynb`)
- [x] Build features (`02_build_features_monthly.ipynb`)
- [ ] EDA (`03_eda.ipynb`)
- [ ] Baselines (`10_train_eval_baselines.ipynb`)
- [ ] Linear models (`20_train_eval_linear.ipynb`)
- [ ] ARIMA (`21_train_eval_arima.ipynb`)
- [ ] Random Forest (`22_train_eval_random_forest.ipynb`)
- [ ] LSTM (`23_train_eval_lstm.ipynb`)
- [ ] Ensembles (`90_ensembles.ipynb`)
- [ ] Final report (`99_report_reproducibility.ipynb`)
"""
display(Markdown(checklist_md))