In [1]:
# Setup: Keep the notebook reproducible and self-contained; no hidden state across runs.
from __future__ import annotations

from pathlib import Path
import sys
import warnings
import logging
from typing import Iterable, Mapping

import yaml
import pandas as pd

# Logging first: INFO-level root configuration with a terse "LEVEL: message" layout,
# plus a named logger for this notebook's own messages.
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
logger = logging.getLogger("features_build")

# Blanket-silence Python warnings for the rest of the session to keep cell output compact.
# NOTE(review): this also hides deprecation notices raised by libraries — narrow the
# filter (category/module) if those should stay visible.
warnings.filterwarnings("ignore")

def find_project_root(start: Path) -> Path:
    """Walk upward from ``start`` until a folder holding both 'config' and 'src' is found.

    Args:
        start: Directory where the search begins; it is resolved to an absolute path first.

    Returns:
        The nearest directory (``start`` itself or one of its ancestors) that contains
        both a 'config' and a 'src' entry.

    Raises:
        AssertionError: If no directory on the way up to the filesystem root qualifies.
    """
    origin = start.resolve()
    # Nearest match wins: the start folder is inspected before any of its ancestors.
    located = next(
        (
            folder
            for folder in (origin, *origin.parents)
            if (folder / "config").exists() and (folder / "src").exists()
        ),
        None,
    )
    if located is None:
        raise AssertionError("Project folders missing: expected 'config' and 'src' somewhere above the notebook.")
    return located

# Locate the repository root starting from the kernel's working directory.
ROOT = find_project_root(Path.cwd())
# Re-check the two marker folders the search is keyed on before relying on ROOT.
assert all((ROOT / marker).exists() for marker in ("config", "src")), "Project folders missing."
print("Project root:", ROOT)

# Put the repository root on the import path (appended at most once) so that
# `src` can be imported as a package from this notebook.
root_str = str(ROOT)
if sys.path.count(root_str) == 0:
    sys.path.append(root_str)

# Import the feature builder (project-local; relies on the path entry added above).
from src.features.feature_engineering import FeatureSpec, build_feature_matrix

Project root: C:\Users\gamer\Desktop\AktienPrognose


In [2]:
# Helpers
def _load_yaml(cfg_path: Path) -> dict:
    if not cfg_path.exists():
        raise FileNotFoundError(f"Config not found: {cfg_path}")
    try:
        with cfg_path.open("r", encoding="utf-8") as f:
            return yaml.safe_load(f) or {}
    except Exception as e:
        raise RuntimeError(f"Failed to parse YAML: {cfg_path}") from e


def _require_keys(obj: Mapping, keys: Iterable[str], ctx: str) -> None:
    for k in keys:
        if k not in obj:
            raise KeyError(f"Missing key '{k}' in {ctx}")


def _ensure_datetime_index_sorted(df: pd.DataFrame) -> pd.DataFrame:
    """Validate DatetimeIndex and return a sorted copy.
    Args: df: Input DataFrame.
    Returns: Sorted DataFrame.
    Raises: TypeError: If index is not DatetimeIndex."""
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError("Index must be DatetimeIndex.")
    return df if df.index.is_monotonic_increasing else df.sort_index()


def _safe_write_parquet_csv(df: pd.DataFrame, pq_path: Path, csv_path: Path) -> None:
    """Persist DataFrame as Parquet and CSV with robust error context.
    Args:
        df: DataFrame to persist.
        pq_path: Parquet destination path.
        csv_path: CSV destination path.
    Raises: RuntimeError: If any write fails."""
    pq_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        df.to_parquet(pq_path)
    except Exception as e:
        raise RuntimeError(f"Failed to write Parquet to {pq_path}: {e}") from e
    try:
        df.to_csv(csv_path, index=True)
    except Exception as e:
        raise RuntimeError(f"Failed to write CSV to {csv_path}: {e}") from e

In [3]:
# Config: read the dataset date window from the project YAML.
CFG_PATH: Path = ROOT.joinpath("config", "data_config.yaml")
cfg: dict = _load_yaml(CFG_PATH)

# Fail fast, naming the missing key, if the expected structure is absent.
_require_keys(cfg, ["dataset"], "config/data_config.yaml")
_require_keys(cfg["dataset"], ["start_date", "end_date"], "config.dataset")

# Both bounds are taken as-is from the config; they are converted to timestamps where compared.
start, end = (cfg["dataset"][bound] for bound in ("start_date", "end_date"))
print("Config window:", start, "→", end)

Config window: 2008-01-01 → 2025-06-30


In [4]:
# Load raw: read the raw panel written by the download step and put it in date order.
RAW = ROOT.joinpath("artifacts", "data", "raw_data.parquet")
if not RAW.exists():
    raise FileNotFoundError(f"Raw parquet not found: {RAW}. Run the data download step first.")

df_raw = pd.read_parquet(RAW)
df_raw = _ensure_datetime_index_sorted(df_raw)  # rejects non-datetime indexes, sorts if needed

# Compact overview: dimensions, covered date span (date part only), and column names.
print("Raw shape:", df_raw.shape)
print(
    "Raw range:",
    str(df_raw.index.min())[:10],
    "→",
    str(df_raw.index.max())[:10],
)
print(
    "Columns:",
    list(df_raw.columns)[:20],
    "..." if df_raw.shape[1] > 20 else "",  # WHY: Trim long lists.
)

Raw shape: (210, 10)
Raw range: 2008-01-31 → 2025-06-30
Columns: ['SP500', 'FedFundsRate', 'CPI', 'UnemploymentRate', 'VIX', 'EPU_US', 'FSI', 'Gold_USD_oz', 'WTI_Spot', 'USD_per_EUR'] 


In [5]:
# Build features: map each input role the feature builder expects onto the column
# name used in the raw frame. The names on the right must match the columns of
# df_raw (compare with the "Columns:" listing printed when the raw data was loaded).
spec = FeatureSpec(
    market_col="SP500",
    cpi_col="CPI",
    fedfunds_col="FedFundsRate",
    unemployment_col="UnemploymentRate",
    vix_col="VIX",
    epu_col="EPU_US",
    fsi_col="FSI",
    gold_col="Gold_USD_oz",
    wti_col="WTI_Spot",
    usdeur_col="USD_per_EUR",
)

# Build the feature matrix together with the next-month targets.
# NOTE(review): the builder is described as leakage-safe (row t uses only information
# available at t); that guarantee lives in src.features.feature_engineering and
# cannot be confirmed from this notebook.
features = build_feature_matrix(df_raw, spec=spec)

# Quick diagnostics. `attrs.get` prints None if the builder did not record "dropped_rows".
print("Features shape:", features.shape)
print("Dropped rows due to windows/targets:", features.attrs.get("dropped_rows"))

# Show the last rows as the cell's display value to eyeball date and target alignment.
features.tail(3)

Features shape: (196, 16)
Dropped rows due to windows/targets: 14


Unnamed: 0,Return_Lag1,3M_SMA_Return,12M_SMA_Return,3M_Momentum,Volatility_6M,FedFunds_Delta_bps,Inflation_YoY_pct,UnemploymentRate,VIX,EPU_US,FSI,Gold_USD_oz,WTI_Spot,USD_per_EUR,y_return_next_pct,y_direction_next
2025-03-31,-1.424209,-1.49235,0.60894,-4.58682,4.041461,0.0,2.81427,4.1,22.28,228.62756,-0.4622,275.387142,68.24,1.0796,-0.762494,0
2025-04-30,-5.75447,-2.647058,0.892191,-7.805106,4.035583,0.0,2.405585,4.2,24.7,296.92294,0.045375,296.956668,63.54,1.1349,6.152383,1
2025-05-31,-0.762494,-0.121527,1.004713,-0.718953,4.164134,0.0,2.333747,4.2,18.57,296.92294,-0.5507,302.891427,62.17,1.1347,4.421407,1


In [6]:
# Validate persist: last checks on the feature frame before anything is written to disk.
if features.empty:
    raise RuntimeError("Empty features frame — check input coverage and config column names.")
if not features.columns.is_unique:
    # Report every column name that occurs more than once.
    dupes = pd.Series(features.columns).value_counts()
    raise RuntimeError(f"Duplicate columns in features: {dupes[dupes > 1].index.tolist()}")

# The feature index must lie inside the configured [start, end] window (bounds inclusive).
if pd.to_datetime(start) > features.index.min():
    raise RuntimeError("Features begin before configured start date.")
if pd.to_datetime(end) < features.index.max():
    raise RuntimeError("Features extend beyond configured end date.")

# Peek at whichever target columns are actually present.
target_cols = [name for name in (spec.y_ret_next, spec.y_dir_next) if name in features.columns]
if len(target_cols) > 0:
    print(features[target_cols].tail(5))

# Write both artifacts side by side: Parquet for downstream steps, CSV for quick inspection.
OUT_PQ = ROOT.joinpath("artifacts", "data", "features_monthly.parquet")
OUT_CSV = OUT_PQ.with_name("features_monthly.csv")
_safe_write_parquet_csv(features, OUT_PQ, OUT_CSV)
print("Saved features →", OUT_PQ, "and", OUT_CSV)


            y_return_next_pct  y_direction_next
2025-01-31          -1.424209                 0
2025-02-28          -5.754470                 0
2025-03-31          -0.762494                 0
2025-04-30           6.152383                 1
2025-05-31           4.421407                 1
Saved features → C:\Users\gamer\Desktop\AktienPrognose\artifacts\data\features_monthly.parquet and C:\Users\gamer\Desktop\AktienPrognose\artifacts\data\features_monthly.csv


In [7]:
# Report: short end-of-run summary — predictor names, covered period, target distribution.
feature_only_cols = [
    col
    for col in features.columns
    if col != spec.y_ret_next and col != spec.y_dir_next
]
print("Feature columns:", feature_only_cols)
print(
    "Period:",
    str(features.index.min())[:10],
    "→",
    str(features.index.max())[:10],
)

# Percentiles of the regression target make gross scaling mistakes easy to spot.
if spec.y_ret_next not in features.columns:
    logger.warning(f"Target column '{spec.y_ret_next}' not found in features.")
else:
    desc = features[spec.y_ret_next].describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
    print("Target stats:\n", desc.to_string())

Feature columns: ['Return_Lag1', '3M_SMA_Return', '12M_SMA_Return', '3M_Momentum', 'Volatility_6M', 'FedFunds_Delta_bps', 'Inflation_YoY_pct', 'UnemploymentRate', 'VIX', 'EPU_US', 'FSI', 'Gold_USD_oz', 'WTI_Spot', 'USD_per_EUR']
Period: 2009-02-28 → 2025-05-31
Target stats:
 count    196.000000
mean       1.180647
std        4.229466
min      -12.511928
5%        -6.343248
25%       -1.443139
50%        1.787186
75%        3.623017
95%        7.897915
max       12.684404


In [8]:
# Results: render the project progress checklist as Markdown in the cell output.
# The checklist is a static, hand-maintained string; nothing in it is derived from this run,
# so the boxes must be ticked manually when a later notebook is completed.
from IPython.display import Markdown, display
display(Markdown("""
**Project Checklist**
- [x] Data loaded (`artifacts/data/raw_data.parquet`)
- [x] Raw audit (`01_data_raw_audit.ipynb`)
- [x] Build features (`02_build_features_monthly.ipynb`)
- [ ] EDA (`03_eda.ipynb`)
- [ ] Baselines (`10_train_eval_baselines.ipynb`)
- [ ] Linear models (`20_train_eval_linear.ipynb`)
- [ ] ARIMA (`21_train_eval_arima.ipynb`)
- [ ] Random Forest (`22_train_eval_random_forest.ipynb`)
- [ ] LSTM (`23_train_eval_lstm.ipynb`)
- [ ] Ensembles (`90_ensembles.ipynb`)
- [ ] Final report (`99_report_reproducibility.ipynb`)
"""))


**Project Checklist**
- [x] Data loaded (`artifacts/data/raw_data.parquet`)
- [x] Raw audit (`01_data_raw_audit.ipynb`)
- [x] Build features (`02_build_features_monthly.ipynb`)
- [ ] EDA (`03_eda.ipynb`)
- [ ] Baselines (`10_train_eval_baselines.ipynb`)
- [ ] Linear models (`20_train_eval_linear.ipynb`)
- [ ] ARIMA (`21_train_eval_arima.ipynb`)
- [ ] Random Forest (`22_train_eval_random_forest.ipynb`)
- [ ] LSTM (`23_train_eval_lstm.ipynb`)
- [ ] Ensembles (`90_ensembles.ipynb`)
- [ ] Final report (`99_report_reproducibility.ipynb`)
