In [1]:
# Setup: Top-cell setup keeps the notebook reproducible and idempotent across runs.
from __future__ import annotations

from pathlib import Path  # Pathlib paths are safer/cross-platform versus raw strings.
import sys  # We'll adjust sys.path if needed to import local packages from 'src'.
import importlib  # Lightweight smoke-tests for optional/runtime dependencies.
import warnings  # Downstream libs can be noisy thus suppress to keep output readable.
import logging  # Logging > prints for structured status.
from typing import Iterable  # For type hints on small helpers.

import yaml  # Config-driven pipeline.
import pandas as pd  # WHY: Core data structure for time series and IO.
import numpy as np  # WHY: Ensure NumPy is available; some ops rely on it.

# Minimal, readable logging configuration for interactive work.
# NOTE(review): the filter below has no category/module restriction, so it silences
# every Python warning (DeprecationWarning, FutureWarning, ...), not just data-pull
# noise. Consider narrowing it if warnings ever need to surface.
warnings.filterwarnings("ignore")  # Blanket suppression of all warnings to keep cell output readable.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")  # INFO and above, rendered as "LEVEL: message".
logger = logging.getLogger("setup")  # Named logger for this setup notebook.

def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upward from ``start``.

    The root is the nearest directory (``start`` itself included) that
    contains both a 'config' and a 'src' entry.

    Args:
        start: Directory to begin the search from; it is resolved to an
            absolute path first.

    Returns:
        The first matching directory, searching from ``start`` toward the
        filesystem root.

    Raises:
        AssertionError: If no directory on the way up has both entries.
    """
    origin = start.resolve()
    # Closest candidate first, so a nested project wins over an outer one.
    candidates = (origin, *origin.parents)
    match = next(
        (folder for folder in candidates
         if (folder / "config").exists() and (folder / "src").exists()),
        None,
    )
    if match is None:
        raise AssertionError("Project folders missing: expected 'config' and 'src' somewhere above the notebook.")
    return match

# Resolve the project root once, starting the upward search at the kernel's working directory.
ROOT = find_project_root(Path.cwd())
# Re-check the layout right here so a broken root fails in this cell with a short message.
assert all((ROOT / sub).exists() for sub in ("config", "src")), "Project folders missing."
print("Project root:", ROOT)  # Echo the resolved root so a wrong cwd is easy to spot.

# Standard output locations, one folder per artifact kind (paths are relative to ROOT).
ARTIFACT_DIRS: tuple[str, ...] = (
    "artifacts/data", "artifacts/models", "artifacts/forecasts",
    "artifacts/metrics", "artifacts/reports", "artifacts/tmp",
)
# parents/exist_ok make this loop safe to re-run: existing folders are left alone.
for rel in ARTIFACT_DIRS:
    ROOT.joinpath(rel).mkdir(parents=True, exist_ok=True)

# Record the core library versions in the cell output for reproducibility.
print(f"pandas {pd.__version__} | numpy {np.__version__}")

# Critical dependencies we rely on later.
def _smoke_imports(pkgs: Iterable[str]) -> None:
    """Import packages to fail fast if a dependency is missing.
    Args: pkgs: Iterable of package names to import.
    Raises: RuntimeError: If any import fails."""

    # Prefer early, explicit dependency failures.
    for pkg in pkgs:
        try:
            importlib.import_module(pkg)
            print(f"OK: {pkg}")
        except Exception as e:
            raise RuntimeError(
                f"Missing or broken dependency: {pkg}. Install it and retry."
            ) from e

# Fail fast on the third-party packages the later cells and notebooks rely on.
_smoke_imports(["yfinance", "pandas_datareader", "pyarrow"])

# Filesystem writability: write a tiny probe file under artifacts/tmp and always clean it up.
tmpfile = ROOT / "artifacts" / "tmp" / "write_check.txt"
try:
    tmpfile.write_text("ok", encoding="utf-8")
    print("FS write check: OK")
finally:
    # missing_ok=True replaces the exists()-then-unlink() pair: one call, and no
    # window in which the file can disappear between the check and the delete.
    tmpfile.unlink(missing_ok=True)

Project root: C:\Users\gamer\Desktop\AktienPrognose
pandas 2.3.1 | numpy 2.1.3
OK: yfinance
OK: pandas_datareader
OK: pyarrow
FS write check: OK


In [2]:
# Helpers
from pprint import pprint as pp 

def _load_config(cfg_path: Path) -> dict:
    """Load YAML config with crisp error messages.
    Args: cfg_path: Path to YAML configuration.
    Returns: Dict with configuration data (empty dict if file is blank).
    Raises:
        FileNotFoundError: If the path does not exist.
        RuntimeError: If YAML parsing fails."""

    # Centralized loader makes error handling uniform and testable.
    if not cfg_path.exists():
        raise FileNotFoundError(f"Config not found: {cfg_path}")
    try:
        with cfg_path.open("r", encoding="utf-8") as f:
            return yaml.safe_load(f) or {}
    except Exception as e:
        raise RuntimeError(f"Failed to parse YAML: {cfg_path}") from e


def _require_keys(obj: dict, keys: Iterable[str], ctx: str) -> None:
    """Fail fast if required keys are missing to avoid cryptic errors later.
    Args:
        obj: Mapping to validate.
        keys: Required key names.
    Raises: KeyError: If any key is missing."""

    # Guardrails make subsequent code simpler and reduce nested conditionals.
    for k in keys:
        if k not in obj:
            raise KeyError(f"Missing key '{k}' in {ctx}")


def _assert_guardrail_dates(ds_cfg: dict) -> None:
    """Enforce expected date window for this project.
    Args: ds_cfg: The 'dataset' sub-config.
    Raises: AssertionError: If dates differ from the guardrails."""

    # This project expects a fixed backtest window.
    print("Config window:", ds_cfg["start_date"], "→", ds_cfg["end_date"])
    assert ds_cfg["start_date"] == "2008-01-01"
    assert ds_cfg["end_date"] == "2025-06-30"
    print("Guardrails OK (2008-01-01 .. 2025-06-30)")

In [3]:
# Config: load the data configuration that drives the rest of the pipeline.
CFG_PATH: Path = ROOT.joinpath("config", "data_config.yaml")
cfg_data: dict = _load_config(CFG_PATH)

# Minimal schema checks, outermost level first so the nested lookup below is safe.
_require_keys(cfg_data, ("dataset",), "config/data_config.yaml")
_require_keys(cfg_data["dataset"], ("start_date", "end_date"), "config.dataset")

# Validate the configured window, then show the dataset section (sources included).
_assert_guardrail_dates(cfg_data["dataset"])
pp(cfg_data.get("dataset", {}))

Config window: 2008-01-01 → 2025-06-30
Guardrails OK (2008-01-01 .. 2025-06-30)
{'end_date': '2025-06-30',
 'equities': [{'agg': 'last',
               'name': 'SP500',
               'source': 'yahoo',
               'symbol': '^GSPC'}],
 'macro': [{'agg': 'last',
            'name': 'FedFundsRate',
            'series_id': 'FEDFUNDS',
            'source': 'fred'},
           {'agg': 'last',
            'name': 'CPI',
            'series_id': 'CPIAUCSL',
            'source': 'fred'},
           {'agg': 'last',
            'name': 'UnemploymentRate',
            'series_id': 'UNRATE',
            'source': 'fred'},
           {'agg': 'last',
            'name': 'VIX',
            'series_id': 'VIXCLS',
            'source': 'fred'},
           {'agg': 'mean',
            'name': 'EPU_US',
            'series_id': 'USEPUINDXM',
            'source': 'fred'},
           {'agg': 'mean',
            'name': 'FSI',
            'series_id': 'STLFSI4',
            'source': 'fred'},
       

In [4]:
# Download: Download only if the unified raw parquet is missing; otherwise keep things fast and reproducible.
RAW_PATH: Path = ROOT / "artifacts" / "data" / "raw_data.parquet"

if not RAW_PATH.exists():
    print("No raw_data.parquet found → starting download...")
    # Put the project root (not 'src' itself) on sys.path so the import below
    # resolves 'src' as a top-level package.
    src_path = str(ROOT)
    if src_path not in sys.path:  # Avoid duplicating sys.path entries across reruns.
        sys.path.append(src_path)

    try:
        from src.data.download_data import download_all_data
    except Exception as e:
        raise ImportError(
            "Cannot import 'download_all_data' from src.data.download_data. "
            "Verify that 'src' is present and importable."
        ) from e

    try:
        download_all_data(config_path=CFG_PATH, out_path=RAW_PATH)
    except Exception as e:
        raise RuntimeError(f"Data download failed: {e}") from e

    # Post-condition: the next cell reads RAW_PATH unconditionally, so fail here
    # with a clear message if the downloader returned without writing the file.
    # NOTE(review): presumably download_all_data writes to out_path — confirm.
    if not RAW_PATH.exists():
        raise RuntimeError(f"Data download finished but no file was written: {RAW_PATH}")
else:
    print("Raw data present:", RAW_PATH)

INFO: [EQ] SP500: Yahoo ^GSPC 2008-01-01..2025-06-30 (agg=last)


No raw_data.parquet found → starting download...


INFO: [MACRO] FedFundsRate: FRED FEDFUNDS 2008-01-01..2025-06-30 (agg=last)
INFO: [MACRO] CPI: FRED CPIAUCSL 2008-01-01..2025-06-30 (agg=last)
INFO: [MACRO] UnemploymentRate: FRED UNRATE 2008-01-01..2025-06-30 (agg=last)
INFO: [MACRO] VIX: FRED VIXCLS 2008-01-01..2025-06-30 (agg=last)
INFO: [MACRO] EPU_US: FRED USEPUINDXM 2008-01-01..2025-06-30 (agg=mean)
INFO: [MACRO] FSI: FRED STLFSI4 2008-01-01..2025-06-30 (agg=mean)
INFO: [MACRO] Gold_USD_oz: FRED GOLDAMGBD228NLBM 2008-01-01..2025-06-30 (agg=mean)
Response Text:
b'<!DOCTYPE html>\r\n<html lang="en">\r\n<head>\r\n    <meta charset="utf-8">\r\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\r\n    <meta name="viewport" content="width=device-width, initial-scale=1">\r\n    <title>Error - St. Louis Fed</title>\r\n    <meta name="description" content="">\r\n    <meta name="keywords" content="">    \r\n    <link rel="stylesheet" type="text/css" href="/assets/bootstrap/dist/css/bootstrap.min.css">\r\n    <link rel="stylesheet" t

In [5]:
# Integrity: Structural checks catch common leakage/index issues before modeling.
df = pd.read_parquet(RAW_PATH)

# Check index type and emptiness first: the date strings derived below are only
# meaningful for a non-empty DatetimeIndex (an empty index would yield "NaT").
assert isinstance(df.index, pd.DatetimeIndex), "Index must be DatetimeIndex."  # Time ops assume DatetimeIndex.
assert len(df.index) > 0, "Raw data is empty."

# Quick overview to anchor expectations (ISO 'YYYY-MM-DD' prefix of first/last timestamp).
min_idx, max_idx = str(df.index.min())[:10], str(df.index.max())[:10]
print("Shape:", df.shape, "| Range:", min_idx, "→", max_idx)

# Hard bounds to confirm leakage guard; ISO date strings order correctly as plain text.
assert min_idx >= "2008-01-01", "Index starts before allowed window."  # Prevent look-behind leakage.
assert max_idx <= "2025-06-30", "Index extends beyond allowed window."  # Prevent look-ahead leakage.

# Ensure deterministic ordering for reproducible downstream splits.
if not df.index.is_monotonic_increasing:
    df = df.sort_index()

# Quick visibility
print("Example columns:", list(df.columns)[:10])
na_rate = df.isna().mean().sort_values(ascending=False).head(8)
print("Top-8 missing rates:\n", na_rate)
df.tail(3)  # Display last few rows to quickly spot month-end alignment.

Shape: (210, 10) | Range: 2008-01-31 → 2025-06-30
Example columns: ['SP500', 'FedFundsRate', 'CPI', 'UnemploymentRate', 'VIX', 'EPU_US', 'FSI', 'Gold_USD_oz', 'WTI_Spot', 'USD_per_EUR']
Top-8 missing rates:
 EPU_US              0.014286
SP500               0.000000
FedFundsRate        0.000000
CPI                 0.000000
UnemploymentRate    0.000000
VIX                 0.000000
FSI                 0.000000
Gold_USD_oz         0.000000
dtype: float64


Unnamed: 0,SP500,FedFundsRate,CPI,UnemploymentRate,VIX,EPU_US,FSI,Gold_USD_oz,WTI_Spot,USD_per_EUR
2025-04-30,5569.060059,4.33,320.321,4.2,24.7,,0.04575,296.956668,63.54,1.1349
2025-05-31,5911.689941,4.33,320.58,4.2,18.57,,-0.55044,302.891427,62.17,1.1347
2025-06-30,6173.069824,4.33,321.5,4.1,16.73,,-0.737225,309.088421,68.17,1.177


In [6]:
# Results: render the project checklist as Markdown so progress is visible at a glance.
from IPython.display import Markdown, display

# The checklist is maintained by hand; tick items as the corresponding notebooks are completed.
_checklist_md = """
**Project Checklist**
- [x] Data loaded (`artifacts/data/raw_data.parquet`)
- [ ] Raw audit (`01_data_raw_audit.ipynb`)
- [ ] Build features (`02_build_features_monthly.ipynb`)
- [ ] EDA (`03_eda.ipynb`)
- [ ] Baselines (`10_train_eval_baselines.ipynb`)
- [ ] Linear models (`20_train_eval_linear.ipynb`)
- [ ] ARIMA (`21_train_eval_arima.ipynb`)
- [ ] Random Forest (`22_train_eval_random_forest.ipynb`)
- [ ] LSTM (`23_train_eval_lstm.ipynb`)
- [ ] Ensembles (`90_ensembles.ipynb`)
- [ ] Final report (`99_report_reproducibility.ipynb`)
"""
display(Markdown(_checklist_md))


**Project Checklist**
- [x] Data loaded (`artifacts/data/raw_data.parquet`)
- [ ] Raw audit (`01_data_raw_audit.ipynb`)
- [ ] Build features (`02_build_features_monthly.ipynb`)
- [ ] EDA (`03_eda.ipynb`)
- [ ] Baselines (`10_train_eval_baselines.ipynb`)
- [ ] Linear models (`20_train_eval_linear.ipynb`)
- [ ] ARIMA (`21_train_eval_arima.ipynb`)
- [ ] Random Forest (`22_train_eval_random_forest.ipynb`)
- [ ] LSTM (`23_train_eval_lstm.ipynb`)
- [ ] Ensembles (`90_ensembles.ipynb`)
- [ ] Final report (`99_report_reproducibility.ipynb`)
