In [1]:
# Setup: Centralized, idempotent setup makes the notebook runnable top-to-bottom with predictable state.
from __future__ import annotations

from pathlib import Path
import sys
import importlib
import warnings
import logging
from typing import Iterable, Mapping

import yaml
import pandas as pd
import numpy as np
from pprint import pprint as pp

# Logging: INFO and above, rendered as "LEVEL: message"; this notebook logs under "raw_audit".
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
logger = logging.getLogger("raw_audit")

# NOTE(review): this installs a blanket "ignore" filter for every Python warning,
# which also hides library deprecation notices; consider narrowing it.
warnings.filterwarnings("ignore")

def find_project_root(start: Path) -> Path:
    """Walk upward from `start` until a directory holding both 'config' and 'src' is found.

    Args:
        start: Directory to begin the search from; it is resolved to an absolute path first.

    Returns:
        The first directory (checking the resolved `start`, then each of its parents
        in order) that contains entries named 'config' and 'src'.

    Raises:
        AssertionError: If no directory on that upward path contains both entries.
    """
    required = ("config", "src")
    origin = start.resolve()
    candidates = (origin, *origin.parents)
    match = next(
        (folder for folder in candidates if all((folder / name).exists() for name in required)),
        None,
    )
    if match is None:
        raise AssertionError("Project folders missing: expected 'config' and 'src' somewhere above the notebook.")
    return match

# Resolve the project root once, starting from the kernel's current working directory.
ROOT = find_project_root(Path.cwd())
# Re-check both marker folders under the resolved root before anything else uses it.
assert all((ROOT / marker).exists() for marker in ("config", "src")), "Project folders missing."
print("Project root:", ROOT)

# Output folders, relative to ROOT. Creating them is a no-op when they already exist,
# so this cell can be re-run safely.
ARTIFACT_DIRS: tuple[str, ...] = (
    "artifacts/data",
    "artifacts/models",
    "artifacts/forecasts",
    "artifacts/metrics",
    "artifacts/reports",
    "artifacts/tmp",
)
for rel in ARTIFACT_DIRS:
    ROOT.joinpath(rel).mkdir(parents=True, exist_ok=True)

def _smoke_imports(pkgs: Iterable[str]) -> None:
    for pkg in pkgs:
        try:
            importlib.import_module(pkg)
            print(f"OK: {pkg}")
        except Exception as e:
            raise RuntimeError(f"Missing/broken dependency: {pkg}. Install and retry.") from e

# Fail fast if any of these third-party packages cannot be imported.
_smoke_imports(["yfinance", "pandas_datareader", "pyarrow"])

# Write probe: confirm artifacts/tmp is writable. The probe file is removed in
# `finally`, so it does not linger even when the write itself fails part-way.
tmp = ROOT.joinpath("artifacts", "tmp", "write_check.txt")
try:
    tmp.write_text("ok", encoding="utf-8")
    print("FS write check: OK")
finally:
    if tmp.exists():
        tmp.unlink()

Project root: C:\Users\gamer\Desktop\AktienPrognose
OK: yfinance
OK: pandas_datareader
OK: pyarrow
FS write check: OK


In [2]:
# Helpers
def _load_yaml(cfg_path: Path) -> dict:    
    if not cfg_path.exists():
        raise FileNotFoundError(f"Config not found: {cfg_path}")
    try:
        with cfg_path.open("r", encoding="utf-8") as f:
            return yaml.safe_load(f) or {}
    except Exception as e:
        raise RuntimeError(f"Failed to parse YAML: {cfg_path}") from e


def _require_keys(obj: Mapping, keys: Iterable[str], ctx: str) -> None:
    for k in keys:
        if k not in obj:
            raise KeyError(f"Missing key '{k}' in {ctx}")


def _enforce_guardrails(start: str, end: str,
                        expected_start: str = "2008-01-01",
                        expected_end: str = "2025-06-30") -> None:
    print("Config window:", start, "→", end)
    if not (start == expected_start and end == expected_end):
        raise ValueError(
            f"Guardrails breached: expected {expected_start} .. {expected_end}, got {start} .. {end}"
        )
    print(f"Guardrails OK ({expected_start} .. {expected_end})")


def _ensure_monthly_index(df: pd.DataFrame) -> pd.DataFrame:
    """Return `df` with sorted DatetimeIndex and verify month-end alignment.
    Args: df: Input DataFrame.
    Returns: Sorted DataFrame.
    Raises:
        TypeError: If index is not DatetimeIndex.
        ValueError: If index contains duplicates."""
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError("Index must be DatetimeIndex.")
    if df.index.duplicated().any():
        raise ValueError("Index contains duplicate timestamps; fix upstream merge.")
    if not df.index.is_monotonic_increasing:
        df = df.sort_index()
    # Soft check: month-end anchor.
    if not pd.Series(df.index.is_month_end).all():
        logger.warning("Index is not fully month-end aligned; verify upstream resampling.")
    return df


def _monthly_grid_gap_report(df: pd.DataFrame) -> pd.DataFrame:
    """Compute expected month-end grid and report missing months.
    Args: df: Monthly DataFrame.
    Returns:
        DataFrame with columns ['expected_count','actual_count','missing_count'] and
        attaches 'missing_list' in the .attrs for quick inspection."""
    start, end = df.index.min(), df.index.max()
    full_idx = pd.date_range(start=start, end=end, freq="M")  # calendar month-end stamps
    missing = full_idx.difference(df.index)
    report = pd.DataFrame(
        {
            "expected_count": [len(full_idx)],
            "actual_count": [len(df.index)],
            "missing_count": [len(missing)],
        }
    )
    report.attrs["missing_list"] = missing
    return report


def _plausibility_scan(df: pd.DataFrame, ranges: Mapping[str, tuple[float, float]]) -> list[str]:
    """Soft plausibility checks for key columns; returns list of warning strings.
    Args:
        df: DataFrame to scan.
        ranges: Mapping column -> (low, high) soft bounds.
    Returns: List of warnings (strings). No exceptions are raised."""
    warns: list[str] = []
    for c, (lo, hi) in ranges.items():
        if c not in df.columns:
            warns.append(f"INFO: Column not present (skip plausibility): {c}")
            continue
        s = df[c].dropna()
        if (s < lo).any():
            warns.append(f"{c}: values below {lo}")
        if (s > hi).any():
            warns.append(f"{c}: values above {hi}")
    return warns


def _audit_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Create a compact audit table with NA rate and min/max per column.
    Args: df: Input DataFrame.
    Returns: DataFrame with columns ['col','na_rate','min','max']."""
    return pd.DataFrame(
        {
            "col": df.columns,
            "na_rate": [df[c].isna().mean() for c in df.columns],
            "min": [df[c].min() for c in df.columns],
            "max": [df[c].max() for c in df.columns],
        }
    )

In [3]:
# Config: read the data config, check the required keys, then enforce the date-window guardrails.
cfg_path = ROOT.joinpath("config", "data_config.yaml")
cfg = _load_yaml(cfg_path)
_require_keys(cfg, ["dataset"], "config/data_config.yaml")
_require_keys(cfg["dataset"], ["start_date", "end_date"], "config.dataset")

start = cfg["dataset"]["start_date"]
end = cfg["dataset"]["end_date"]
_enforce_guardrails(start, end)

# Pretty-print the dataset block for a visual check.
pp(cfg["dataset"])

Config window: 2008-01-01 → 2025-06-30
Guardrails OK (2008-01-01 .. 2025-06-30)
{'end_date': '2025-06-30',
 'equities': [{'agg': 'last',
               'name': 'SP500',
               'source': 'yahoo',
               'symbol': '^GSPC'}],
 'macro': [{'agg': 'last',
            'name': 'FedFundsRate',
            'series_id': 'FEDFUNDS',
            'source': 'fred'},
           {'agg': 'last',
            'name': 'CPI',
            'series_id': 'CPIAUCSL',
            'source': 'fred'},
           {'agg': 'last',
            'name': 'UnemploymentRate',
            'series_id': 'UNRATE',
            'source': 'fred'},
           {'agg': 'last',
            'name': 'VIX',
            'series_id': 'VIXCLS',
            'source': 'fred'},
           {'agg': 'mean',
            'name': 'EPU_US',
            'series_id': 'USEPUINDXM',
            'source': 'fred'},
           {'agg': 'mean',
            'name': 'FSI',
            'series_id': 'STLFSI4',
            'source': 'fred'},
       

In [4]:
# Load: read the monthly raw parquet from artifacts/data.
RAW = ROOT.joinpath("artifacts", "data", "raw_data.parquet")
if not RAW.exists():
    # Tell the reader what to do next instead of surfacing a bare missing-file error.
    raise FileNotFoundError(f"Raw parquet not found: {RAW}. Run the data download step first.")
df = pd.read_parquet(RAW)
print("Shape:", df.shape)
first_stamp, last_stamp = str(df.index.min()), str(df.index.max())
print("Range:", first_stamp[:10], "→", last_stamp[:10])

# Index validation: requires a DatetimeIndex, raises on duplicate timestamps
# (nothing is dropped), sorts if needed, and only logs a warning on non-month-end stamps.
df = _ensure_monthly_index(df)

Shape: (210, 10)
Range: 2008-01-31 → 2025-06-30


In [5]:
# Integrity: compare the index against a complete month-end grid.
grid = _monthly_grid_gap_report(df)
missing = grid.attrs["missing_list"]
expected_n, actual_n, missing_n = (
    int(grid[name][0]) for name in ("expected_count", "actual_count", "missing_count")
)
print(f"Monthly index check: expected={expected_n} | actual={actual_n} | missing={missing_n}")
if len(missing) > 0:
    # Show at most the first 12 missing stamps; "..." marks a truncated list.
    preview = list(missing[:12])
    print("Missing months (first 12):", preview, "..." if len(missing) > 12 else "")

# Share of missing values per column, highest first.
na_rate = df.isna().mean().sort_values(ascending=False)
print("Top-10 missing rates:\n", na_rate.head(10))
print("Columns:", list(df.columns))
df.tail(3)  # Last three rows as the cell's displayed result, for a visual spot-check.


Monthly index check: expected=210 | actual=210 | missing=0
Top-10 missing rates:
 EPU_US              0.014286
SP500               0.000000
FedFundsRate        0.000000
CPI                 0.000000
UnemploymentRate    0.000000
VIX                 0.000000
FSI                 0.000000
Gold_USD_oz         0.000000
WTI_Spot            0.000000
USD_per_EUR         0.000000
dtype: float64
Columns: ['SP500', 'FedFundsRate', 'CPI', 'UnemploymentRate', 'VIX', 'EPU_US', 'FSI', 'Gold_USD_oz', 'WTI_Spot', 'USD_per_EUR']


Unnamed: 0,SP500,FedFundsRate,CPI,UnemploymentRate,VIX,EPU_US,FSI,Gold_USD_oz,WTI_Spot,USD_per_EUR
2025-04-30,5569.060059,4.33,320.321,4.2,24.7,,0.045375,296.956668,63.54,1.1349
2025-05-31,5911.689941,4.33,320.58,4.2,18.57,,-0.5507,302.891427,62.17,1.1347
2025-06-30,6173.069824,4.33,321.5,4.1,16.73,,-0.73775,309.088421,68.17,1.177


In [6]:
# Plausibility: soft (low, high) bounds per column; findings are printed, this cell never hard-stops on them.
PLAUSIBILITY_RANGES: dict[str, tuple[float, float]] = {
    "FedFundsRate": (-1.0, 25.0),       # percent
    "UnemploymentRate": (0.0, 25.0),    # percent
    "VIX": (0.0, 120.0),                # index level
    "USD_per_EUR": (0.5, 2.0),          # USD per EUR
    "FSI": (-10.0, 10.0),               # stress index
    "EPU_US": (0.0, 1000.0),            # uncertainty index
    "WTI_Spot": (0.0, 250.0),           # USD/barrel
    # NOTE(review): the audit table in this notebook shows Gold_USD_oz between
    # ~74.6 and ~309.1, so the lower bound fires; confirm the series really is USD/oz.
    "Gold_USD_oz": (200.0, 3000.0),     # USD/oz
    "CPI": (50.0, 500.0),               # CPIAUCSL level (index)
    "SP500": (100.0, 10000.0),          # broad guard for index level
}

warnings_list = _plausibility_scan(df, PLAUSIBILITY_RANGES)
for w in warnings_list:
    # Only range violations (messages containing "values") get the WARN prefix.
    prefix = "WARN: " if "values" in w else ""
    print(prefix + w)
print("Plausibility scan done.")

WARN: Gold_USD_oz: values below 200.0
Plausibility scan done.


In [7]:
# Audit: persist the per-column summary (NA rate, min, max) as CSV under artifacts/reports.
audit = _audit_summary(df)
out = ROOT.joinpath("artifacts", "reports", "audit_raw_summary.csv")
try:
    audit.to_csv(out, index=False)
    print("Saved audit summary →", out)
except Exception as err:
    # Re-raise with the target path in the message; the original error stays chained.
    raise RuntimeError(f"Failed to write audit CSV to {out}: {err}") from err

audit.head(10)  # First rows of the summary as the cell's displayed result.

Saved audit summary → C:\Users\gamer\Desktop\AktienPrognose\artifacts\reports\audit_raw_summary.csv


Unnamed: 0,col,na_rate,min,max
0,SP500,0.0,735.090027,6173.069824
1,FedFundsRate,0.0,0.05,5.33
2,CPI,0.0,211.398,321.5
3,UnemploymentRate,0.0,3.4,14.8
4,VIX,0.0,9.51,59.89
5,EPU_US,0.014286,71.26214,350.45982
6,FSI,0.0,-0.8923,8.42854
7,Gold_USD_oz,0.0,74.598422,309.088421
8,WTI_Spot,0.0,16.55,133.88
9,USD_per_EUR,0.0,0.9783,1.5805


In [8]:
# Results: render the project checklist as Markdown.
# NOTE(review): this import sits in the last cell rather than with the other imports
# in the setup cell; consider moving it there.
from IPython.display import Markdown, display

checklist_md = """
**Project Checklist**
- [x] Data loaded (`artifacts/data/raw_data.parquet`)
- [x] Raw audit (`01_data_raw_audit.ipynb`)
- [ ] Build features (`02_build_features_monthly.ipynb`)
- [ ] EDA (`03_eda.ipynb`)
- [ ] Baselines (`10_train_eval_baselines.ipynb`)
- [ ] Linear models (`20_train_eval_linear.ipynb`)
- [ ] ARIMA (`21_train_eval_arima.ipynb`)
- [ ] Random Forest (`22_train_eval_random_forest.ipynb`)
- [ ] LSTM (`23_train_eval_lstm.ipynb`)
- [ ] Ensembles (`90_ensembles.ipynb`)
- [ ] Final report (`99_report_reproducibility.ipynb`)
"""
display(Markdown(checklist_md))


**Project Checklist**
- [x] Data loaded (`artifacts/data/raw_data.parquet`)
- [x] Raw audit (`01_data_raw_audit.ipynb`)
- [ ] Build features (`02_build_features_monthly.ipynb`)
- [ ] EDA (`03_eda.ipynb`)
- [ ] Baselines (`10_train_eval_baselines.ipynb`)
- [ ] Linear models (`20_train_eval_linear.ipynb`)
- [ ] ARIMA (`21_train_eval_arima.ipynb`)
- [ ] Random Forest (`22_train_eval_random_forest.ipynb`)
- [ ] LSTM (`23_train_eval_lstm.ipynb`)
- [ ] Ensembles (`90_ensembles.ipynb`)
- [ ] Final report (`99_report_reproducibility.ipynb`)
