In [None]:
# Environment smoke test: project layout, artifact folders, key dependencies, disk writes
from pathlib import Path
import sys, importlib, yaml

# The notebook is expected to be launched from the project root.
ROOT = Path.cwd()
layout_ok = (ROOT / "config").exists() and (ROOT / "src").exists()
assert layout_ok, "Project folders missing."
print("Project root:", ROOT)

# Create every output folder up front so later cells can write without checks.
for sub in ("data", "models", "forecasts", "metrics", "reports", "tmp"):
    ROOT.joinpath("artifacts", sub).mkdir(parents=True, exist_ok=True)

# Both versions are printed even though pandas pulls in numpy; useful when debugging.
import pandas as pd, numpy as np
print("pandas", pd.__version__, "| numpy", np.__version__)

# Import-only probe of the runtime dependencies; the first failure aborts the cell.
for pkg in ("yfinance", "pandas_datareader", "pyarrow"):
    try:
        importlib.import_module(pkg)
        print(f"OK: {pkg}")
    except Exception as e:
        raise RuntimeError(f"Missing/broken dependency: {pkg}. Install and retry.") from e

# Write and delete a throwaway file so permission problems surface here,
# not during a later parquet write.
probe = ROOT / "artifacts" / "tmp" / "write_check.txt"
probe.write_text("ok")
probe.unlink()
print("FS write check: OK")

In [None]:
# Load config and enforce guardrails
def _iso_day(value):
    """Return a config date as text for comparison against 'YYYY-MM-DD'.

    YAML loaders turn unquoted dates (e.g. ``start_date: 2008-01-01``) into
    date/datetime objects, which never compare equal to a string. Date-like
    values are rendered through ``isoformat()`` and cut to the day part;
    anything else is passed through ``str()`` unchanged, so string values keep
    the original strict comparison.
    """
    if hasattr(value, "isoformat"):
        return value.isoformat()[:10]
    return str(value)


cfg_path = ROOT/"config/data_config.yaml"
with open(cfg_path, "r", encoding="utf-8") as f:
    # An empty file parses to None; fall back to {} so the key checks below report it.
    cfg = yaml.safe_load(f) or {}

if "dataset" not in cfg:
    raise KeyError("Missing top-level 'dataset' in config/data_config.yaml")
if not isinstance(cfg["dataset"], dict):
    # e.g. a bare `dataset:` line parses to None; fail with a readable message
    # instead of the opaque TypeError a membership test on None would raise.
    raise TypeError("'dataset' in config/data_config.yaml must be a mapping")
for k in ["start_date", "end_date"]:
    if k not in cfg["dataset"]:
        raise KeyError(f"Missing dataset.{k} in config")

start, end = cfg["dataset"]["start_date"], cfg["dataset"]["end_date"]
print("Config window:", start, "→", end)
# Compare on the normalised text so quoted and unquoted YAML dates both pass.
assert _iso_day(start) == "2008-01-01" and _iso_day(end) == "2025-06-30", "Guardrails breached."

from pprint import pprint as pp
pp(cfg["dataset"])

In [None]:
# Load the raw dataset written by the upstream notebook and validate its index.
import pandas as pd
RAW = ROOT/"artifacts/data/raw_data.parquet"
if not RAW.exists():
    raise FileNotFoundError(f"Raw parquet not found: {RAW}. Run 00_index first.")
df = pd.read_parquet(RAW)
print("Shape:", df.shape)
# Validate the index type before formatting its bounds: with a non-datetime
# index the [:10] slice below would print a misleading "Range" line first.
assert isinstance(df.index, pd.DatetimeIndex), "Index must be DatetimeIndex."
# str(Timestamp)[:10] keeps only the YYYY-MM-DD part.
print("Range:", str(df.index.min())[:10], "→", str(df.index.max())[:10])

In [None]:
# Ensure a proper monthly grid (month-end)
# Build the expected month-end calendar between the first and last observation.
# An explicit MonthEnd offset is used instead of the "M" alias, which newer
# pandas releases deprecate in favour of "ME"; the offset object works on both.
full_idx = pd.date_range(start=df.index.min(), end=df.index.max(), freq=pd.offsets.MonthEnd())
# Month-ends that should exist in the data but are absent from its index.
missing = full_idx.difference(df.index)
print(f"Monthly index check: expected={len(full_idx)} | actual={len(df.index)} | missing={len(missing)}")
if len(missing) > 0:
    # Show at most the first 12 gaps to keep the output short.
    print("Missing months:", missing[:12], "..." if len(missing)>12 else "")

In [None]:
# Fraction of missing values per column, worst offenders first.
missing_share = df.isna().mean()
na_rate = missing_share.sort_values(ascending=False)
print("Top-10 missing rates:\n", na_rate.iloc[:10])
print("Columns:", [*df.columns])
# Last expression: display the three most recent rows.
df.iloc[-3:]

In [None]:
# Soft plausibility scan: out-of-range values only produce warnings, never a hard stop
import math

def warn_if(cond, msg):
    """Print ``msg`` with a ``WARN:`` prefix when ``cond`` is truthy; otherwise do nothing."""
    if not cond:
        return
    print("WARN:", msg)

# (lower, upper) bounds per column. They are deliberately wide: the aim is to
# catch gross data errors, not to model realistic ranges.
colmap = dict(
    FedFundsRate=(-1.0, 25.0),        # percent
    UnemploymentRate=(0.0, 25.0),     # percent
    VIX=(0.0, 120.0),                 # index level
    USD_per_EUR=(0.5, 2.0),           # USD per EUR
    FSI=(-10.0, 10.0),                # stress index
    EPU_US=(0.0, 1000.0),             # uncertainty index
    WTI_Spot=(0.0, 250.0),            # USD/barrel
    # NOTE(review): gold reportedly traded above 3000 USD/oz in 2025 — confirm
    # this ceiling does not flag valid recent observations.
    Gold_USD_oz=(200.0, 3000.0),      # USD/oz
    CPI=(50.0, 500.0),                # index level (CPIAUCSL)
    SP500=(100.0, 10000.0),           # index level (broad guard)
)

for col, (lower, upper) in colmap.items():
    if col not in df.columns:
        # Absent columns are reported and skipped rather than treated as errors.
        print(f"INFO: Column not present (skip plausibility): {col}")
        continue
    observed = df[col].dropna()  # NaNs are ignored by the range checks
    warn_if(observed.lt(lower).any(),  f"{col}: values below {lower}")
    warn_if(observed.gt(upper).any(),  f"{col}: values above {upper}")
print("Plausibility scan done.")

In [None]:
# Per-column audit table (missing share, min, max) persisted as a CSV report.
def _column_stats(frame, name):
    """Return (missing share, minimum, maximum) for column ``name`` of ``frame``."""
    series = frame[name]
    return series.isna().mean(), series.min(), series.max()


_stats = [_column_stats(df, name) for name in df.columns]
audit = pd.DataFrame({
    "col": df.columns,
    "na_rate": [na for na, _lo, _hi in _stats],
    "min": [lo for _na, lo, _hi in _stats],
    "max": [hi for _na, _lo, hi in _stats],
})
out = ROOT / "artifacts" / "reports" / "audit_raw_summary.csv"
audit.to_csv(out, index=False)
print("Saved audit summary →", out)
# Last expression: display the first ten rows of the summary.
audit.head(10)