In [1]:
# check to see if FRED key is working

import os, sys, requests, pandas as pd
from datetime import datetime
from getpass import getpass
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Resolve the key for this runtime: use the environment value when it is non-blank,
# otherwise prompt with hidden input so the key is never echoed into notebook output.
# The environment value is stripped as well, so a whitespace-only value counts as missing.
FRED_API_KEY = (os.getenv("FRED_API_KEY") or "").strip()
if not FRED_API_KEY:
    FRED_API_KEY = getpass("Paste your FRED API key (hidden): ").strip()
if not FRED_API_KEY:
    raise SystemExit("No FRED API key provided.")
# Publish the resolved key so code reading os.environ["FRED_API_KEY"] sees it
os.environ["FRED_API_KEY"] = FRED_API_KEY

# Endpoint of the FRED JSON observations API
FRED_OBS_API = "https://api.stlouisfed.org/fred/series/observations"

def get_observations(series_id: str,
                     start: str = "2020-01-01",
                     end: "str | None" = None,
                     timeout: int = 30) -> pd.DataFrame:
    """Fetch FRED observations; returns df indexed by date with a single <series_id> column.

    Args:
        series_id: FRED series identifier (e.g. "CPIAUCSL").
        start: First observation date to request, formatted "YYYY-MM-DD".
        end: Last observation date to request, formatted "YYYY-MM-DD". ``None``
            (the default) means today's date, resolved on every call so the
            default does not go stale in a long-running kernel.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        DataFrame sorted by date, indexed by ``date``, with one numeric column
        named ``series_id``. Rows whose date or value cannot be parsed are dropped.

    Raises:
        requests.RequestException: Transport failure or HTTP error status. The
            message is rebuilt without the request URL, because that URL
            contains the API key and callers print the exception text.
        RuntimeError: The response has no ``observations`` field or no rows.
    """
    if end is None:
        end = datetime.today().strftime("%Y-%m-%d")

    params = {
        "series_id": series_id,
        "file_type": "json",
        "observation_start": start,
        "observation_end": end,
        "api_key": os.environ["FRED_API_KEY"],
    }
    try:
        r = requests.get(FRED_OBS_API, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Re-raise the same exception type with a message that omits the URL/query string.
        raise type(exc)(f"Request to FRED failed for {series_id} ({type(exc).__name__})") from None

    if not r.ok:
        # Same condition raise_for_status() checks, but with a key-free message.
        try:
            body = r.json()
        except ValueError:
            body = None
        # NOTE(review): FRED error bodies are expected to carry "error_message"; falls back to the HTTP reason.
        detail = body.get("error_message") if isinstance(body, dict) else None
        raise requests.HTTPError(
            f"FRED returned HTTP {r.status_code} for {series_id}: {detail or r.reason}",
            response=r,
        )

    payload = r.json()
    if "observations" not in payload:
        raise RuntimeError(f"No 'observations' in response for {series_id}: {payload}")
    df = pd.DataFrame(payload["observations"])
    if df.empty:
        raise RuntimeError(f"No rows for {series_id} in {start}..{end}")

    # Keep only the two fields we need, then normalize/rename
    df = df.loc[:, ["date", "value"]].copy()
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    # Non-numeric placeholders become NaN here and are dropped on the next line
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df = df.dropna(subset=["date", "value"]).sort_values("date")
    df = df.set_index("date").rename(columns={"value": series_id})
    return df

def summarize(df: pd.DataFrame):
    """Print a one-line status for a single-series frame and display its last five rows.

    The series name is taken from the first column; the reported value is the
    first-column entry of the final row.
    """
    series_name = df.columns[0]
    row_count = len(df)
    last_day = df.index.max().date()
    last_value = df.iloc[-1, 0]
    print(f"[ok] {series_name:<12} rows={row_count:>5}  last={last_day}  value={last_value:.6f}")
    display(df.tail(5))
# Smoke-test one series per cadence (monthly / weekly / daily)
series_to_test = ["CPIAUCSL", "IC4WSA", "DCOILBRENTEU"]
start = "2020-01-01"
end = datetime.today().strftime("%Y-%m-%d")

# Stays True only if every series fetches and summarizes without raising
ok = True
for sid in series_to_test:
    try:
        df = get_observations(sid, start, end)
        summarize(df)
    except Exception as e:
        # Best-effort check: report the failure and continue with the remaining series
        ok = False
        print(f"[fail] {sid}: {e}", file=sys.stderr)

if ok:
    print("\n✅ FRED key works.")
else:
    print("\n⚠️ Some series failed — check key, dates, or IDs.")

[ok] CPIAUCSL     rows=   69  last=2025-09-01  value=324.368000


Unnamed: 0_level_0,CPIAUCSL
date,Unnamed: 1_level_1
2025-05-01,320.58
2025-06-01,321.5
2025-07-01,322.132
2025-08-01,323.364
2025-09-01,324.368


[ok] IC4WSA       rows=  299  last=2025-09-20  value=237500.000000


Unnamed: 0_level_0,IC4WSA
date,Unnamed: 1_level_1
2025-08-23,228500
2025-08-30,230750
2025-09-06,240750
2025-09-13,240250
2025-09-20,237500


[ok] DCOILBRENTEU rows= 1484  last=2025-11-11  value=63.860000


Unnamed: 0_level_0,DCOILBRENTEU
date,Unnamed: 1_level_1
2025-11-05,63.54
2025-11-06,63.41
2025-11-07,63.72
2025-11-10,63.01
2025-11-11,63.86



✅ FRED key works.


In [None]:
# File: daily_nowcast_model.py
"""
Daily Cleveland CPI-nowcast model:
- Builds a daily dataset from Cleveland "Table View" CSVs (one file per month).
- Merges FRED daily/weekly features (Brent, Gas, Claims), forward-filling sparse series.
- Predicts next-day change in nowcast (Δ), avoiding lookahead.
Requires: pandas, numpy, scikit-learn, requests
Env: FRED_API_KEY must be set for FRED JSON API (falls back to fredgraph.csv if missing).
"""

from __future__ import annotations

import os
import io
import re
import json
import math
import glob
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, Tuple, List, Dict

import numpy as np
import pandas as pd
import requests
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# ----------------------------
# Config
# ----------------------------

@dataclass(frozen=True)
class Config:
    """Immutable run settings for the daily nowcast model.

    The default expressions below are evaluated once, when this class body is
    executed; ``fred_api_key`` and ``end_date`` therefore reflect the
    environment and the calendar date at that moment, not at a later
    ``Config()`` call.
    """
    # FRED API key; None or empty makes fred_series skip the JSON API and use the CSV download
    fred_api_key: Optional[str] = os.getenv("FRED_API_KEY")  # set in your env for authenticated pulls
    # Directory globbed for "*.csv" by load_cleveland_daily
    cleveland_src_dir: str = "data/nowcast"                  # folder with monthly "Table View" CSVs
    # Directory where run() writes daily_metrics.json and daily_latest.json
    out_dir: str = "artifacts_daily"
    # Lower bound ("YYYY-MM-DD") of the daily calendar built in build_daily_dataset
    start_date: str = "2020-01-01"                           # adjust if you have older Cleveland files
    # Upper bound ("YYYY-MM-DD") of the daily calendar; defaults to the date the class was defined
    end_date: str = datetime.today().strftime("%Y-%m-%d")

# Module-wide settings instance read by build_daily_dataset() and run()
CFG = Config()

# FRED JSON observations endpoint (used when an API key is available)
FRED_OBS_API = "https://api.stlouisfed.org/fred/series/observations"
# Keyless CSV download endpoint used as the fallback in fred_series
FREDGRAPH_CSV = "https://fred.stlouisfed.org/graph/fredgraph.csv"


# ----------------------------
# FRED loader (API first, CSV fallback)
# ----------------------------

def fred_series(series_id: str, start: str, end: str, api_key: Optional[str]) -> pd.DataFrame:
    """Load one FRED series as a date-indexed, single-column DataFrame.

    The JSON API is tried first when ``api_key`` is truthy. If that request
    fails (HTTP error, transport error, undecodable body) or returns no rows,
    a warning is issued and the keyless fredgraph CSV download is used instead.

    Args:
        series_id: FRED series identifier; also the name of the returned column.
        start: Inclusive lower date bound, "YYYY-MM-DD".
        end: Inclusive upper date bound, "YYYY-MM-DD".
        api_key: FRED API key, or None/empty to go straight to the CSV fallback.

    Returns:
        DataFrame sorted by date, index named ``date``, one numeric column
        named ``series_id``; rows with unparseable dates or values are dropped.

    Raises:
        requests.RequestException: The CSV fallback request itself failed.
        ValueError: The CSV fallback does not have the expected columns.
    """
    # Why: prefer API; fallback keeps dev flow if key missing.
    if api_key:
        params = {"series_id": series_id, "file_type": "json",
                  "observation_start": start, "observation_end": end,
                  "api_key": api_key}
        try:
            r = requests.get(FRED_OBS_API, params=params, timeout=30)
            r.raise_for_status()
            obs = r.json().get("observations", [])
        except requests.HTTPError as e:
            # Report only the status: the exception text contains the URL, which includes the API key.
            status = getattr(e.response, "status_code", "unknown")
            warnings.warn(f"FRED API error for {series_id}: HTTP {status}; using csv fallback.")
        except (requests.RequestException, ValueError) as e:
            # Timeouts, connection failures and undecodable JSON also fall back instead of crashing.
            warnings.warn(f"FRED API error for {series_id}: {type(e).__name__}; using csv fallback.")
        else:
            df = pd.DataFrame(obs)
            if not df.empty:
                # .copy() so the column assignments below do not write into a slice view
                df = df.loc[:, ["date", "value"]].copy()
                df["date"] = pd.to_datetime(df["date"], errors="coerce")
                df["value"] = pd.to_numeric(df["value"], errors="coerce")
                df = df.dropna().sort_values("date").set_index("date")
                df.columns = [series_id]
                return df
            warnings.warn(f"FRED API returned no rows for {series_id}; using csv fallback.")

    r = requests.get(FREDGRAPH_CSV, params={"id": series_id}, timeout=30)
    r.raise_for_status()
    raw = pd.read_csv(io.StringIO(r.text))
    # NOTE(review): "DATE" is accepted as a legacy header name alongside "observation_date" — confirm.
    date_col = next((c for c in ("observation_date", "DATE") if c in raw.columns), None)
    if date_col is None or series_id not in raw.columns:
        raise ValueError(f"Unexpected fredgraph format for {series_id}")
    raw[date_col] = pd.to_datetime(raw[date_col], errors="coerce")
    raw[series_id] = pd.to_numeric(raw[series_id], errors="coerce")
    # The CSV download is not date-filtered by the request, so clip to [start, end] here
    mask = (raw[date_col] >= pd.to_datetime(start)) & (raw[date_col] <= pd.to_datetime(end))
    df = raw.loc[mask, [date_col, series_id]].dropna().sort_values(date_col)
    df = df.set_index(date_col)
    # Same index name as the API path, so both sources yield identically shaped frames
    df.index.name = "date"
    return df


# ----------------------------
# Cleveland daily nowcast loader (directory of monthly CSVs)
# ----------------------------

def _infer_year_month_from_name(path: str) -> Optional[Tuple[int, int]]:
    m = re.search(r"(20\d{2})[-_ ]?(\d{1,2})", os.path.basename(path))
    if not m:
        return None
    return int(m.group(1)), int(m.group(2))

def _pick_cpi_column(cols: List[str]) -> Optional[str]:
    for c in cols:
        lc = str(c).lower().replace(" ", "")
        if "cpi" in lc and "core" not in lc:
            return c
    return None

def _parse_cleveland_month_csv(path: str) -> Optional[pd.DataFrame]:
    """Parse one monthly Cleveland "Table View" CSV into a daily nowcast frame.

    The file must have a ``Label`` (or ``Date``) column holding ``MM/DD``
    strings and a CPI column as chosen by ``_pick_cpi_column``. The year is
    taken from the file name. Labels more than six months away from the file's
    own month are assumed to lie across a year boundary (e.g. ``01/10`` in a
    December file) and are assigned to the adjacent year.

    Args:
        path: Path to the CSV file; its base name must contain YYYY and MM.

    Returns:
        DataFrame indexed by ``date`` with a single ``cpi_mom_nowcast`` column,
        or None (after a warning) when the file is unreadable, lacks the
        required columns, or its name carries no year/month.
    """
    try:
        df = pd.read_csv(path)
    except Exception as e:
        warnings.warn(f"Failed to read {path}: {e}")
        return None

    label_col = next((c for c in df.columns if str(c).strip().lower() in {"label", "date"}), None)
    if not label_col:
        warnings.warn(f"No 'Label' column in {path}")
        return None
    cpi_col = _pick_cpi_column(list(df.columns))
    if not cpi_col:
        warnings.warn(f"No CPI MoM col in {path}")
        return None

    ym = _infer_year_month_from_name(path)
    if not ym:
        warnings.warn(f"Cannot infer YYYY-MM from {path}")
        return None
    year, month = ym

    def to_dt(s: str):
        """Turn an "MM/DD" label into a datetime, or None if it cannot be parsed."""
        s = str(s).strip()
        if not s or s.lower() == "nan":
            return None
        try:
            mm, dd = (int(part) for part in s.split("/"))
            label_year = year
            # Pick the year that puts the label closest to the file's month, so rows
            # that spill across New Year are not dated twelve months off.
            # NOTE(review): assumes a file never spans more than six months — confirm.
            if 1 <= month <= 12:
                if mm - month > 6:
                    label_year = year - 1
                elif month - mm > 6:
                    label_year = year + 1
            return datetime(year=label_year, month=mm, day=dd)
        except Exception:
            # Malformed labels (wrong field count, non-numeric, impossible date) are skipped
            return None

    df["date"] = df[label_col].apply(to_dt)
    df = df.dropna(subset=["date"]).sort_values("date")
    df["cpi_mom_nowcast"] = pd.to_numeric(df[cpi_col], errors="coerce")
    df = df.dropna(subset=["cpi_mom_nowcast"])
    out = df[["date", "cpi_mom_nowcast"]].copy()
    out["date"] = pd.to_datetime(out["date"])
    return out.set_index("date")

def load_cleveland_daily(src_dir: str) -> pd.DataFrame:
    """Load every ``*.csv`` in ``src_dir`` and stitch them into one daily nowcast frame.

    Files are processed in sorted name order. When the same date appears more
    than once, the row from the later file wins.

    Args:
        src_dir: Directory containing the monthly Cleveland CSV files.

    Returns:
        DataFrame indexed by unique, ascending dates with the columns produced
        by ``_parse_cleveland_month_csv``.

    Raises:
        FileNotFoundError: ``src_dir`` contains no CSV files.
        RuntimeError: No file yielded any usable rows.
    """
    files = sorted(glob.glob(os.path.join(src_dir, "*.csv")))
    if not files:
        raise FileNotFoundError(f"No Cleveland CSV files in {src_dir}")
    parts = []
    for f in files:
        d = _parse_cleveland_month_csv(f)
        if d is not None and not d.empty:
            parts.append(d)
    if not parts:
        raise RuntimeError("No usable Cleveland files parsed.")
    df = pd.concat(parts)
    # Deduplicate dates (keep last) BEFORE sorting: concat preserves file order, whereas
    # the default sort is not stable and could reorder rows that share a date.
    df = df[~df.index.duplicated(keep="last")]
    return df.sort_index()


# ----------------------------
# Daily feature table
# ----------------------------

def daily_log_return(series: pd.Series) -> pd.Series:
    """Return the one-step log return of ``series`` in percent.

    Each element is ``100 * ln(value / previous value)``; the first element has
    no predecessor and is NaN.
    """
    previous = series.shift(1)
    ratio = series / previous
    return 100.0 * np.log(ratio)

def build_daily_dataset() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build the daily modelling table from the Cleveland nowcast and FRED series.

    Steps: load the nowcast, put it on a gap-free daily calendar clipped to
    ``CFG.start_date``..``CFG.end_date``, pull Brent, gasoline and jobless-claims
    series from FRED, forward-fill them onto the same calendar, derive lagged
    and rolling features, and attach the next-day nowcast change as the target.

    Returns:
        ``(dataset, nc)`` where ``dataset`` holds the feature columns plus the
        target ``y_next_delta`` (rows with any missing value removed, raw price
        levels dropped) and ``nc`` is the forward-filled daily nowcast with a
        ``cpi_mom_nowcast`` column.

    Raises:
        ValueError: The configured date window does not overlap the nowcast history.
    """
    # Target (daily nowcast)
    nc = load_cleveland_daily(CFG.cleveland_src_dir)
    cal_start = max(pd.to_datetime(CFG.start_date), nc.index.min())
    cal_end = min(pd.to_datetime(CFG.end_date), nc.index.max())
    if cal_start > cal_end:
        raise ValueError(
            f"No overlap between configured window {CFG.start_date}..{CFG.end_date} "
            f"and nowcast history {nc.index.min().date()}..{nc.index.max().date()}"
        )
    # Expand to full daily calendar, ffill gaps (weekends missing in Cleveland on some months)
    calendar = pd.date_range(cal_start, cal_end, freq="D")
    nc = nc.reindex(calendar).ffill()

    # FRED features, requested for exactly the calendar window
    fred_start = calendar.min().strftime("%Y-%m-%d")
    fred_end = calendar.max().strftime("%Y-%m-%d")
    brent = fred_series("DCOILBRENTEU", fred_start, fred_end,
                        CFG.fred_api_key).rename(columns={"DCOILBRENTEU": "brent"})
    gas = fred_series("GASREGW", fred_start, fred_end,
                      CFG.fred_api_key).rename(columns={"GASREGW": "gas"})
    claims_native = fred_series("IC4WSA", fred_start, fred_end,
                                CFG.fred_api_key).rename(columns={"IC4WSA": "claims4w"})

    # Change between consecutive claims observations, taken at the series' own cadence
    # BEFORE forward-filling; a diff of the daily-filled series would be zero on most days.
    claims_chg_native = claims_native["claims4w"].diff()

    # Reindex each to the daily calendar & ffill (why: weekends/weekly cadence).
    # Dates absent from a source series become NaN on reindex and then carry the most
    # recent observation forward, so every feature shares one daily index.
    brent = brent.reindex(calendar).ffill()
    gas = gas.reindex(calendar).ffill()
    claims = claims_native.reindex(calendar).ffill()
    claims_chg = claims_chg_native.reindex(calendar).ffill()

    # Feature engineering (lag everything to avoid lookahead)
    feats = pd.DataFrame(index=calendar)
    feats["nc"] = nc["cpi_mom_nowcast"]
    feats["nc_lag1"] = feats["nc"].shift(1)
    feats["nc_lag3"] = feats["nc"].shift(3)
    feats["nc_lag7"] = feats["nc"].shift(7)
    feats["nc_ma7"] = feats["nc"].rolling(7, min_periods=3).mean()
    feats["nc_std7"] = feats["nc"].rolling(7, min_periods=3).std()
    feats["nc_ma14"] = feats["nc"].rolling(14, min_periods=5).mean()

    feats["brent"] = brent["brent"]
    feats["brent_ret"] = daily_log_return(feats["brent"]).shift(1)   # lag 1
    feats["brent_ret_3"] = feats["brent_ret"].rolling(3, min_periods=1).sum()
    feats["brent_ret_7"] = feats["brent_ret"].rolling(7, min_periods=3).sum()

    feats["gas"] = gas["gas"]
    feats["gas_ret"] = daily_log_return(feats["gas"]).shift(1)
    feats["gas_ret_7"] = feats["gas_ret"].rolling(7, min_periods=3).sum()

    # Claims: level and observation-to-observation change, both lagged one day
    # like the other exogenous features.
    feats["claims4w"] = claims["claims4w"].shift(1)
    feats["claims_chg"] = claims_chg.shift(1)

    # Calendar features (position within month)
    dom = feats.index.day
    month_len = feats.index.daysinmonth
    feats["dom_sin"] = np.sin(2 * np.pi * dom / month_len)
    feats["dom_cos"] = np.cos(2 * np.pi * dom / month_len)

    # Target: next-day change in nowcast (Δ)
    y = feats["nc"].shift(-1) - feats["nc"]

    # Final cleanup
    X = feats.drop(columns=["nc"])  # we keep lags/rolls, not the contemporaneous 'nc'
    # NOTE(review): dropna removes the final calendar day (its target is unknown), so the
    # latest row callers see is one day before the end of the nowcast history — confirm intended.
    df = pd.concat([X, y.rename("y_next_delta")], axis=1).dropna()
    return df.drop(columns=["brent", "gas"]), nc  # drop raw price levels to keep it lean


# ----------------------------
# Modeling (walk-forward)
# ----------------------------

def train_eval_daily(df: pd.DataFrame) -> Dict:
    """Walk-forward evaluate ElasticNet and gradient boosting, then fit both on all rows.

    Args:
        df: Date-indexed table containing the target column ``y_next_delta``;
            every other column is used as a feature.

    Returns:
        Dict with three entries:
          - ``metrics``: mean MAE/RMSE per model across the time-series splits,
            plus split count, row count and the first/last date.
          - ``latest``: ``asof`` (last date in ``df``), ``delta_pred`` (gradient
            boosting prediction for the last row) and ``level_pred``, which is
            NaN here because the current nowcast level is not part of ``df``;
            the caller is expected to fill it in.
          - ``models``: the two pipelines fitted on all rows.
    """
    ycol = "y_next_delta"
    Xcols = [c for c in df.columns if c != ycol]
    X, y = df[Xcols], df[ycol]

    def make_pre() -> ColumnTransformer:
        # One scaler per pipeline: a shared transformer object would be re-fitted by
        # whichever pipeline was trained last, silently coupling the returned models.
        return ColumnTransformer([("num", StandardScaler(), Xcols)], remainder="drop")

    enet = Pipeline([("pre", make_pre()),
                     ("m", ElasticNet(alpha=0.02, l1_ratio=0.15, max_iter=5000, random_state=42))])
    gbr = Pipeline([("pre", make_pre()),
                    ("m", GradientBoostingRegressor(random_state=42, n_estimators=600, max_depth=3, learning_rate=0.03))])

    # Use smaller k if dataset is short
    n_splits = min(8, max(3, len(X) // 40))
    tscv = TimeSeriesSplit(n_splits=n_splits)

    enet_mae, enet_rmse, gbr_mae, gbr_rmse = [], [], [], []
    for tr, te in tscv.split(X):
        Xtr, Xte, ytr, yte = X.iloc[tr], X.iloc[te], y.iloc[tr], y.iloc[te]
        enet.fit(Xtr, ytr)
        p1 = enet.predict(Xte)
        gbr.fit(Xtr, ytr)
        p2 = gbr.predict(Xte)
        enet_mae.append(mean_absolute_error(yte, p1))
        enet_rmse.append(math.sqrt(mean_squared_error(yte, p1)))
        gbr_mae.append(mean_absolute_error(yte, p2))
        gbr_rmse.append(math.sqrt(mean_squared_error(yte, p2)))

    metrics = {
        "avg_enet_mae": float(np.mean(enet_mae)),
        "avg_enet_rmse": float(np.mean(enet_rmse)),
        "avg_gbr_mae": float(np.mean(gbr_mae)),
        "avg_gbr_rmse": float(np.mean(gbr_rmse)),
        "splits": n_splits,
        "rows": int(len(X)),
        "start": X.index.min().strftime("%Y-%m-%d"),
        "end": X.index.max().strftime("%Y-%m-%d"),
    }

    # Fit on all rows and predict the next-day delta from the most recent row
    enet.fit(X, y)
    gbr.fit(X, y)
    x_last = X.iloc[[-1]]
    delta_pred = float(gbr.predict(x_last)[0])
    return {
        "metrics": metrics,
        "latest": {
            "asof": X.index.max().strftime("%Y-%m-%d"),
            "delta_pred": delta_pred,
            # Level forecast = current nowcast + delta_pred; the nowcast level is not
            # available in this function, so the caller overwrites this NaN.
            "level_pred": float("nan"),
        },
        "models": {"enet": enet, "gbr": gbr}
    }


# ----------------------------
# Orchestrator
# ----------------------------

def run() -> None:
    """Build the dataset, train/evaluate the models, and write the JSON artifacts.

    Writes ``daily_metrics.json`` and ``daily_latest.json`` into ``CFG.out_dir``
    (created if missing) and prints the same information to stdout.
    """
    os.makedirs(CFG.out_dir, exist_ok=True)

    daily_df, nc = build_daily_dataset()
    first_day = daily_df.index.min().date()
    final_day = daily_df.index.max().date()
    feature_count = len(daily_df.columns) - 1  # every column except the target
    print(f"[data] rows={len(daily_df)} range={first_day}→{final_day} features={feature_count}")

    result = train_eval_daily(daily_df)

    # Level forecast = nowcast on the last modelled date + predicted delta
    last_date = daily_df.index.max()
    today_nowcast = float(nc.loc[last_date, "cpi_mom_nowcast"])
    latest = result["latest"]
    latest["level_pred"] = today_nowcast + latest["delta_pred"]

    # Persist metrics first, then the latest forecast
    artifacts = (
        ("daily_metrics.json", result["metrics"]),
        ("daily_latest.json", latest),
    )
    for filename, payload in artifacts:
        with open(os.path.join(CFG.out_dir, filename), "w") as f:
            json.dump(payload, f, indent=2)

    print(json.dumps(result["metrics"], indent=2))
    print("Latest forecast:", json.dumps(latest, indent=2))


if __name__ == "__main__":
    run()

FileNotFoundError: No Cleveland CSV files in data/nowcast