In [None]:
"""
Daily Cleveland CPI-nowcast model:
- Builds a daily dataset from Cleveland "Table View" CSVs (one file per month).
- Merges FRED daily/weekly features (Brent, Gas, Claims), forward-filling sparse series.
- Predicts next-day change in nowcast (Δ), avoiding lookahead.
Requires: pandas, numpy, scikit-learn, requests
Env: FRED_API_KEY must be set for FRED JSON API (falls back to fredgraph.csv if missing).
"""

from __future__ import annotations

import os
import io
import re
import json
import math
import glob
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, Tuple, List, Dict

import numpy as np
import pandas as pd
import requests
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# ----------------------------
# Config
# ----------------------------

@dataclass(frozen=True)
class Config:
    fred_api_key: Optional[str] = os.getenv("FRED_API_KEY")  # set in your env for authenticated pulls
    cleveland_src_dir: str = "/Users/eddiekayizzi/Downloads/RealTimeQuant/backend/data"                  # folder with monthly "Table View" CSVs
    out_dir: str = "artifacts_daily"
    start_date: str = "2020-01-01"                           # adjust if you have older Cleveland files
    end_date: str = datetime.today().strftime("%Y-%m-%d")

CFG = Config()

FRED_OBS_API = "https://api.stlouisfed.org/fred/series/observations"
FREDGRAPH_CSV = "https://fred.stlouisfed.org/graph/fredgraph.csv"


# ----------------------------
# FRED loader (API first, CSV fallback)
# ----------------------------

def fred_series(series_id: str, start: str, end: str, api_key: Optional[str]) -> pd.DataFrame:
    # Why: prefer API; fallback keeps dev flow if key missing.
    if api_key:
        params = {"series_id": series_id, "file_type": "json",
                  "observation_start": start, "observation_end": end,
                  "api_key": api_key}
        try:
            r = requests.get(FRED_OBS_API, params=params, timeout=30)
            r.raise_for_status()
            payload = r.json()
            obs = payload.get("observations", [])
            df = pd.DataFrame(obs)
            if not df.empty:
                df = df.loc[:, ["date", "value"]]
                df["date"] = pd.to_datetime(df["date"], errors="coerce")
                df["value"] = pd.to_numeric(df["value"], errors="coerce")
                df = df.dropna().sort_values("date").set_index("date")
                df.columns = [series_id]
                return df
            warnings.warn(f"FRED API returned no rows for {series_id}; using csv fallback.")
        except requests.HTTPError as e:
            warnings.warn(f"FRED API error for {series_id}: {e}; using csv fallback.")

    r = requests.get(FREDGRAPH_CSV, params={"id": series_id}, timeout=30)
    r.raise_for_status()
    raw = pd.read_csv(io.StringIO(r.text))
    if "observation_date" not in raw.columns or series_id not in raw.columns:
        raise ValueError(f"Unexpected fredgraph format for {series_id}")
    raw["observation_date"] = pd.to_datetime(raw["observation_date"], errors="coerce")
    raw[series_id] = pd.to_numeric(raw[series_id], errors="coerce")
    mask = (raw["observation_date"] >= pd.to_datetime(start)) & (raw["observation_date"] <= pd.to_datetime(end))
    df = raw.loc[mask, ["observation_date", series_id]].dropna().sort_values("observation_date")
    return df.set_index("observation_date")


# ----------------------------
# Cleveland daily nowcast loader (directory of monthly CSVs)
# ----------------------------

def _infer_year_month_from_name(path: str) -> Optional[Tuple[int, int]]:
    m = re.search(r"(20\d{2})[-_ ]?(\d{1,2})", os.path.basename(path))
    if not m:
        return None
    return int(m.group(1)), int(m.group(2))

def _pick_cpi_column(cols: List[str]) -> Optional[str]:
    for c in cols:
        lc = str(c).lower().replace(" ", "")
        if "cpi" in lc and "core" not in lc:
            return c
    return None

def _parse_cleveland_month_csv(path: str) -> Optional[pd.DataFrame]:
    try:
        df = pd.read_csv(path)
    except Exception as e:
        warnings.warn(f"Failed to read {path}: {e}")
        return None

    label_col = next((c for c in df.columns if str(c).strip().lower() in {"label", "date"}), None)
    if not label_col:
        warnings.warn(f"No 'Label' column in {path}")
        return None
    cpi_col = _pick_cpi_column(list(df.columns))
    if not cpi_col:
        warnings.warn(f"No CPI MoM col in {path}")
        return None

    ym = _infer_year_month_from_name(path)
    if not ym:
        warnings.warn(f"Cannot infer YYYY-MM from {path}")
        return None
    year, month = ym

    def to_dt(s: str):
        s = str(s).strip()
        if not s or s.lower() == "nan":
            return None
        try:
            mm, dd = s.split("/")
            return datetime(year=year, month=int(mm), day=int(dd))
        except Exception:
            return None

    df["date"] = df[label_col].apply(to_dt)
    df = df.dropna(subset=["date"]).sort_values("date")
    df["cpi_mom_nowcast"] = pd.to_numeric(df[cpi_col], errors="coerce")
    df = df.dropna(subset=["cpi_mom_nowcast"])
    out = df[["date", "cpi_mom_nowcast"]].copy()
    out["date"] = pd.to_datetime(out["date"])
    return out.set_index("date")

def load_cleveland_daily(src_dir: str) -> pd.DataFrame:
    files = sorted(glob.glob(os.path.join(src_dir, "*.csv")))
    if not files:
        raise FileNotFoundError(f"No Cleveland CSV files in {src_dir}")
    parts = []
    for f in files:
        d = _parse_cleveland_month_csv(f)
        if d is not None and not d.empty:
            parts.append(d)
    if not parts:
        raise RuntimeError("No usable Cleveland files parsed.")
    df = pd.concat(parts).sort_index()
    # Deduplicate dates (keep last)
    df = df[~df.index.duplicated(keep="last")]
    return df


# ----------------------------
# Daily feature table
# ----------------------------

def daily_log_return(series: pd.Series) -> pd.Series:
    return 100.0 * np.log(series / series.shift(1))

def build_daily_dataset() -> Tuple[pd.DataFrame, pd.DataFrame]:
    # Target (daily nowcast)
    nc = load_cleveland_daily(CFG.cleveland_src_dir)
    # Expand to full daily calendar, ffill gaps (weekends missing in Cleveland on some months)
    calendar = pd.date_range(max(pd.to_datetime(CFG.start_date), nc.index.min()),
                             min(pd.to_datetime(CFG.end_date), nc.index.max()),
                             freq="D")
    nc = nc.reindex(calendar).ffill()

    # FRED features
    brent = fred_series("DCOILBRENTEU", calendar.min().strftime("%Y-%m-%d"),
                        calendar.max().strftime("%Y-%m-%d"), CFG.fred_api_key).rename(columns={"DCOILBRENTEU": "brent"})
    gas = fred_series("GASREGW", calendar.min().strftime("%Y-%m-%d"),
                      calendar.max().strftime("%Y-%m-%d"), CFG.fred_api_key).rename(columns={"GASREGW": "gas"})
    claims = fred_series("IC4WSA", calendar.min().strftime("%Y-%m-%d"),
                         calendar.max().strftime("%Y-%m-%d"), CFG.fred_api_key).rename(columns={"IC4WSA": "claims4w"})

    # Reindex each to daily calendar & ffill (why: weekends/weekly cadence)
    # "reindex(calendar)" aligns the brent DataFrame to the daily 'calendar' date range.
    # This expands brent to have a row for every day in 'calendar', even if brent originally only included, for example, business days or some weekly frequency from FRED.
    # Any missing dates (e.g., weekends, holidays) are filled with NaN, which are then forward-filled by .ffill().
    # This ensures brent has daily values matching the date index used for all other features,
    # so that feature engineering, rolling windows, and eventual model training all work day-by-day.
    brent = brent.reindex(calendar).ffill()
    gas = gas.reindex(calendar).ffill()
    claims = claims.reindex(calendar).ffill()

    # Feature engineering (lag everything to avoid lookahead)
    feats = pd.DataFrame(index=calendar)
    feats["nc"] = nc["cpi_mom_nowcast"]
    feats["nc_lag1"] = feats["nc"].shift(1)
    feats["nc_lag3"] = feats["nc"].shift(3)
    feats["nc_lag7"] = feats["nc"].shift(7)
    feats["nc_ma7"] = feats["nc"].rolling(7, min_periods=3).mean()
    feats["nc_std7"] = feats["nc"].rolling(7, min_periods=3).std()
    feats["nc_ma14"] = feats["nc"].rolling(14, min_periods=5).mean()

    feats["brent"] = brent["brent"]
    feats["brent_ret"] = daily_log_return(feats["brent"]).shift(1)   # lag 1
    feats["brent_ret_3"] = feats["brent_ret"].rolling(3, min_periods=1).sum()
    feats["brent_ret_7"] = feats["brent_ret"].rolling(7, min_periods=3).sum()

    feats["gas"] = gas["gas"]
    feats["gas_ret"] = daily_log_return(feats["gas"]).shift(1)
    feats["gas_ret_7"] = feats["gas_ret"].rolling(7, min_periods=3).sum()

    # Claims: weekly level + weekly change (computed on weekly series then ffilled)
    claims_weekly_change = claims["claims4w"].diff().where(claims.index.dayofweek == claims.index.dayofweek)  # placeholder calc
    feats["claims4w"] = claims["claims4w"].shift(1)  # lag 1 day
    feats["claims_chg"] = claims["claims4w"].diff()

    # Calendar features (position within month)
    dom = feats.index.day
    month_len = feats.index.daysinmonth
    feats["dom_sin"] = np.sin(2 * np.pi * dom / month_len)
    feats["dom_cos"] = np.cos(2 * np.pi * dom / month_len)

    # Target: next-day change in nowcast (Δ)
    y = feats["nc"].shift(-1) - feats["nc"]

    # Final cleanup
    X = feats.drop(columns=["nc"])  # we keep lags/rolls, not the contemporaneous 'nc'
    df = pd.concat([X, y.rename("y_next_delta")], axis=1).dropna()
    return df.drop(columns=["brent", "gas"]), nc  # drop raw price levels to keep it lean


# ----------------------------
# Modeling (walk-forward)
# ----------------------------

def train_eval_daily(df: pd.DataFrame) -> Dict:
    Xcols = [c for c in df.columns if c != "y_next_delta"]
    ycol = "y_next_delta"
    X, y = df[Xcols], df[ycol]

    pre = ColumnTransformer([("num", StandardScaler(), Xcols)], remainder="drop")
    enet = Pipeline([("pre", pre),
                     ("m", ElasticNet(alpha=0.02, l1_ratio=0.15, max_iter=5000, random_state=42))])
    gbr = Pipeline([("pre", pre),
                    ("m", GradientBoostingRegressor(random_state=42, n_estimators=600, max_depth=3, learning_rate=0.03))])

    # Use smaller k if dataset is short
    n_splits = min(8, max(3, len(X) // 40))
    tscv = TimeSeriesSplit(n_splits=n_splits)

    enet_mae, enet_rmse, gbr_mae, gbr_rmse = [], [], [], []
    for tr, te in tscv.split(X):
        Xtr, Xte, ytr, yte = X.iloc[tr], X.iloc[te], y.iloc[tr], y.iloc[te]
        enet.fit(Xtr, ytr); p1 = enet.predict(Xte)
        gbr.fit(Xtr, ytr);  p2 = gbr.predict(Xte)
        enet_mae.append(mean_absolute_error(yte, p1))
        enet_rmse.append(math.sqrt(mean_squared_error(yte, p1)))
        gbr_mae.append(mean_absolute_error(yte, p2))
        gbr_rmse.append(math.sqrt(mean_squared_error(yte, p2)))

    metrics = {
        "avg_enet_mae": float(np.mean(enet_mae)),
        "avg_enet_rmse": float(np.mean(enet_rmse)),
        "avg_gbr_mae": float(np.mean(gbr_mae)),
        "avg_gbr_rmse": float(np.mean(gbr_rmse)),
        "splits": n_splits,
        "rows": int(len(X)),
        "start": X.index.min().strftime("%Y-%m-%d"),
        "end": X.index.max().strftime("%Y-%m-%d"),
    }

    # Fit on all and forecast tomorrow (Δ and level)
    enet.fit(X, y)
    gbr.fit(X, y)
    x_last = X.iloc[[-1]]
    delta_pred = float(gbr.predict(x_last)[0])
    return {
        "metrics": metrics,
        "latest": {
            "asof": X.index.max().strftime("%Y-%m-%d"),
            "delta_pred": delta_pred,
            # Level forecast for tomorrow = today_nowcast + delta_pred
            "level_pred": float(df.loc[X.index.max(), "y_next_delta"] + df.loc[X.index.max(), "y_next_delta"].shift(1) if False else np.nan)  # placeholder
        },
        "models": {"enet": enet, "gbr": gbr}
    }




In [25]:

"""
Utilities to interpret your daily-nowcast model and forecast future days.

Drop this into the same environment where you ran the daily model.
Assumptions:
- You already built `daily_df` (features + y) and `nc` (Cleveland nowcast series).
- You already have `models` from training: {'enet': enet_pipeline, 'gbr': gbr_pipeline}.
"""

from __future__ import annotations
import numpy as np
import pandas as pd
from typing import Dict, Optional, Tuple
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math


# --- 2) Diagnostics -------------------------------------------------------

def model_diagnostics(daily_df: pd.DataFrame, preds_gbr: np.ndarray) -> Dict:
    """
    Compare model MAE/RMSE vs a naive baseline (predict Δ=0), and report drift.
    """
    y = daily_df["y_next_delta"].values
    naive = np.zeros_like(y)  # Δ=0 baseline
    return {
        "naive_mae": float(mean_absolute_error(y, naive)),
        "naive_rmse": float(math.sqrt(mean_squared_error(y, naive))),
        "model_mae": float(mean_absolute_error(y, preds_gbr)),
        "model_rmse": float(math.sqrt(mean_squared_error(y, preds_gbr))),
        "improvement_mae": float(mean_absolute_error(y, naive) - mean_absolute_error(y, preds_gbr)),
        "improvement_rmse": float(math.sqrt(mean_squared_error(y, naive)) - math.sqrt(mean_squared_error(y, preds_gbr))),
    }

def gbr_feature_importance(gbr_pipeline, feature_names: list[str]) -> pd.DataFrame:
    """
    Extract relative importances from GradientBoosting inside the pipeline.
    """
    gb = gbr_pipeline.named_steps["m"]
    importances = getattr(gb, "feature_importances_", None)
    if importances is None:
        return pd.DataFrame(columns=["feature", "importance"])
    imp = (
        pd.DataFrame({"feature": feature_names, "importance": importances})
        .sort_values("importance", ascending=False)
        .reset_index(drop=True)
    )
    return imp

# --- 3) Forecasts ---------------------------------------------------------

def predict_tomorrow(daily_df: pd.DataFrame,
                     nc: pd.DataFrame,
                     models: Dict[str, object]) -> Dict:
    """
    Predict tomorrow's Δ(nowcast) and level using the last available feature row.
    """
    Xcols = [c for c in daily_df.columns if c != "y_next_delta"]
    X = daily_df[Xcols]
    last_idx = X.index.max()
    x_last = X.loc[[last_idx]]
    delta_pred = float(models["gbr"].predict(x_last)[0])
    today_level = float(nc.loc[last_idx, "cpi_mom_nowcast"])
    return {
        "asof": str(last_idx.date()),
        "delta_pred": delta_pred,
        "level_today": today_level,
        "level_tomorrow_pred": today_level + delta_pred,
    }

def _apply_hold_last(exog_future: pd.DataFrame, last_row: pd.Series, cols: list[str]) -> pd.DataFrame:
    """
    When user doesn't provide exogenous future (brent_ret, gas_ret, claims...), hold last values.
    """
    out = exog_future.copy()
    for c in cols:
        if c not in out.columns:
            out[c] = last_row[c]
        out[c] = out[c].fillna(method="ffill").fillna(last_row[c])
    return out

def forecast_path(daily_df: pd.DataFrame,
                  nc: pd.DataFrame,
                  models: Dict[str, object],
                  days: int = 5,
                  exog_future: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """
    Multi-step simulation for the next `days` trading/calendar days.

    exog_future (optional): DataFrame indexed by future dates with any of:
      ['brent_ret','brent_ret_3','brent_ret_7','gas_ret','gas_ret_7','claims4w','claims_chg','dom_sin','dom_cos']
    Missing columns are forward-filled using the last observed values.
    """
    Xcols = [c for c in daily_df.columns if c != "y_next_delta"]
    hist = daily_df.copy()
    model = models["gbr"]

    # Seed with the last known day
    last_date = hist.index.max()
    future_idx = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=days, freq="D")

    # Build a template of future rows by repeating the last row
    last_row = hist.iloc[-1].copy()
    fut = pd.DataFrame([last_row.values] * days, columns=hist.columns, index=future_idx)

    # Recompute calendar features for future days
    fut["dom_sin"] = np.sin(2 * np.pi * fut.index.day / fut.index.daysinmonth)
    fut["dom_cos"] = np.cos(2 * np.pi * fut.index.day / fut.index.daysinmonth)

    # If the caller provided future exogenous paths, blend them in; else hold last values
    exog_cols = ["brent_ret","brent_ret_3","brent_ret_7","gas_ret","gas_ret_7","claims4w","claims_chg"]
    if exog_future is not None and not exog_future.empty:
        exog_future = exog_future.reindex(future_idx)
        fut[exog_cols] = _apply_hold_last(exog_future, last_row, exog_cols)[exog_cols]
    else:
        # nothing provided → keep last values
        pass

    # Simulate forward: update lags and moving averaging windows each step
    outputs = []
    current_nc = float(nc.loc[last_date, "cpi_mom_nowcast"])

    for dt in future_idx:
        # Use current history for lagged features
        # lags of 'nc'
        last_nc_series = pd.Series([np.nan,], index=[dt])  # placeholder to keep shape
        # recompute lag/rolling using the accumulating forecast path:
        # We need nc_lag1/3/7, ma7, std7, ma14 from the evolving nc series.
        # Build a temporary NC series combining historical nc with simulated levels so far.
        tmp_nc = nc["cpi_mom_nowcast"].copy()
        if outputs:
            sim_index = [o["date"] for o in outputs]
            sim_levels = [o["level"] for o in outputs]
            tmp_nc = pd.concat([tmp_nc, pd.Series(sim_levels, index=sim_index)])
        tmp_nc = tmp_nc.sort_index()

        # compute lags/rolls as of 'dt-1'
        lag1  = tmp_nc.shift(1).loc[dt]
        lag3  = tmp_nc.shift(3).loc[dt]
        lag7  = tmp_nc.shift(7).loc[dt]
        ma7   = tmp_nc.rolling(7, min_periods=3).mean().shift(0).loc[dt]
        std7  = tmp_nc.rolling(7, min_periods=3).std().shift(0).loc[dt]
        ma14  = tmp_nc.rolling(14, min_periods=5).mean().shift(0).loc[dt]

        fut.loc[dt, ["nc_lag1","nc_lag3","nc_lag7","nc_ma7","nc_std7","nc_ma14"]] = [lag1, lag3, lag7, ma7, std7, ma14]

        # Build X row and predict Δ
        x_row = fut.loc[[dt], Xcols]
        delta = float(model.predict(x_row)[0])
        current_nc = current_nc + delta  # level_t+1
        outputs.append({"date": dt, "delta": delta, "level": current_nc})

    out = pd.DataFrame(outputs).set_index("date")
    return out

# --- 4) Quick how-to ------------------------------------------------------

def quick_usage_example(daily_df, nc, models):
    """
    Example of using the helpers in a notebook:
    """
    # 1) Tomorrow
    tmr = predict_tomorrow(daily_df, nc, models)
    print("Tomorrow forecast:", tmr)

    # 2) 5-day hold-last simulation
    path = forecast_path(daily_df, nc, models, days=5)
    print(path)

In [26]:
# ----------------------------
# Orchestrator
# ----------------------------

def run() -> None:
    os.makedirs(CFG.out_dir, exist_ok=True)
    daily_df, nc = build_daily_dataset()
    print(f"[data] rows={len(daily_df)} range={daily_df.index.min().date()}→{daily_df.index.max().date()} features={len(daily_df.columns)-1}")
    result = train_eval_daily(daily_df)

    # Compute tomorrow's level forecast properly
    last_date = daily_df.index.max()
    today_nowcast = float(nc.loc[last_date, "cpi_mom_nowcast"])
    delta_pred = result["latest"]["delta_pred"]
    result["latest"]["level_pred"] = today_nowcast + delta_pred

    # Save
    with open(os.path.join(CFG.out_dir, "daily_metrics.json"), "w") as f:
        json.dump(result["metrics"], f, indent=2)
    with open(os.path.join(CFG.out_dir, "daily_latest.json"), "w") as f:
        json.dump(result["latest"], f, indent=2)

    print(json.dumps(result["metrics"], indent=2))
    print("Latest forecast:", json.dumps(result["latest"], indent=2))


if __name__ == "__main__":
    run()

[data] rows=670 range=2024-01-12→2025-11-11 features=15
{
  "avg_enet_mae": 0.024680838270393756,
  "avg_enet_rmse": 0.052469487369654066,
  "avg_gbr_mae": 0.044793303067085316,
  "avg_gbr_rmse": 0.07287466004187598,
  "splits": 8,
  "rows": 670,
  "start": "2024-01-12",
  "end": "2025-11-11"
}
Latest forecast: {
  "asof": "2025-11-11",
  "delta_pred": -0.06989717042166743,
  "level_pred": 0.22656075178820156
}


In [27]:
# Build data & train
daily_df, nc = build_daily_dataset()
res = train_eval_daily(daily_df)
models = res["models"]
metrics = res["metrics"]

print(f"[data] rows={len(daily_df)} range={daily_df.index.min().date()}→{daily_df.index.max().date()} features={len(daily_df.columns)-1}")
print("[metrics]", metrics)

# 2) Diagnostics — compare to naive Δ=0
Xcols = [c for c in daily_df.columns if c != "y_next_delta"]
preds_gbr_in_sample = models["gbr"].predict(daily_df[Xcols])
print(model_diagnostics(daily_df, preds_gbr_in_sample))

# 3) Feature importance (what moved the model)
imp = gbr_feature_importance(models["gbr"], Xcols)
display(imp.head(10))

# 4) Predict tomorrow
print(predict_tomorrow(daily_df, nc, models))



[data] rows=670 range=2024-01-12→2025-11-11 features=15
[metrics] {'avg_enet_mae': 0.024680838270393756, 'avg_enet_rmse': 0.052469487369654066, 'avg_gbr_mae': 0.044793303067085316, 'avg_gbr_rmse': 0.07287466004187598, 'splits': 8, 'rows': 670, 'start': '2024-01-12', 'end': '2025-11-11'}
{'naive_mae': 0.018029366872091654, 'naive_rmse': 0.05647548412204237, 'model_mae': 0.010915146389023833, 'model_rmse': 0.022089272362885604, 'improvement_mae': 0.007114220483067821, 'improvement_rmse': 0.034386211759156765}


Unnamed: 0,feature,importance
0,nc_std7,0.137062
1,nc_ma14,0.135722
2,nc_ma7,0.119212
3,brent_ret_7,0.116482
4,nc_lag1,0.098819
5,brent_ret_3,0.093956
6,gas_ret,0.071586
7,dom_sin,0.049698
8,dom_cos,0.040493
9,brent_ret,0.039494


{'asof': '2025-11-11', 'delta_pred': -0.06989717042166743, 'level_today': 0.296457922209869, 'level_tomorrow_pred': 0.22656075178820156}


In [28]:
def predict_nowcast(
    # Required: Current nowcast level and historical values
    current_nowcast: float,
    nowcast_lag1: float,      # Yesterday's nowcast
    nowcast_lag3: float,      # 3 days ago
    nowcast_lag7: float,       # 7 days ago
    nowcast_ma7: float,        # 7-day moving average
    nowcast_std7: float,       # 7-day standard deviation
    nowcast_ma14: float,      # 14-day moving average
    
    # Required: Economic indicators
    brent_ret: float,         # Brent oil daily return (%)
    brent_ret_3: float,       # 3-day sum of Brent returns
    brent_ret_7: float,       # 7-day sum of Brent returns
    gas_ret: float,           # Gas daily return (%)
    gas_ret_7: float,         # 7-day sum of gas returns
    claims4w: float,          # 4-week initial claims
    claims_chg: float,        # Change in claims
    
    # Required: Date for calendar features
    date: str,                # Format: "YYYY-MM-DD"
    
    # Optional: Trained model (if None, will use the last trained model)
    model=None,
    
    # Optional: Return format
    return_level: bool = True  # If True, returns level_tomorrow_pred, else just delta
) -> dict:
    """
    Predict tomorrow's CPI nowcast change and level.
    
    Args:
        current_nowcast: Today's Cleveland CPI MoM nowcast value
        nowcast_lag1/3/7: Historical nowcast values (1, 3, 7 days ago)
        nowcast_ma7/ma14: Moving averages of nowcast
        nowcast_std7: Standard deviation of nowcast over 7 days
        brent_ret: Brent oil daily return (%)
        brent_ret_3/7: Sum of Brent returns over 3/7 days
        gas_ret: Gas daily return (%)
        gas_ret_7: Sum of gas returns over 7 days
        claims4w: 4-week initial claims value
        claims_chg: Change in claims
        date: Date string in "YYYY-MM-DD" format
        model: Trained model (GradientBoostingRegressor pipeline). If None, uses global model.
        return_level: If True, returns predicted level for tomorrow, else just delta
    
    Returns:
        Dictionary with prediction results:
        {
            'asof': date,
            'delta_pred': predicted change in nowcast,
            'level_today': current nowcast level,
            'level_tomorrow_pred': predicted nowcast for tomorrow (if return_level=True)
        }
    """
    import pandas as pd
    from datetime import datetime
    
    # Use provided model or get from global scope
    if model is None:
        # Try to get from global scope (assumes model was trained in same session)
        if 'models' not in globals():
            raise ValueError("No model provided and no trained model found. Train model first or pass model parameter.")
        model = globals()['models']['gbr']
    
    # Calculate calendar features
    dt = pd.to_datetime(date)
    dom = dt.day
    month_len = dt.daysinmonth
    dom_sin = np.sin(2 * np.pi * dom / month_len)
    dom_cos = np.cos(2 * np.pi * dom / month_len)
    
    # Build feature vector in the same order as training
    features = {
        'nc_lag1': nowcast_lag1,
        'nc_lag3': nowcast_lag3,
        'nc_lag7': nowcast_lag7,
        'nc_ma7': nowcast_ma7,
        'nc_std7': nowcast_std7,
        'nc_ma14': nowcast_ma14,
        'brent_ret': brent_ret,
        'brent_ret_3': brent_ret_3,
        'brent_ret_7': brent_ret_7,
        'gas_ret': gas_ret,
        'gas_ret_7': gas_ret_7,
        'claims4w': claims4w,
        'claims_chg': claims_chg,
        'dom_sin': dom_sin,
        'dom_cos': dom_cos,
    }
    
    # Convert to DataFrame (model expects DataFrame format)
    feature_df = pd.DataFrame([features], index=[dt])
    
    # Make prediction
    delta_pred = float(model.predict(feature_df)[0])
    
    # Build result
    result = {
        'asof': date,
        'delta_pred': delta_pred,
        'level_today': current_nowcast,
    }
    
    if return_level:
        result['level_tomorrow_pred'] = current_nowcast + delta_pred
    
    return result


# Alternative: Simplified version that accepts a dictionary
def predict_nowcast_from_dict(features_dict: dict, model=None, return_level: bool = True) -> dict:
    """
    Simplified version that accepts all features as a dictionary.
    
    Args:
        features_dict: Dictionary with all required features:
            {
                'current_nowcast': float,
                'nowcast_lag1': float,
                'nowcast_lag3': float,
                'nowcast_lag7': float,
                'nowcast_ma7': float,
                'nowcast_std7': float,
                'nowcast_ma14': float,
                'brent_ret': float,
                'brent_ret_3': float,
                'brent_ret_7': float,
                'gas_ret': float,
                'gas_ret_7': float,
                'claims4w': float,
                'claims_chg': float,
                'date': 'YYYY-MM-DD'
            }
        model: Trained model (optional)
        return_level: If True, returns predicted level for tomorrow
    
    Returns:
        Dictionary with prediction results
    """
    return predict_nowcast(
        current_nowcast=features_dict['current_nowcast'],
        nowcast_lag1=features_dict['nowcast_lag1'],
        nowcast_lag3=features_dict['nowcast_lag3'],
        nowcast_lag7=features_dict['nowcast_lag7'],
        nowcast_ma7=features_dict['nowcast_ma7'],
        nowcast_std7=features_dict['nowcast_std7'],
        nowcast_ma14=features_dict['nowcast_ma14'],
        brent_ret=features_dict['brent_ret'],
        brent_ret_3=features_dict['brent_ret_3'],
        brent_ret_7=features_dict['brent_ret_7'],
        gas_ret=features_dict['gas_ret'],
        gas_ret_7=features_dict['gas_ret_7'],
        claims4w=features_dict['claims4w'],
        claims_chg=features_dict['claims_chg'],
        date=features_dict['date'],
        model=model,
        return_level=return_level
    )


# Helper function to extract features from the latest row of daily_df
def get_latest_features_for_prediction(daily_df: pd.DataFrame, nc: pd.DataFrame) -> dict:
    """
    Extract the latest feature values from daily_df to use for prediction.
    This is useful for getting current values to pass to predict_nowcast.
    
    Args:
        daily_df: The daily dataset DataFrame
        nc: The Cleveland nowcast DataFrame
    
    Returns:
        Dictionary with all features needed for prediction
    """
    last_date = daily_df.index.max()
    last_row = daily_df.iloc[-1]
    current_nowcast = float(nc.loc[last_date, 'cpi_mom_nowcast'])
    
    return {
        'current_nowcast': current_nowcast,
        'nowcast_lag1': float(last_row['nc_lag1']),
        'nowcast_lag3': float(last_row['nc_lag3']),
        'nowcast_lag7': float(last_row['nc_lag7']),
        'nowcast_ma7': float(last_row['nc_ma7']),
        'nowcast_std7': float(last_row['nc_std7']),
        'nowcast_ma14': float(last_row['nc_ma14']),
        'brent_ret': float(last_row['brent_ret']),
        'brent_ret_3': float(last_row['brent_ret_3']),
        'brent_ret_7': float(last_row['brent_ret_7']),
        'gas_ret': float(last_row['gas_ret']),
        'gas_ret_7': float(last_row['gas_ret_7']),
        'claims4w': float(last_row['claims4w']),
        'claims_chg': float(last_row['claims_chg']),
        'date': str(last_date.date())
    }

In [33]:
# Example 1: Using the function with dynamic values from your data
# Get the latest features from your actual dataset
latest_features = get_latest_features_for_prediction(daily_df, nc)

# Use those dynamic values for prediction
prediction = predict_nowcast(
    current_nowcast=latest_features['current_nowcast'],
    nowcast_lag1=latest_features['nowcast_lag1'],
    nowcast_lag3=latest_features['nowcast_lag3'],
    nowcast_lag7=latest_features['nowcast_lag7'],
    nowcast_ma7=latest_features['nowcast_ma7'],
    nowcast_std7=latest_features['nowcast_std7'],
    nowcast_ma14=latest_features['nowcast_ma14'],
    brent_ret=latest_features['brent_ret'],
    brent_ret_3=latest_features['brent_ret_3'],
    brent_ret_7=latest_features['brent_ret_7'],
    gas_ret=latest_features['gas_ret'],
    gas_ret_7=latest_features['gas_ret_7'],
    claims4w=latest_features['claims4w'],
    claims_chg=latest_features['claims_chg'],
    date=latest_features['date'],
    model=models['gbr']  # or None to use global model
)


# Example 2: Using dictionary input (easier for API/frontend) - with dynamic values
# Extract latest features from your actual data
latest_features = get_latest_features_for_prediction(daily_df, nc)

# Use the dictionary directly
prediction = predict_nowcast_from_dict(latest_features, model=models['gbr'])


# Example 3: Alternative - Extract values directly from DataFrames
# Get the last row from daily_df and corresponding nowcast value
last_date = daily_df.index.max()
last_row = daily_df.iloc[-1]
current_nowcast = float(nc.loc[last_date, 'cpi_mom_nowcast'])

# Build features dictionary directly from DataFrames
features = {
    'current_nowcast': current_nowcast,
    'nowcast_lag1': float(last_row['nc_lag1']),
    'nowcast_lag3': float(last_row['nc_lag3']),
    'nowcast_lag7': float(last_row['nc_lag7']),
    'nowcast_ma7': float(last_row['nc_ma7']),
    'nowcast_std7': float(last_row['nc_std7']),
    'nowcast_ma14': float(last_row['nc_ma14']),
    'brent_ret': float(last_row['brent_ret']),
    'brent_ret_3': float(last_row['brent_ret_3']),
    'brent_ret_7': float(last_row['brent_ret_7']),
    'gas_ret': float(last_row['gas_ret']),
    'gas_ret_7': float(last_row['gas_ret_7']),
    'claims4w': float(last_row['claims4w']),
    'claims_chg': float(last_row['claims_chg']),
    'date': str(last_date.date())
}

prediction = predict_nowcast_from_dict(features, model=models['gbr'])
print(prediction)

{'asof': '2025-11-11', 'delta_pred': -0.06989717042166743, 'level_today': 0.296457922209869, 'level_tomorrow_pred': 0.22656075178820156}
