In [4]:
# File: scripts/build_cleveland_nowcast_history.py
"""
Build a monthly Cleveland CPI MoM expectation file from downloaded "Table View" CSVs.

Input:
  Directory of monthly CSVs you downloaded from:
    https://www.clevelandfed.org/indicators-and-data/inflation-nowcasting
  Example file names (any of these patterns are fine):
    data/nowcast/Month-Over-MonthPercentChange-2025-09.csv
    data/nowcast/MoM_2025-9.csv
    data/nowcast/cleveland_nowcast_2024-12.csv

Output:
  data/cleveland_cpi_nowcast.csv  with columns:
    month (month-end, YYYY-MM-DD), cpi_mom_nowcast (float, percent change)

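Usage:
  Command line (only if the script file defines a CLI entry point; none is defined in this cell):
    python scripts/build_cleveland_nowcast_history.py --src data/nowcast --out data/cleveland_cpi_nowcast.csv
  Notebook:
    build_history("data/nowcast", "data/cleveland_cpi_nowcast.csv")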
"""

from __future__ import annotations

import os
import re
import argparse
from pathlib import Path
from datetime import datetime
from typing import Optional, Tuple, List

import pandas as pd
import numpy as np

# --- helpers -------------------------------------------------------------

def _infer_year_month_from_name(file_path: str) -> Optional[Tuple[int, int]]:
    """
    Extract YYYY-MM or YYYY-M from filename.
    Returns (year, month) or None if not found.
    """
    name = os.path.basename(file_path)
    m = re.search(r"(20\d{2})[-_ ]?(\d{1,2})", name)
    if not m:
        return None
    year = int(m.group(1))
    month = int(m.group(2))
    return year, month

def _pick_cpi_column(columns: List[str]) -> Optional[str]:
    """
    Choose the non-core CPI MoM column (often 'CPI Inflation').
    """
    for c in columns:
        lc = c.lower().replace(" ", "")
        if "cpi" in lc and "core" not in lc:
            return c
    return None

def _parse_month_csv(path: str) -> Optional[pd.Series]:
    """
    Parse one monthly CSV (daily rows) into a one-element Series.

    The target month is taken from the file name. The value is the CPI MoM
    figure on the latest dated row that holds a numeric CPI value.

    Args:
        path: Path to a downloaded CSV. It needs a 'Label' or 'Date' column
            holding "MM/DD" labels and a non-core CPI column.

    Returns:
        Series with index = [month_end_timestamp] and value =
        cpi_mom_nowcast (float), or None (after printing a "[skip]" line)
        when the file cannot be used.
    """
    try:
        df = pd.read_csv(path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # One broken download should be skipped like any other unusable
        # file, not abort the whole build.
        print(f"[skip] {path}: unreadable CSV ({exc})")
        return None

    # Identify label/date column
    label_col = next(
        (c for c in df.columns if str(c).strip().lower() in {"label", "date"}),
        None,
    )
    if label_col is None:
        print(f"[skip] {path}: no 'Label'/'Date' column")
        return None

    cpi_col = _pick_cpi_column(list(df.columns))
    if not cpi_col:
        print(f"[skip] {path}: could not find CPI MoM column")
        return None

    ym = _infer_year_month_from_name(path)
    if not ym:
        print(f"[skip] {path}: could not infer YYYY-MM from filename")
        return None
    year, month = ym
    if not 1 <= month <= 12:
        # An out-of-range month would make the month-end Timestamp below raise.
        print(f"[skip] {path}: could not infer YYYY-MM from filename")
        return None

    def _to_dt(s: object) -> Optional[datetime]:
        """Turn an "MM/DD" label into a datetime near the target month, else None."""
        s = str(s).strip()
        if not s or s.lower() == "nan":
            return None
        # Label looks like "09/02" or "11/12"
        try:
            mm_txt, dd_txt = s.split("/")
            mm, dd = int(mm_txt), int(dd_txt)
            # Labels carry no year. A label month more than six months away
            # from the target month belongs to the adjacent year (January
            # rows in a December file, December rows in a January file);
            # stamping them with the file's year would sort them wrongly.
            gap = mm - month
            if gap < -6:
                label_year = year + 1
            elif gap > 6:
                label_year = year - 1
            else:
                label_year = year
            return datetime(year=label_year, month=mm, day=dd)
        except (ValueError, OverflowError):
            # Not "MM/DD", non-numeric parts, or an impossible calendar date.
            return None

    parsed_dates = df[label_col].apply(_to_dt)
    # Stable sort keeps file order for rows that share a date.
    df = (
        df.assign(date=parsed_dates)
        .dropna(subset=["date"])
        .sort_values("date", kind="stable")
    )
    if df.empty:
        print(f"[skip] {path}: no parseable 'MM/DD' labels")
        return None

    # last non-NaN CPI MoM value for the month
    vals = pd.to_numeric(df[cpi_col], errors="coerce").dropna()
    if vals.empty:
        print(f"[skip] {path}: all CPI values NaN")
        return None

    last_val = float(vals.iloc[-1])
    month_end = pd.Timestamp(year=year, month=month, day=1).to_period("M").to_timestamp("M")
    return pd.Series([last_val], index=[month_end])

# --- main ---------------------------------------------------------------

def build_history(src_dir: str, out_csv: str) -> pd.DataFrame:
    """
    Build the monthly nowcast history from every CSV in `src_dir`.

    Each "*.csv" directly inside `src_dir` is parsed with
    `_parse_month_csv`; unusable files are skipped. When several files map
    to the same month, the one that sorts last by path wins.

    Args:
        src_dir: Directory containing the downloaded monthly CSVs.
        out_csv: Destination CSV path. Missing parent directories are created.

    Returns:
        DataFrame with columns "month" (month-end timestamp) and
        "cpi_mom_nowcast" (float), sorted by month. The same table is
        written to `out_csv`.

    Raises:
        SystemExit: if `src_dir` holds no input CSVs, or none could be parsed.
    """
    src = Path(src_dir)
    out_path = Path(out_csv)
    out_resolved = out_path.resolve()

    # The output may live inside the source directory; never feed a previous
    # run's output back in as an input file.
    files = sorted(str(f) for f in src.glob("*.csv") if f.resolve() != out_resolved)
    if not files:
        raise SystemExit(f"No CSV files found in {src_dir}. Download monthly CSVs from the site first.")

    rows: List[pd.Series] = []
    for f in files:
        s = _parse_month_csv(f)
        if s is not None:
            rows.append(s)

    if not rows:
        raise SystemExit("No usable rows parsed. Check filenames and CSV structure.")

    hist = pd.concat(rows)
    # Files were processed in sorted path order, so keep="last" retains the
    # value from the last-sorting file for a duplicated month.
    hist = hist[~hist.index.duplicated(keep="last")].sort_index()
    out = pd.DataFrame({
        "month": hist.index,                # month-end datetime
        "cpi_mom_nowcast": hist.values,     # float %
    })

    # Fail early-proof the write: create the destination directory if needed.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(out_csv, index=False)
    print(f"[ok] wrote {len(out)} rows → {out_csv}")
    return out


# For Jupyter notebook usage, call build_history() directly in a new cell below

In [5]:
# Run build_history() on the downloaded monthly CSVs and show the result.
# NOTE(review): these are absolute, machine-specific paths — change them to
# match your own directories (ideally relative to a configurable data dir).
# NOTE(review): output_file sits inside src_directory, the directory that is
# scanned for *.csv inputs — consider writing the output somewhere else.

src_directory = "/Users/eddiekayizzi/Downloads/RealTimeQuant/backend/data"
output_file = "/Users/eddiekayizzi/Downloads/RealTimeQuant/backend/data/cleveland_cpi_nowcast.csv"

# Execute the build: writes output_file and returns the table as a DataFrame
result_df = build_history(src_directory, output_file)

# Display the result (one row per month: month-end date and CPI MoM value)
display(result_df)

[ok] wrote 11 rows → /Users/eddiekayizzi/Downloads/RealTimeQuant/backend/data/cleveland_cpi_nowcast.csv


Unnamed: 0,month,cpi_mom_nowcast
0,2025-01-31,0.242425
1,2025-02-28,0.230487
2,2025-03-31,0.028148
3,2025-04-30,0.217749
4,2025-05-31,0.125107
5,2025-06-30,0.253573
6,2025-07-31,0.160147
7,2025-08-31,0.304576
8,2025-09-30,0.375968
9,2025-10-31,0.183255
