In [2]:
# Machine Learning Finance Notebook
# Combining Cleveland CPI Nowcast data and FRED economic indicators

import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

# Read a local .env file (if any) so later os.getenv lookups can see its values
load_dotenv()

# Banner plus the interpreter / library versions used for this run
print("üìä Setting up ML Finance notebook...")
print(
    f"Python version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    f"Numpy version: {np.__version__}",
    sep="\n",
)


üìä Setting up ML Finance notebook...
Python version: 3.13.1 (main, Dec  3 2024, 17:59:52) [Clang 16.0.0 (clang-1600.0.26.4)]
Pandas version: 2.3.3
Numpy version: 2.3.4


In [3]:
# Load Cleveland CPI Nowcast data
# Option 1: Load from saved CSV (if build_history has been run)
#
# The data directory is no longer tied to one machine: set RTQ_DATA_DIR
# (in the shell or in the .env file read by load_dotenv) to point elsewhere.
# When the variable is unset the original location is used, so existing
# setups behave exactly as before.
cleveland_data_dir = os.getenv(
    "RTQ_DATA_DIR",
    "/Users/eddiekayizzi/Downloads/RealTimeQuant/backend/data",
)
cleveland_csv_path = os.path.join(cleveland_data_dir, "cleveland_cpi_nowcast.csv")

try:
    cleveland_df = pd.read_csv(cleveland_csv_path)
    # Parse the 'month' column and use it as the index so the frame can be
    # aligned with other date-indexed data.
    cleveland_df['month'] = pd.to_datetime(cleveland_df['month'])
    cleveland_df = cleveland_df.set_index('month')
    print(f"‚úÖ Loaded Cleveland data from CSV: {len(cleveland_df)} rows")
    print(f"Date range: {cleveland_df.index.min()} to {cleveland_df.index.max()}")
    display(cleveland_df.head())
except FileNotFoundError:
    # Deliberately non-fatal: cleveland_df stays undefined and the merge cell
    # reports "Cleveland data not loaded".
    print("‚ö†Ô∏è Cleveland CSV not found. Run cleveland_data.ipynb first to generate the data.")
    print("Or uncomment the code below to call build_history() directly:")
    # from cleveland_data import build_history
    # cleveland_df = build_history(
    #     src_dir=cleveland_data_dir,
    #     out_csv=cleveland_csv_path
    # )


‚úÖ Loaded Cleveland data from CSV: 11 rows
Date range: 2025-01-31 00:00:00 to 2025-11-30 00:00:00


Unnamed: 0_level_0,cpi_mom_nowcast
month,Unnamed: 1_level_1
2025-01-31,0.242425
2025-02-28,0.230487
2025-03-31,0.028148
2025-04-30,0.217749
2025-05-31,0.125107


In [4]:
# Load FRED economic indicators
# Import the function from fredApi.ipynb or define it here

import requests
from getpass import getpass

# Load FRED API key
# Use the key already present in the environment when there is one;
# otherwise ask for it interactively without echoing the input.
FRED_API_KEY = os.getenv("FRED_API_KEY")
if not FRED_API_KEY:
    FRED_API_KEY = getpass("Paste your FRED API key (hidden): ").strip()
    if not FRED_API_KEY:
        raise SystemExit("No FRED API key provided.")
# Publish the key via the environment, which is where get_observations reads it
os.environ["FRED_API_KEY"] = FRED_API_KEY

# Endpoint for the FRED "series observations" request
FRED_OBS_API = "https://api.stlouisfed.org/fred/series/observations"

def get_observations(series_id: str,
                     start: str = "2020-01-01",
                     end: str = datetime.today().strftime("%Y-%m-%d"),
                     timeout: int = 30) -> pd.DataFrame:
    """Fetch FRED observations; returns df indexed by date with a single <series_id> column."""
    params = {
        "series_id": series_id,
        "file_type": "json",
        "observation_start": start,
        "observation_end": end,
        "api_key": os.environ["FRED_API_KEY"],
    }
    r = requests.get(FRED_OBS_API, params=params, timeout=timeout)
    r.raise_for_status()
    payload = r.json()
    if "observations" not in payload:
        raise RuntimeError(f"No 'observations' in response for {series_id}: {payload}")
    df = pd.DataFrame(payload["observations"])
    if df.empty:
        raise RuntimeError(f"No rows for {series_id} in {start}..{end}")

    # Keep only the two fields we need, then normalize/rename
    df = df.loc[:, ["date", "value"]].copy()
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df = df.dropna(subset=["date", "value"]).sort_values("date")
    df = df.set_index("date").rename(columns={"value": series_id})
    return df

print("‚úÖ FRED API function loaded")


‚úÖ FRED API function loaded


In [6]:
# Fetch FRED data for economic indicators
# CPIAUCSL     = Consumer Price Index
# IC4WSA       = Initial Claims (weekly)
# DCOILBRENTEU = Brent Crude Oil Price (daily)
# GASREGW      = gasoline price series (renamed to "gas" in a later cell) - TODO confirm exact FRED definition

start_date = "2020-01-01"
end_date = f"{datetime.today():%Y-%m-%d}"

series_to_fetch = ["CPIAUCSL", "IC4WSA", "DCOILBRENTEU", "GASREGW"]
fred_dataframes = dict()

print(f"Fetching FRED data from {start_date} to {end_date}...")
for series_id in series_to_fetch:
    # Best effort: a failure on one series is reported and the loop carries on
    try:
        df = get_observations(series_id, start_date, end_date)
        fred_dataframes[series_id] = df
        print(f"‚úÖ {series_id}: {len(df)} observations")
    except Exception as e:
        print(f"‚ùå {series_id}: {e}")

# Place the per-series frames side by side on the union of their dates;
# dates a series has no observation for show up as NaN in its column.
if not fred_dataframes:
    print("‚ö†Ô∏è No FRED data loaded")
else:
    fred_df = pd.concat(list(fred_dataframes.values()), axis=1)
    print(f"\nüìà Combined FRED DataFrame: {fred_df.shape}")
    display(fred_df.head())
    display(fred_df.tail())


Fetching FRED data from 2020-01-01 to 2025-11-15...
‚úÖ CPIAUCSL: 69 observations
‚úÖ IC4WSA: 299 observations
‚úÖ DCOILBRENTEU: 1484 observations
‚úÖ GASREGW: 306 observations

üìà Combined FRED DataFrame: (1824, 4)


Unnamed: 0_level_0,CPIAUCSL,IC4WSA,DCOILBRENTEU,GASREGW
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01,259.127,,,
2020-01-02,,,67.05,
2020-01-03,,,69.08,
2020-01-04,,226750.0,,
2020-01-06,,,70.25,2.578


Unnamed: 0_level_0,CPIAUCSL,IC4WSA,DCOILBRENTEU,GASREGW
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-11-05,,,63.54,
2025-11-06,,,63.41,
2025-11-07,,,63.72,
2025-11-10,,,63.01,3.056
2025-11-11,,,63.86,


In [None]:
# Swap the FRED series ids for short column names, then order rows by date
fred_df = (
    fred_df
    .rename(columns=dict(
        DCOILBRENTEU="brent",
        CPIAUCSL="cpi",
        IC4WSA="claims4w",
        GASREGW="gas",
    ))
    .sort_index()
)

fred_df

Unnamed: 0_level_0,cpi,claims,brent,gas
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01,259.127,,,
2020-01-02,,,67.05,
2020-01-03,,,69.08,
2020-01-04,,226750.0,,
2020-01-06,,,70.25,2.578
...,...,...,...,...
2025-11-05,,,63.54,
2025-11-06,,,63.41,
2025-11-07,,,63.72,
2025-11-10,,,63.01,3.056


In [8]:
# Merge Cleveland Nowcast data with FRED economic indicators
# This creates a combined dataset for machine learning

try:
    # Ensure we have both datasets
    if 'cleveland_df' not in locals() or cleveland_df.empty:
        raise ValueError("Cleveland data not loaded")
    if 'fred_df' not in locals() or fred_df.empty:
        raise ValueError("FRED data not loaded")

    # Cleveland data is monthly while the FRED frame mixes monthly, weekly and
    # daily series, so collapse FRED to one row per calendar month: the last
    # non-null value of each column, labelled with the month-end date.
    #
    # A MonthEnd offset object is passed instead of the 'M' string alias.
    # pandas >= 2.2 deprecates 'M' in favour of 'ME' (older releases reject
    # 'ME'); the offset object is accepted by both and raises no warning.
    fred_monthly = fred_df.resample(pd.offsets.MonthEnd()).last()

    # Inner join on the index: only months present in BOTH frames survive
    combined_df = pd.merge(
        cleveland_df,
        fred_monthly,
        left_index=True,
        right_index=True,
        how='inner'
    )

    print(f"‚úÖ Combined dataset created: {combined_df.shape}")
    print(f"Date range: {combined_df.index.min()} to {combined_df.index.max()}")
    print(f"\nColumns: {list(combined_df.columns)}")

    # Display summary statistics
    display(combined_df.describe())
    display(combined_df.head(10))

except Exception as e:
    # Deliberately non-fatal: report the problem and leave combined_df unset
    print(f"‚ùå Error merging datasets: {e}")
    print("Make sure both Cleveland and FRED data are loaded first.")


‚úÖ Combined dataset created: (11, 5)
Date range: 2025-01-31 00:00:00 to 2025-11-30 00:00:00

Columns: ['cpi_mom_nowcast', 'cpi', 'claims', 'brent', 'gas']


Unnamed: 0,cpi_mom_nowcast,cpi,claims,brent,gas
count,11.0,9.0,9.0,11.0,11.0
mean,0.219808,321.193444,228277.777778,69.456364,3.120545
std,0.09477,1.797437,8761.104065,5.282337,0.042241
min,0.028148,319.086,213500.0,63.37,3.035
25%,0.171701,319.775,223000.0,64.88,3.1105
50%,0.230487,320.58,226000.0,68.15,3.125
75%,0.275015,322.132,235250.0,74.095,3.1535
max,0.375968,324.368,241250.0,77.23,3.164


Unnamed: 0,cpi_mom_nowcast,cpi,claims,brent,gas
2025-01-31,0.242425,319.086,213500.0,77.11,3.103
2025-02-28,0.230487,319.775,226000.0,74.76,3.125
2025-03-31,0.028148,319.615,223000.0,77.23,3.162
2025-04-30,0.217749,320.321,226000.0,63.37,3.133
2025-05-31,0.125107,320.58,235250.0,64.32,3.16
2025-06-30,0.253573,321.5,241250.0,68.15,3.164
2025-07-31,0.160147,322.132,221250.0,73.43,3.123
2025-08-31,0.304576,323.364,230750.0,67.83,3.147
2025-09-30,0.375968,324.368,237500.0,68.52,3.118
2025-10-31,0.183255,,,65.44,3.035


In [None]:
# Data preparation for Machine Learning
# This is where you'll add your ML models

# Example: Create features and target
# Target: Cleveland CPI MoM nowcast (cpi_mom_nowcast)
# Features: FRED economic indicators

if 'combined_df' in locals() and not combined_df.empty:
    # The summary lives inside the guard: touching combined_df before checking
    # that it exists would raise NameError whenever the merge cell failed,
    # instead of reaching the warning in the else branch.
    print("üîß Data preparation for ML:")
    print(f"Dataset shape: {combined_df.shape}")
    print("Missing values per column:")
    print(combined_df.isnull().sum())

    # Separate target and features: every column other than the target is a feature
    target = 'cpi_mom_nowcast'
    features = [col for col in combined_df.columns if col != target]

    X = combined_df[features]  # Features
    y = combined_df[target]     # Target variable

    print(f"\nüìä Target variable: {target}")
    print(f"üìä Feature variables: {features}")
    print(f"\nX shape: {X.shape}")
    print(f"y shape: {y.shape}")

    # Correlation of every column with the target, strongest positive first
    # (the target itself is included, with correlation 1.0)
    print("\nüìà Correlation with target (cpi_mom_nowcast):")
    correlations = combined_df.corr()[target].sort_values(ascending=False)
    display(correlations)

    print("\n‚úÖ Data ready for machine learning!")
    print("üí° Next steps: Add your ML models below (e.g., regression, time series forecasting)")
else:
    print("‚ö†Ô∏è Combined dataset not available. Load data in previous cells first.")


In [None]:
# üöÄ Machine Learning Models Section
# Add your ML models here (e.g., Linear Regression, Random Forest, LSTM, etc.)

# Example: Simple Linear Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

if not ('combined_df' in locals() and not combined_df.empty and 'X' in locals() and 'y' in locals()):
    print("‚ö†Ô∏è X and y not defined. Run previous cells to prepare the data.")
else:
    # Keep only rows where every feature and the target are present
    mask = X.notnull().all(axis=1) & y.notnull()
    X_clean = X.loc[mask]
    y_clean = y.loc[mask]

    # NOTE(review): this only rejects zero rows; a very small number of clean
    # rows may still be too few for a meaningful split/score - confirm.
    # NOTE(review): X contains every non-target column, including 'cpi';
    # check that this does not leak the quantity being nowcast - confirm.
    if len(X_clean) == 0:
        print("‚ö†Ô∏è No clean data available for training (too many missing values)")
    else:
        # Unshuffled 80/20 split: row order is preserved
        X_train, X_test, y_train, y_test = train_test_split(
            X_clean, y_clean, test_size=0.2, random_state=42, shuffle=False
        )

        print(f"Training set: {X_train.shape[0]} samples")
        print(f"Test set: {X_test.shape[0]} samples")

        # Fit ordinary least squares on the training rows
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predictions for both partitions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Goodness of fit (R^2) and error magnitude (RMSE) per partition
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

        print(f"\nüìä Model Performance:")
        print(f"Train R¬≤: {train_r2:.4f}")
        print(f"Test R¬≤: {test_r2:.4f}")
        print(f"Train RMSE: {train_rmse:.4f}")
        print(f"Test RMSE: {test_rmse:.4f}")

        # Coefficients ranked by absolute size
        print(f"\nüìà Feature Importance (Linear Regression Coefficients):")
        coefficient_table = pd.DataFrame({
            'feature': X.columns,
            'coefficient': model.coef_
        })
        feature_importance = coefficient_table.sort_values(
            'coefficient', key=abs, ascending=False
        )
        display(feature_importance)

        # Predicted-vs-actual scatter per partition; the dashed red diagonal
        # marks perfect prediction
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        panels = [
            (axes[0], y_train, y_train_pred, 'Training Set: Predicted vs Actual'),
            (axes[1], y_test, y_test_pred, 'Test Set: Predicted vs Actual'),
        ]
        for ax, actual, predicted, title in panels:
            ax.scatter(actual, predicted, alpha=0.6)
            ax.plot([actual.min(), actual.max()], [actual.min(), actual.max()], 'r--', lw=2)
            ax.set_xlabel('Actual')
            ax.set_ylabel('Predicted')
            ax.set_title(title)
        fig.tight_layout()
        plt.show()
