# ML Signal Generator - Training Pipeline

This notebook demonstrates the complete pipeline for:
1. Downloading market data
2. Engineering features
3. Training ML models
4. Generating trading signals
5. Backtesting performance

In [None]:
import sys
import os
from pathlib import Path
from dotenv import load_dotenv

# Resolve the project root: when the kernel was started inside notebooks/,
# the root is its parent; otherwise the working directory is taken to be the root.
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == 'notebooks' else current_dir

# Load environment variables from the optional .env file at the project root
env_path = project_root / '.env'
if env_path.exists():
    load_dotenv(env_path)
    print("✓ Loaded environment variables from .env")
else:
    print("⚠️  No .env file found. Using defaults. Create .env from .env.example if needed.")

# Make the project's src/ directory importable.
# Guarded so that re-running this cell does not keep stacking duplicate
# entries at the front of sys.path.
src_path = project_root / 'src'
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Directory that receives the figures written by later cells
output_dir = project_root / 'outputs'
output_dir.mkdir(parents=True, exist_ok=True)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display  # For better DataFrame display in Jupyter

# Project-local modules; these resolve through the src/ entry added above
from features import download_data, engineer_features, prepare_features_for_training
from model import time_series_split, train_random_forest, train_xgboost, get_feature_importance, evaluate_model
from backtest import generate_signals, backtest_strategy, plot_equity_curve, plot_feature_importance

print(f"Project root: {project_root}")
print(f"Output directory: {output_dir}")

✓ Loaded environment variables from .env
Project root: c:\Users\Admin\Documents\Projects\ml-signal-generator
Output directory: c:\Users\Admin\Documents\Projects\ml-signal-generator\outputs


## 1. Download Data

In [None]:
# Configuration
import time  # used only for the back-off sleeps between download retries

TICKER = 'SPY'  # S&P 500 ETF
START_DATE = '2020-01-01'
END_DATE = '2024-01-01'

# API Configuration (loads from .env file, with fallback to defaults)
# Options: 'yfinance' (default, free but rate limited), 'alpha_vantage' (free with API key)
# NOTE: If Alpha Vantage gives errors, you can override here: API_SOURCE = 'yfinance'
API_SOURCE = os.getenv('API_SOURCE', 'yfinance')  # Load from .env or use default

# Get API key from .env based on source
if API_SOURCE == 'alpha_vantage':
    API_KEY = os.getenv('ALPHA_VANTAGE_API_KEY')
else:
    API_KEY = None  # yfinance doesn't need an API key

# Display API configuration
print(f"API Source: {API_SOURCE}")
if API_KEY:
    # Reveal at most the last 4 characters, and nothing at all for keys that
    # are 4 characters or shorter (otherwise the whole key would be printed).
    visible_tail = API_KEY[-4:] if len(API_KEY) > 4 else ''
    print(f"API Key: {'*' * (len(API_KEY) - len(visible_tail)) + visible_tail}")
else:
    print("API Key: Not set (using yfinance or not required)")
    if API_SOURCE == 'alpha_vantage':
        print(f"⚠️  WARNING: {API_SOURCE} requires an API key but none was found!")
        print("   Either add API key to .env or set API_SOURCE='yfinance'")

# Quick fix: If Alpha Vantage fails, uncomment the line below to force yfinance:
# API_SOURCE = 'yfinance'

# Instructions for setting up .env:
# 1. Copy .env.example to .env: cp .env.example .env
# 2. Edit .env and add your API keys
# 3. Get free API keys:
#    - Alpha Vantage: https://www.alphavantage.co/support/#api-key

# OPTION: Set to True to skip download and use sample data (for testing when rate limited)
USE_SAMPLE_DATA = False  # Set to True if you're rate limited and want to continue

# Try to load from saved file first (checks both CSV and Excel)
data_file_csv = project_root / 'data' / f'{TICKER}_{START_DATE}_{END_DATE}.csv'
data_file_xlsx = project_root / 'data' / f'{TICKER}_{START_DATE}_{END_DATE}.xlsx'
data = None

# CSV is preferred; the Excel copy is used when the CSV is missing or unreadable.
cached_sources = (
    (data_file_csv, pd.read_csv, 'CSV'),
    (data_file_xlsx, pd.read_excel, 'Excel'),
)
for cache_path, cache_reader, cache_label in cached_sources:
    if not cache_path.exists():
        continue
    print(f"Loading saved data from {cache_path.name}...")
    try:
        data = cache_reader(cache_path, index_col=0, parse_dates=True)
    except Exception as e:
        print(f"⚠️  Error loading {cache_label}: {e}")
        data = None
        continue
    print(f"✓ Loaded {len(data)} days of saved data from {cache_label}")
    break

# If no saved data, try to download -- unless sample data was requested, in
# which case the download is skipped entirely (a failing download raises and
# would otherwise make the sample-data fallback below unreachable).
if (data is None or data.empty) and not USE_SAMPLE_DATA:
    print(f"\nDownloading data for {TICKER}...")

    max_retries = 3
    retry_delay = 30  # seconds; multiplied by the attempt number before each retry

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1

        # Linear back-off: no wait before the first attempt, then 30s, 60s, ...
        if attempt > 0:
            wait_time = retry_delay * attempt
            print(f"Waiting {wait_time} seconds before retry attempt {attempt + 1}/{max_retries}...")
            time.sleep(wait_time)

        try:
            data = download_data(TICKER, START_DATE, END_DATE, api_source=API_SOURCE, api_key=API_KEY)
        except Exception as e:
            error_msg = str(e).lower()
            error_type = type(e).__name__

            # Check for rate limit errors (both in message and exception type)
            is_rate_limit = (
                'rate limit' in error_msg or
                'too many requests' in error_msg or
                'YFRateLimitError' in error_type or
                'ratelimit' in error_msg
            )

            if not is_rate_limit:
                # For other errors, raise immediately
                print(f"Error: {e}")
                raise

            if not is_last_attempt:
                # Will wait at start of next loop iteration
                print(f"Rate limit hit. Will retry with longer wait... (Attempt {attempt + 1}/{max_retries})")
                continue

            print(f"\n❌ Rate limited after {max_retries} attempts.")
            print("\n" + "="*60)
            print("YAHOO FINANCE RATE LIMIT - SOLUTIONS:")
            print("="*60)
            print("1. Wait 15-20 minutes, then run this cell again")
            print("2. Try again later today (rate limits reset over time)")
            print("3. Once data downloads successfully, it will be saved")
            print("   and you won't need to download again")
            print("="*60)
            raise ValueError("Rate limited by Yahoo Finance. Please wait 15-20 minutes and try again.") from e

        # Treat a None result the same as an empty frame
        if data is None or data.empty:
            if not is_last_attempt:
                # Will wait at start of next loop iteration
                print(f"Download returned empty data. Will retry... (Attempt {attempt + 1}/{max_retries})")
                continue
            raise ValueError(f"Failed to download data for {TICKER} after {max_retries} attempts.")

        # Save data for future use. A failure to write the cache must not
        # throw away the data that was just downloaded, so it only warns.
        try:
            data_file_csv.parent.mkdir(parents=True, exist_ok=True)
            data.to_csv(data_file_csv)
            print(f"✓ Data saved to {data_file_csv.name} for future use")
        except OSError as e:
            print(f"⚠️  Could not save data to {data_file_csv.name}: {e}")
        break

# If still no data and USE_SAMPLE_DATA is True, generate sample data
if (data is None or data.empty) and USE_SAMPLE_DATA:
    print("\n⚠️  Using sample data (USE_SAMPLE_DATA=True)")
    print("This is synthetic data for testing purposes only!")

    # Generate sample OHLC data
    dates = pd.date_range(start=START_DATE, end=END_DATE, freq='D')
    dates = dates[dates.weekday < 5]  # Only weekdays

    np.random.seed(42)  # fixed seed so the synthetic series is reproducible
    n_days = len(dates)
    base_price = 400.0

    # Close prices follow a random walk in log space from base_price
    returns = np.random.normal(0.0005, 0.015, n_days)  # Daily returns
    prices = base_price * np.exp(np.cumsum(returns))

    # Generate OHLC from close prices
    data = pd.DataFrame({
        'Open': prices * (1 + np.random.normal(0, 0.002, n_days)),
        'High': prices * (1 + np.abs(np.random.normal(0, 0.005, n_days))),
        'Low': prices * (1 - np.abs(np.random.normal(0, 0.005, n_days))),
        'Close': prices,
        'Volume': np.random.randint(50000000, 200000000, n_days)
    }, index=dates)

    # Ensure High >= Close >= Low and High >= Open >= Low
    data['High'] = data[['Open', 'High', 'Close']].max(axis=1) * 1.001
    data['Low'] = data[['Open', 'Low', 'Close']].min(axis=1) * 0.999

    print(f"✓ Generated {len(data)} days of sample data")
    print(f"Date range: {data.index[0]} to {data.index[-1]}")
    print("\n⚠️  WARNING: This is synthetic data, not real market data!")
    print("   Set USE_SAMPLE_DATA=False and wait 15-20 minutes to download real data.")

# Display data info
if data is not None and not data.empty:
    print(f"\n✓ Successfully loaded {len(data)} days of data")
    print(f"Date range: {data.index[0]} to {data.index[-1]}")
    print("\nData preview:")
    display(data.head())  # Use display() to ensure it shows in Jupyter
else:
    print("\n" + "="*70)
    print("NO DATA AVAILABLE - OPTIONS:")
    print("="*70)
    print("1. Wait 15-20 minutes, then run this cell again")
    print("2. Set USE_SAMPLE_DATA = True at the top of this cell to use sample data")
    print("   (for testing purposes only - not real market data)")
    print("3. Try again later today (rate limits reset over time)")
    print("="*70)
    raise ValueError("No data available. Set USE_SAMPLE_DATA=True to continue with sample data, or wait and try downloading again.")

API Source: alpha_vantage
API Key: ************4O8F
Loading saved data from SPY_2020-01-01_2024-01-01.csv...
✓ Loaded 1006 days of saved data
Date range: 2020-01-02 00:00:00 to 2023-12-29 00:00:00

✓ Successfully loaded 1006 days of data
Date range: 2020-01-02 00:00:00 to 2023-12-29 00:00:00

Data preview:


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,323.54,324.89,322.53,324.87,59037072
2020-01-03,321.16,323.64,321.1,322.41,77708081
2020-01-06,320.49,323.73,320.36,323.64,55596982
2020-01-07,323.02,323.54,322.24,322.73,40461249
2020-01-08,322.94,325.78,322.67,324.45,68177241


## 2. Feature Engineering

In [13]:
# Build the engineered feature frame from the price data
print("Engineering features...")
feature_settings = {
    'return_periods': [1, 5],
    'volatility_window': 20,
    'ma_windows': [5, 20],
}
data_features = engineer_features(data, **feature_settings)

# Split the engineered frame into the model inputs (X) and the target (y)
X, y = prepare_features_for_training(data_features)

# Summarise what will be fed to the model
feature_names = list(X.columns)
target_counts = y.value_counts()
print(f"\nFeatures shape: {X.shape}")
print(f"Target distribution:\n{target_counts}")
print(f"\nFeature columns: {feature_names}")

Engineering features...

Features shape: (986, 6)
Target distribution:
target
1    528
0    458
Name: count, dtype: int64

Feature columns: ['return_1d', 'return_5d', 'volatility_20d', 'ma_5d', 'ma_20d', 'ma_gap']


## 3. Train-Test Split (Time Series Aware)

In [14]:
# time_series_split operates on a single frame, so glue features and target together
data_combined = pd.concat([X, y], axis=1)

# Split data: 70% train, 15% validation; the remaining rows form the test set
train, val, test = time_series_split(data_combined, train_size=0.7, val_size=0.15)

# Separate each partition back into feature matrix and target vector
feature_cols = X.columns
X_train, y_train = train[feature_cols], train['target']
X_val, y_val = val[feature_cols], val['target']
X_test, y_test = test[feature_cols], test['target']

# Report partition sizes
n_train, n_val, n_test = len(X_train), len(X_val), len(X_test)
print(f"Train set: {n_train} samples")
print(f"Validation set: {n_val} samples")
print(f"Test set: {n_test} samples")

Train set: 690 samples
Validation set: 148 samples
Test set: 148 samples


## 4. Train Model

In [15]:
# Choose model: 'random_forest' or 'xgboost'
MODEL_TYPE = 'random_forest'  # Change to 'xgboost' to use XGBoost

if MODEL_TYPE == 'random_forest':
    print("Training Random Forest...")
    model, metrics = train_random_forest(
        X_train, y_train,
        X_val, y_val,
        n_estimators=100,
        max_depth=10,
        random_state=42
    )
elif MODEL_TYPE == 'xgboost':
    print("Training XGBoost...")
    model, metrics = train_xgboost(
        X_train, y_train,
        X_val, y_val,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42
    )
else:
    # Fail loudly on a typo. Without this branch the cell would either hit a
    # NameError below or, worse, report a stale `model`/`metrics` left in the
    # kernel by an earlier run.
    raise ValueError(
        f"Unknown MODEL_TYPE {MODEL_TYPE!r}; expected 'random_forest' or 'xgboost'."
    )

# Metrics returned by the training helper for the chosen model
print("\nValidation Metrics:")
print(f"  AUC: {metrics['auc']:.4f}")
print(f"  Train Accuracy: {metrics['train_accuracy']:.4f}")
print(f"  Validation Accuracy: {metrics['val_accuracy']:.4f}")

Training Random Forest...

Validation Metrics:
  AUC: 0.5410
  Train Accuracy: 0.9681
  Validation Accuracy: 0.5000


## 5. Feature Importance

In [16]:
# Compute per-feature importances for the trained model and print the table
feature_names = list(X.columns)
importance_df = get_feature_importance(model, feature_names)
print("Feature Importance:")
print(importance_df)

# Fall back to a local 'outputs' folder when the setup cell has not defined output_dir
try:
    output_dir
except NameError:
    from pathlib import Path
    output_dir = Path('outputs')
    output_dir.mkdir(exist_ok=True)

# Write the importance chart into the output directory
feature_importance_path = str(output_dir / 'feature_importance.png')
plot_feature_importance(importance_df, feature_importance_path)

Feature Importance:
          feature  importance
0       return_1d    0.189799
2  volatility_20d    0.170683
1       return_5d    0.169874
3           ma_5d    0.164021
5          ma_gap    0.159691
4          ma_20d    0.145932
Feature importance plot saved to c:\Users\Admin\Documents\Projects\ml-signal-generator\outputs\feature_importance.png


## 6. Evaluate on Test Set

In [17]:
# Score the trained model on the held-out test partition
from sklearn.metrics import classification_report

test_results = evaluate_model(model, X_test, y_test)

# Headline metrics, formatted to four decimals
test_auc = test_results['auc']
test_accuracy = test_results['accuracy']
print("\nTest Set Metrics:")
print("  AUC: " + format(test_auc, '.4f'))
print("  Accuracy: " + format(test_accuracy, '.4f'))

# Per-class precision / recall / F1 computed from the predicted labels
print("\nClassification Report:")
print(classification_report(y_test, test_results['y_pred']))


Test Set Metrics:
  AUC: 0.5455
  Accuracy: 0.5608

Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.21      0.29        63
           1       0.58      0.82      0.68        85

    accuracy                           0.56       148
   macro avg       0.52      0.51      0.48       148
weighted avg       0.53      0.56      0.51       148



## 7. Generate Trading Signals

In [18]:
# Turn test-set probabilities into trading signals
SIGNAL_THRESHOLD = 0.55  # Probability threshold for signal generation

# Wrap the raw signals in a Series indexed like X_test so they can be aligned by label
raw_signals = generate_signals(test_results['y_pred_proba'], threshold=SIGNAL_THRESHOLD)
signals = pd.Series(raw_signals, index=X_test.index)

# Realised next-period returns for the test rows; rows without a value are dropped
test_returns = data_features.loc[X_test.index, 'next_return'].dropna()

# Keep only the signals that still have a matching return
signals = signals.loc[test_returns.index]

n_signals = signals.sum()
n_days = len(signals)
signal_rate_pct = signals.mean() * 100
first_date, last_date = test_returns.index[0], test_returns.index[-1]

print("Signal Statistics:")
print(f"  Total signals: {n_signals} out of {n_days} days")
print(f"  Signal rate: {signal_rate_pct:.2f}%")
print(f"\nTest returns range: {first_date} to {last_date}")

Signal Statistics:
  Total signals: 104 out of 147 days
  Signal rate: 70.75%

Test returns range: 2023-05-31 00:00:00 to 2023-12-28 00:00:00


## 8. Backtest Strategy

In [19]:
# Align signals with returns (already aligned in previous cell)
aligned_data = pd.DataFrame({
    'signals': signals.values,
    'returns': test_returns.values
}, index=signals.index)

# Backtest
# The result is stored under its own name so the model's validation `metrics`
# from the training cell are not overwritten by the backtest statistics.
equity, backtest_metrics = backtest_strategy(
    aligned_data['signals'].values,
    aligned_data['returns'].values,
    initial_capital=10000.0
)

# Create equity series with dates.
# The dates are attached positionally. backtest_strategy is given plain
# arrays, so a Series it returns presumably carries a positional index
# (TODO confirm); label-based reindexing against the date index would then
# produce an all-NaN curve. np.asarray() handles both Series and array
# returns; a length mismatch raises ValueError instead of silently
# producing NaNs.
equity_series = pd.Series(np.asarray(equity), index=aligned_data.index)

print("\nBacktest Performance Metrics:")
print(f"  Total Return: {backtest_metrics['total_return_pct']:.2f}%")
print(f"  Annualized Return: {backtest_metrics['annualized_return_pct']:.2f}%")
print(f"  Volatility: {backtest_metrics['volatility_pct']:.2f}%")
print(f"  Sharpe Ratio: {backtest_metrics['sharpe_ratio']:.2f}")
print(f"  Max Drawdown: {backtest_metrics['max_drawdown_pct']:.2f}%")
print(f"  Win Rate: {backtest_metrics['win_rate_pct']:.2f}%")
print(f"  Total Trades: {backtest_metrics['total_trades']}")


Backtest Performance Metrics:
  Total Return: 13.82%
  Annualized Return: 24.85%
  Volatility: 9.58%
  Sharpe Ratio: 2.59
  Max Drawdown: -7.77%
  Win Rate: 61.17%
  Total Trades: 104


## 9. Plot Results

In [20]:
# Render the strategy's equity curve to a PNG in the output directory

# Fall back to a local 'outputs' folder when the setup cell has not defined output_dir
try:
    output_dir
except NameError:
    from pathlib import Path
    output_dir = Path('outputs')
    output_dir.mkdir(exist_ok=True)

equity_curve_file = output_dir / 'equity_curve.png'
equity_curve_path = str(equity_curve_file)
plot_equity_curve(equity_series, equity_curve_path, 'Strategy Equity Curve')

Equity curve saved to c:\Users\Admin\Documents\Projects\ml-signal-generator\outputs\equity_curve.png
