# ML Signal Generator - Training Pipeline

This notebook demonstrates the complete pipeline for:
1. Downloading market data
2. Engineering features
3. Training ML models
4. Generating trading signals
5. Backtesting performance

In [1]:
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from features import download_data, engineer_features, prepare_features_for_training
from model import time_series_split, train_random_forest, train_xgboost, get_feature_importance, evaluate_model
from backtest import generate_signals, backtest_strategy, plot_equity_curve, plot_feature_importance

## 1. Download Data

In [2]:
# Configuration
TICKER = 'SPY'  # S&P 500 ETF
START_DATE = '2020-01-01'
END_DATE = '2024-01-01'

# Download OHLC data
print(f"Downloading data for {TICKER}...")
data = download_data(TICKER, START_DATE, END_DATE)
print(f"Downloaded {len(data)} days of data")
print(f"Date range: {data.index[0]} to {data.index[-1]}")
data.head()

Downloading data for SPY...


  data = yf.download(ticker, start=start_date, end=end_date, progress=False)


Downloaded 1006 days of data
Date range: 2020-01-02 00:00:00 to 2023-12-29 00:00:00


Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,298.578644,298.597043,296.428021,297.356291,59151200
2020-01-03,296.317749,297.448217,295.113768,295.16891,77709700
2020-01-06,297.448242,297.530955,294.433661,294.553145,55653900
2020-01-07,296.611908,297.356354,296.161544,296.878418,40496400
2020-01-08,298.192719,299.415071,296.556773,296.804912,68296000


## 2. Feature Engineering

In [3]:
# Engineer features
print("Engineering features...")
data_features = engineer_features(
    data,
    return_periods=[1, 5],
    volatility_window=20,
    ma_windows=[5, 20]
)

# Prepare features for training
X, y = prepare_features_for_training(data_features)
print(f"\nFeatures shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")
print(f"\nFeature columns: {list(X.columns)}")

Engineering features...

Features shape: (986, 6)
Target distribution:
target
1    529
0    457
Name: count, dtype: int64

Feature columns: ['return_1d', 'return_5d', 'volatility_20d', 'ma_5d', 'ma_20d', 'ma_gap']


## 3. Train-Test Split (Time Series Aware)

In [4]:
# Combine X and y for time series split
data_combined = pd.concat([X, y], axis=1)

# Split data
train, val, test = time_series_split(data_combined, train_size=0.7, val_size=0.15)

X_train = train[X.columns]
y_train = train['target']
X_val = val[X.columns]
y_val = val['target']
X_test = test[X.columns]
y_test = test['target']

print(f"Train set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

Train set: 690 samples
Validation set: 148 samples
Test set: 148 samples


## 4. Train Model

In [5]:
# Choose model: 'random_forest' or 'xgboost'
MODEL_TYPE = 'random_forest'  # Change to 'xgboost' to use XGBoost

if MODEL_TYPE == 'random_forest':
    print("Training Random Forest...")
    model, metrics = train_random_forest(
        X_train, y_train,
        X_val, y_val,
        n_estimators=100,
        max_depth=10,
        random_state=42
    )
elif MODEL_TYPE == 'xgboost':
    print("Training XGBoost...")
    model, metrics = train_xgboost(
        X_train, y_train,
        X_val, y_val,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42
    )

print(f"\nValidation Metrics:")
print(f"  AUC: {metrics['auc']:.4f}")
print(f"  Train Accuracy: {metrics['train_accuracy']:.4f}")
print(f"  Validation Accuracy: {metrics['val_accuracy']:.4f}")

Training Random Forest...

Validation Metrics:
  AUC: 0.5417
  Train Accuracy: 0.9725
  Validation Accuracy: 0.5473


## 5. Feature Importance

In [6]:
# Get feature importance
importance_df = get_feature_importance(model, list(X.columns))
print("Feature Importance:")
print(importance_df)

# Plot feature importance
plot_feature_importance(importance_df, 'outputs/feature_importance.png')

Feature Importance:
          feature  importance
0       return_1d    0.184821
2  volatility_20d    0.170873
1       return_5d    0.168280
4          ma_20d    0.160028
5          ma_gap    0.158587
3           ma_5d    0.157412
Feature importance plot saved to outputs/feature_importance.png


## 6. Evaluate on Test Set

In [7]:
# Evaluate model on test set
from sklearn.metrics import classification_report

test_results = evaluate_model(model, X_test, y_test)

print(f"\nTest Set Metrics:")
print(f"  AUC: {test_results['auc']:.4f}")
print(f"  Accuracy: {test_results['accuracy']:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, test_results['y_pred']))


Test Set Metrics:
  AUC: 0.4665
  Accuracy: 0.5135

Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.14      0.20        63
           1       0.55      0.79      0.65        85

    accuracy                           0.51       148
   macro avg       0.44      0.47      0.43       148
weighted avg       0.46      0.51      0.46       148



## 7. Generate Trading Signals

In [None]:
# Generate signals on test set
SIGNAL_THRESHOLD = 0.55  # Probability threshold for signal generation

# Create signals as Series with X_test index for proper alignment
signals = pd.Series(
    generate_signals(test_results['y_pred_proba'], threshold=SIGNAL_THRESHOLD),
    index=X_test.index
)

# Get actual returns for backtesting
test_returns = data_features.loc[X_test.index, 'next_return'].dropna()

# Align signals with test_returns (remove rows where returns are NaN)
signals = signals.loc[test_returns.index]

print(f"Signal Statistics:")
print(f"  Total signals: {signals.sum()} out of {len(signals)} days")
print(f"  Signal rate: {signals.mean()*100:.2f}%")
print(f"\nTest returns range: {test_returns.index[0]} to {test_returns.index[-1]}")

Signal Statistics:
  Total signals: 99 out of 148 days
  Signal rate: 66.89%

Test returns range: 2023-05-31 00:00:00 to 2023-12-28 00:00:00


## 8. Backtest Strategy

In [None]:
# Align signals with returns (already aligned in previous cell)
aligned_data = pd.DataFrame({
    'signals': signals.values,
    'returns': test_returns.values
}, index=signals.index)

# Backtest
equity, metrics = backtest_strategy(
    aligned_data['signals'].values,
    aligned_data['returns'].values,
    initial_capital=10000.0
)

# Create equity series with dates
# Handle both Series and array returns from backtest_strategy
if isinstance(equity, pd.Series):
    equity_series = equity.reindex(aligned_data.index)
else:
    equity_series = pd.Series(equity, index=aligned_data.index)

print("\nBacktest Performance Metrics:")
print(f"  Total Return: {metrics['total_return_pct']:.2f}%")
print(f"  Annualized Return: {metrics['annualized_return_pct']:.2f}%")
print(f"  Volatility: {metrics['volatility_pct']:.2f}%")
print(f"  Sharpe Ratio: {metrics['sharpe_ratio']:.2f}")
print(f"  Max Drawdown: {metrics['max_drawdown_pct']:.2f}%")
print(f"  Win Rate: {metrics['win_rate_pct']:.2f}%")
print(f"  Total Trades: {metrics['total_trades']}")

ValueError: array length 148 does not match index length 147

## 9. Plot Results

In [None]:
# Plot equity curve
plot_equity_curve(equity_series, 'outputs/equity_curve.png', 'Strategy Equity Curve')