# End-to-End Systematic Equity Strategy

This notebook implements a complete systematic equity strategy pipeline:

1. **Data Loading & Preprocessing** - S&P 500 prices + Fama-French 5 factors
2. **Alpha Estimation** - Rolling FF5 regression to estimate stock alphas
3. **Feature Engineering** - Technical indicators and factor exposures
4. **Alpha Prediction** - ML models (ElasticNetCV, XGBoost, RandomForest, TabPFN)
5. **Risk Model** - Shrinkage covariance estimation
6. **Portfolio Optimization** - Multiple objectives (Max Sharpe, Target Risk, Target Return, etc.)
7. **Backtesting** - Weekly rebalancing with transaction costs
8. **Performance Analysis** - Strategy comparison and recommendation

**Configuration:**
- Date range: 2000-01-01 to 2020-11-24
- Training lookback: 104 weeks (~2 years)
- Rebalancing: Weekly (Friday)
- Universe: S&P 500 constituents

In [5]:
# Install required packages if needed
# NOTE: none of the packages below are version-pinned, and tabpfn is explicitly
# upgraded to its newest release, so the installed versions (and potentially the
# results) can differ between machines and over time.
# %pip (rather than !pip) targets the environment of the running kernel.
%pip install tabpfn --upgrade
# Numerical / ML / plotting / optimization stack.
%pip install xgboost scikit-learn pandas numpy matplotlib seaborn scipy cvxpy
# Distribution name for PyPortfolioOpt.
%pip install pyportfolioopt
# NOTE(review): presumably installed as the parquet engine for pd.read_parquet — confirm.
%pip install pyarrow

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting pyarrow
  Downloading pyarrow-23.0.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (3.0 kB)
Downloading pyarrow-23.0.0-cp313-cp313-macosx_12_0_arm64.whl (34.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.2/34.2 MB[0m [31m27.7 MB/s[0m  [33m0:00:01[0mm0:00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-23.0.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from dataclasses import dataclass, field
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# ML imports
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# Portfolio Optimization - PyPortfolioOpt
try:
    from pypfopt import EfficientFrontier, risk_models, expected_returns
    from pypfopt import HRPOpt
    from pypfopt.objective_functions import L2_reg
    PYPFOPT_AVAILABLE = True
    print("PyPortfolioOpt loaded successfully")
except ImportError:
    PYPFOPT_AVAILABLE = False
    print("PyPortfolioOpt not available - install with: pip install pyportfolioopt")

# Optimization (fallback)
from scipy.optimize import minimize
from scipy import stats

# Global matplotlib look-and-feel: apply the style sheet first, then override
# the default figure size and base font size on top of it.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

# Seed NumPy's legacy global RNG so stochastic steps repeat across runs.
np.random.seed(42)

# Directory the data files are read from (relative to the working directory).
DATA_DIR = Path('.')

PyPortfolioOpt loaded successfully


In [19]:
# Configuration
@dataclass
class StrategyConfig:
    """Tunable parameters for the strategy pipeline.

    Every field has a default, so ``StrategyConfig()`` yields the baseline
    setup; override individual fields by keyword to run a variant.
    """

    # Date range (date strings)
    start_date: str = '2000-01-01'
    end_date: str = '2020-11-24'

    # Training
    training_lookback_weeks: int = 104  # ~2 years (rolling window)
    min_training_samples: int = 80  # Minimum weeks to start training

    # Alpha estimation
    ff_regression_window: int = 52  # 1 year for FF regression

    # Portfolio constraints
    max_weight: float = 0.02  # 2% max per stock
    sector_cap: float = 0.25  # 25% max per sector
    min_stocks: int = 30  # Minimum stocks in portfolio

    # Transaction costs
    cost_bps: float = 10.0  # 10 bps per unit turnover

    # Optimization targets
    target_risk_annual: float = 0.15  # 15% annual vol for target risk
    target_return_annual: float = 0.10  # 10% annual return for target return
    risk_aversion: float = 5.0  # For mean-variance

    @property
    def training_lookback_years(self) -> float:
        """Training lookback expressed in years, assuming 52 weeks per year."""
        return self.training_lookback_weeks / 52


CONFIG = StrategyConfig()
print("Configuration loaded:")
print(f"  Date range: {CONFIG.start_date} to {CONFIG.end_date}")
# The year figure is derived from the config so the summary stays truthful
# when the lookback is overridden (it prints "~2 years" for the default 104).
print(
    f"  Training lookback: {CONFIG.training_lookback_weeks} weeks "
    f"(~{CONFIG.training_lookback_years:.2g} years)"
)
print(f"  Max position: {CONFIG.max_weight:.1%}")
print(f"  Sector cap: {CONFIG.sector_cap:.0%}")

Configuration loaded:
  Date range: 2000-01-01 to 2020-11-24
  Training lookback: 104 weeks (~2 years)
  Max position: 2.0%
  Sector cap: 25%


In [None]:
# Read the price history and coerce its `date` column to datetime
print("Loading S&P 500 price data...")
prices_df = pd.read_parquet(DATA_DIR / 'sp500_prices_with_metadata.parquet').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Quick summary of what was loaded: row count, covered dates, ticker count
earliest_date = prices_df['date'].min().date()
latest_date = prices_df['date'].max().date()
print(f"Total records: {len(prices_df):,}")
print(f"Date range: {earliest_date} to {latest_date}")
print(f"Unique tickers: {prices_df['ticker'].nunique()}")

# Read the metadata file and build a ticker -> sector lookup dictionary
metadata_df = pd.read_csv(DATA_DIR / 'sp500_metadata.csv')
sector_by_ticker = metadata_df.set_index('ticker')['sector']
sector_map = sector_by_ticker.to_dict()
print(f"\nSectors: {metadata_df['sector'].nunique()}")

Loading S&P 500 price data...
Total records: 2,532,720
Date range: 2 to 1
Unique tickers: 504

Sectors: 11


In [15]:
# Read the daily Fama-French 5-factor file
print("Loading Fama-French 5 Factors...")
ff_df = pd.read_csv(DATA_DIR / 'FamaFrench_5_Factors_daily_returns.csv')

# `Date` holds YYYYMMDD values: parse them into a new `date` column,
# then discard the raw column
ff_df = ff_df.assign(
    date=pd.to_datetime(ff_df['Date'].astype(str), format='%Y%m%d')
).drop(columns='Date')

# IMPORTANT: values arrive as percentages; rescale every factor column
# (and the risk-free rate) to decimals in a single vectorized step
factor_cols = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
ff_df[factor_cols] = ff_df[factor_cols] / 100.0

ff_first_day = ff_df['date'].min().date()
ff_last_day = ff_df['date'].max().date()
print(f"FF5 date range: {ff_first_day} to {ff_last_day}")
print(f"Records: {len(ff_df):,}")
print("\nSample (converted to decimals):")
ff_df.head(10)

Loading Fama-French 5 Factors...
FF5 date range: 1963-07-01 to 2022-06-30
Records: 14,852

Sample (converted to decimals):


Unnamed: 0,Mkt-RF,SMB,HML,RMW,CMA,RF,date
0,-0.0067,0.0001,-0.0035,0.0003,0.0011,0.00012,1963-07-01
1,0.0079,-0.0031,0.0024,-0.0008,-0.0025,0.00012,1963-07-02
2,0.0063,-0.0016,-0.0009,0.0013,-0.0024,0.00012,1963-07-03
3,0.004,0.0009,-0.0026,0.0007,-0.0028,0.00012,1963-07-05
4,-0.0063,0.0007,-0.0019,-0.0027,0.0006,0.00012,1963-07-08
5,0.0045,0.0001,0.0008,0.0015,0.0001,0.00012,1963-07-09
6,-0.0018,0.002,0.0001,0.0005,-0.0009,0.00012,1963-07-10
7,-0.0016,0.0012,-0.0029,-0.0003,0.0007,0.00012,1963-07-11
8,-0.0012,0.0001,-0.0008,0.0009,0.0007,0.00012,1963-07-12
9,-0.0062,0.0007,-0.0002,0.0015,-0.001,0.00012,1963-07-15


In [23]:
# Restrict prices and factors to the configured window, then keep only
# price rows that fall on a date present in the factor data
print("Filtering data to date range and trading days only...")

# Prices: keep rows with start_date <= date <= end_date (both ends inclusive)
price_in_window = (
    (prices_df['date'] >= CONFIG.start_date)
    & (prices_df['date'] <= CONFIG.end_date)
)
prices_df = prices_df.loc[price_in_window].copy()

# Factors: same inclusive window
ff_in_window = (
    (ff_df['date'] >= CONFIG.start_date)
    & (ff_df['date'] <= CONFIG.end_date)
)
ff_df = ff_df.loc[ff_in_window].copy()

# The dates present in the factor data serve as the trading calendar
# (excludes weekends and holidays)
trading_days = set(ff_df['date'])

# Drop any price row dated outside that calendar
on_trading_day = prices_df['date'].isin(trading_days)
prices_df = prices_df.loc[on_trading_day].copy()

print(f"Trading days in period: {len(trading_days):,}")
print(f"Price records after filtering: {len(prices_df):,}")

# Sanity check: row counts per weekday
weekday_counts = prices_df['date'].dt.dayofweek.value_counts().sort_index()
print("\nDay of week distribution (0=Mon, 4=Fri):")
print(weekday_counts)

Filtering data to date range and trading days only...
Trading days in period: 5,259
Price records after filtering: 2,349,487

Day of week distribution (0=Mon, 4=Fri):
date
0    441881
1    481208
2    481474
3    473586
4    471338
Name: count, dtype: int64


In [7]:
# Build a date x ticker matrix of adjusted closes
print("Creating price panel...")
price_panel = pd.pivot_table(
    prices_df,
    values='adj_close',
    index='date',
    columns='ticker',
)

# Bridge short gaps: carry the last known price forward, but for at most
# 5 consecutive missing rows
price_panel = price_panel.ffill(limit=5)

# Keep only tickers that have a (possibly forward-filled) price on at least
# 80% of the panel's dates
min_obs = int(len(price_panel) * 0.8)
observations_per_ticker = price_panel.count()
valid_stocks = price_panel.columns[observations_per_ticker >= min_obs]
price_panel = price_panel[valid_stocks]

print(f"Price panel shape: {price_panel.shape}")
print(f"Stocks with sufficient data: {len(valid_stocks)}")

Creating price panel...
Price panel shape: (5259, 403)
Stocks with sufficient data: 403


In [8]:
# Resample to weekly (Friday close)
print("Resampling to weekly frequency (Friday close)...")

# Take the last available price in each week ending on Friday and drop weeks
# with no prices at all. Each row is labelled with the week-ending Friday even
# when the week's last observation is earlier (e.g. a partial final week).
weekly_prices = price_panel.resample('W-FRI').last()
weekly_prices = weekly_prices.dropna(how='all')

# Compute weekly returns; the first row has no prior week, so it is all-NaN
# and gets dropped.
# NOTE(review): depending on the pandas version, pct_change may forward-fill
# missing prices by default (yielding 0 returns across gaps); consider passing
# fill_method=None explicitly — confirm the intended behavior.
weekly_returns = weekly_prices.pct_change().dropna(how='all')

# Also resample FF factors to weekly.
# Index the factors by date so they can be resampled. The guard keeps this cell
# re-runnable: once `date` has become the index, a second unguarded
# set_index('date') would raise KeyError.
if 'date' in ff_df.columns:
    ff_df = ff_df.set_index('date')


# For factors, compound daily returns to get weekly
def compound_returns(x):
    """Compound a group of simple returns into one return for the period.

    Computes ``(1 + x).prod() - 1``, i.e. the product of the growth factors
    ``1 + r`` minus one.
    """
    return (1 + x).prod() - 1


ff_weekly = ff_df.resample('W-FRI').apply(compound_returns)

print(f"Weekly prices shape: {weekly_prices.shape}")
print(f"Weekly returns shape: {weekly_returns.shape}")
print(f"Weekly FF factors shape: {ff_weekly.shape}")
print(f"\nDate range: {weekly_returns.index.min().date()} to {weekly_returns.index.max().date()}")

Resampling to weekly frequency (Friday close)...
Weekly prices shape: (1091, 403)
Weekly returns shape: (1090, 403)
Weekly FF factors shape: (1091, 6)

Date range: 2000-01-14 to 2020-11-27


In [9]:
# Keep only the weeks that appear in both the return panel and the factor table
common_dates = weekly_returns.index.intersection(ff_weekly.index)
weekly_returns, ff_weekly = (
    weekly_returns.loc[common_dates],
    ff_weekly.loc[common_dates],
)

# Report how many weeks survived and the span they cover
aligned_start = common_dates.min().date()
aligned_end = common_dates.max().date()
print(f"Aligned weekly data: {len(common_dates)} weeks")
print(f"From {aligned_start} to {aligned_end}")

Aligned weekly data: 1090 weeks
From 2000-01-14 to 2020-11-27
