In [41]:
# 04_Backtest_Portfolio.ipynb

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

# --- Load predictions and realized data ---
pred_ranks = pd.read_pickle("../data/processed/pred_ranks.pkl")
target = pd.read_pickle("../data/processed/target.pkl")
prices = pd.read_pickle("../data/processed/prices.pkl") 

print("Predicted ranks shape:", pred_ranks.shape)
print("Target shape:", target.shape)
print("Prices shape:", prices.shape)

# inspect the data
print("\n--- Predicted ranks head ---")
print(pred_ranks.head())

print("\n--- Target head ---")
print(target.head())

print("\n--- Prices head ---")
print(prices.head())

Predicted ranks shape: (23, 5)
Target shape: (113, 5)
Prices shape: (2516, 5)

--- Predicted ranks head ---
                AAPL      MSFT       JPM       XOM        PG
Date                                                        
2023-01-31  0.300172  0.233383  0.332454  0.477767  0.390224
2023-02-28  0.202669  0.186265  0.241581  0.281771  0.421400
2023-03-31  0.280127  0.291004  0.320349  0.265407  0.372184
2023-04-30  0.285628  0.350500  0.348520  0.340478  0.456608
2023-05-31  0.161780  0.304606  0.287076  0.324135  0.317805

--- Target head ---
            AAPL  MSFT  JPM  XOM  PG
Date                                
2015-08-31     0     0    0    0   0
2015-09-30     1     1    1    1   1
2015-10-31     2     1    2    2   1
2015-11-30     1     2    2    1   2
2015-12-31     1     0    0    1   0

--- Prices head ---
                 AAPL       MSFT        JPM        XOM         PG
Date                                                             
2015-01-02  24.261049  39.933064

In [42]:
# --- Define a backtest strategy ---
# Simple backtest strate:
# Each period, pick top-k assets based on predicted rank.
# Compute equal-weighted average of their realized returns.
# --- Compute realized monthly returns ---
monthly_returns = prices.pct_change().resample('M').last()

# --- Align indices ---
common_idx = monthly_returns.index.intersection(pred_ranks.index)
pred_ranks = pred_ranks.loc[common_idx]
monthly_returns = monthly_returns.loc[common_idx]

print("Pred_ranks shape:", pred_ranks.shape)
print("Monthly_returns shape:", monthly_returns.shape)


Pred_ranks shape: (23, 5)
Monthly_returns shape: (23, 5)


In [43]:
import sys, os
sys.path.append(os.path.abspath(".."))
from src.backtest import backtest_topk
# backtest_topk returns a Series
# --- Run backtest ---
portfolio_returns = backtest_topk(pred_ranks, monthly_returns, k=2)
portfolio_returns_df = portfolio_returns.to_frame(name='ML_Portfolio')
portfolio_returns_df['Benchmark'] = benchmark_returns
print(portfolio_returns.head(10))
# --- Compute cumulative returns ---
cumulative = (1 + portfolio_returns).cumprod()  # compound rate (multiplicative)
cumulative = cumulative.to_frame(name="ML_Portfolio")  # convert Series to DataFrame with column name

Date
2023-01-31    0.015645
2023-02-28   -0.008572
2023-03-31    0.010263
2023-04-30    0.003698
2023-05-31   -0.011313
2023-06-30    0.009584
2023-07-31    0.004929
2023-08-31   -0.004778
2023-09-30   -0.016616
2023-10-31    0.008547
Freq: M, dtype: float64


In [44]:
# Create benchmark: equal-weighted portfolio
benchmark_returns = monthly_returns.mean(axis=1)
benchmark_cumulative = (1 + benchmark_returns).cumprod()
benchmark_cumulative = benchmark_cumulative.to_frame(name='Benchmark')
print(benchmark_returns.head(10))
# Combine ML and Benchmark for visualization
cumulative = pd.concat([cumulative, benchmark_cumulative], axis=1)

Date
2023-01-31    0.013458
2023-02-28   -0.003036
2023-03-31    0.010537
2023-04-30    0.007309
2023-05-31   -0.008831
2023-06-30    0.014891
2023-07-31    0.006297
2023-08-31   -0.001914
2023-09-30   -0.005348
2023-10-31    0.004400
Freq: M, dtype: float64


In [45]:
# simple ML without strong features usually won’t beat an equal-weight benchmark, especially in a short horizon.
# For demo purpose, we accecpt and move on to the next step.
# In reality, we could Extend backtest to full history with rolling training.
# -Add more features (macro, technical, cross-sectional factors).
# -Try different models (XGBoost, Neural Nets).
# -Compare Sharpe ratio, drawdowns, not just cumulative returns.
os.makedirs("../data/processed", exist_ok=True)

# Save monthly returns and cumulative performance
portfolio_returns_df.to_pickle("../data/processed/backtest_returns.pkl")
cumulative.to_pickle("../data/processed/backtest_cumulative.pkl")