In [4]:
# 03_ML_Ranking.ipynb
# ML Ranking Model

import sys, os
sys.path.append(os.path.abspath(".."))  # so we can import from src
import pandas as pd
from src.models import train_multioutput_rf, predict_rank_probabilities

# --- Load preprocessed data ---
features_scaled = pd.read_pickle("../data/processed/features_scaled.pkl")
target_aligned = pd.read_pickle("../data/processed/target.pkl")

# --- Convert categorical target to integer (if not already) ---
target_int = target_aligned.apply(lambda col: col.astype(int))

# --- Train-test split (time-based) ---
split_date = "2023-01-01"  # just a choice here (reasonable for the 2015-2025 window)
X_train = features_scaled[features_scaled.index < split_date]
X_test  = features_scaled[features_scaled.index >= split_date]

y_train = target_int[target_int.index < split_date]
y_test  = target_int[target_int.index >= split_date]

# --- Verify shapes ---
print("X_train.shape:", X_train.shape, "y_train.shape:", y_train.shape)
print("X_test.shape:", X_test.shape, "y_test.shape:", y_test.shape)

# --- Train multi-output RandomForest model ---
multi_clf = train_multioutput_rf(X_train, y_train, n_estimators=200, max_depth=5)

# --- use the trained model to predict top-quantile probabilities ---
pred_ranks_numeric = predict_rank_probabilities(multi_clf, X_test)

# --- Assign proper ticker column names ---
pred_ranks = pred_ranks_numeric.copy()
pred_ranks.columns = target_int.columns  # replace 0,1,2,... with actual tickers

print("Predicted rank probabilities shape:", pred_ranks.shape) # predicted probability that each stock will be in the top-performing quantile
pred_ranks.head()


X_train.shape: (89, 25) y_train.shape: (89, 5)
X_test.shape: (23, 25) y_test.shape: (24, 5)
Predicted rank probabilities shape: (23, 5)


Unnamed: 0_level_0,AAPL,MSFT,JPM,XOM,PG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-31,0.300172,0.233383,0.332454,0.477767,0.390224
2023-02-28,0.202669,0.186265,0.241581,0.281771,0.4214
2023-03-31,0.280127,0.291004,0.320349,0.265407,0.372184
2023-04-30,0.285628,0.3505,0.34852,0.340478,0.456608
2023-05-31,0.16178,0.304606,0.287076,0.324135,0.317805


In [5]:
# the model is trained. Warning: for the demo of project workflow, we don't dive into details of how to optimize the training itself here.
# Key point: Higher probability → model expects this stock to rank higher in the next month.
# In practice, when building a portfolio, you might pick the top N stocks with the highest predicted probabilities.
# These probabilities are not returns themselves, but signals for ranking and portfolio selection.
# Save predictions for use in backtest
os.makedirs("../data/processed", exist_ok=True)
pred_ranks.to_pickle("../data/processed/pred_ranks.pkl")