In [33]:
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import pandas as pd

In [34]:
df = pd.read_csv("../data/engineered_interleaved_features_multi_stock_data.csv",index_col="Date")
df.drop("Unnamed: 0",axis="columns",inplace=True)
df.head()

Unnamed: 0_level_0,Close,Volume,Target,RAV,volatility,Buy_Sell_Strength,Weighted_Strength,Trend,Returns,Log_returns,...,volume_rank,return_rank,lag_market_return,market_std,zscore_vs_market,return_1,return_2,vol_3,close_z,market_mean_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-06,26.495502,174826400,1,250822400.0,1.046603,0.266668,-0.162635,-0.007055,-0.008421,8.064542,...,0.75,0.5,,0.006663,-0.13142,,,,,-0.007546
2015-02-06,36.218147,34616600,0,50972100.0,2.133879,0.406248,-0.06367,0.022419,-0.000943,10.087206,...,0.25,1.0,-0.007546,0.006663,0.991042,-0.008421,,,,-0.007546
2015-02-06,0.489517,210524000,0,215585700.0,0.010619,0.225,-0.268543,0.023777,-0.004392,-0.689525,...,1.0,0.75,-0.007546,0.006663,0.473265,-0.000943,-0.008421,1.0637,,-0.007546
2015-02-06,14.490667,48658500,1,54862930.0,0.608197,0.124639,-0.332912,0.016009,-0.016426,5.38673,...,0.5,0.25,-0.007546,0.006663,-1.332886,-0.004392,-0.000943,0.917565,,-0.007546
2015-02-09,26.671505,155559200,1,224795800.0,1.096023,0.914897,0.28711,0.010177,0.006643,8.139063,...,0.75,1.0,,0.003672,1.466347,-0.016426,-0.004392,0.571613,,0.001259


In [35]:
from sklearn.preprocessing import StandardScaler

X = df.drop("Target", axis="columns")
X_std = X.copy()
scaler = StandardScaler()
num_features = ['Close', 'Volume', 'RAV', 'volatility', 'Buy_Sell_Strength', 'Weighted_Strength', 'Trend',  'Returns', 'Log_returns','mean_return_others','divergence', 'volume_rank', 'return_rank','market_std', 'zscore_vs_market']
tickers = ['AAPL', 'MSFT', 'NVDA', 'TSLA']
X_std[num_features] = X_std[num_features].astype(float)
for ticker in tickers:
    mask = X[ticker] == 1
    X_std.loc[mask, num_features] = scaler.fit_transform(X.loc[mask, num_features])


y = df.Target

In [36]:
from sklearn.metrics import classification_report
def data_split(X,y,model):
    tscv = TimeSeriesSplit(n_splits=5)
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
    
    print(classification_report(y_test,preds))
    
    print(f'Train: {model.score(X_train,y_train)}')
    print(f'Test: {model.score(X_test,y_test)}')
    
    scores = cross_val_score(model, X, y, cv=tscv, scoring='accuracy')
    print("Scores:", scores)
    print("Mean:", scores.mean())

    return X_test, y_test

In [37]:
xgb = XGBClassifier(
    n_estimators=100,     
    learning_rate=0.05,    
    max_depth=2,         
    subsample=1.0,        
    colsample_bytree=0.8,
    gamma=0,              
    reg_alpha=1.4,          
    reg_lambda=0.5,         
    random_state=42
)
data_split(X,y,xgb)

              precision    recall  f1-score   support

           0       0.48      0.14      0.21       831
           1       0.54      0.87      0.67       963

    accuracy                           0.53      1794
   macro avg       0.51      0.50      0.44      1794
weighted avg       0.51      0.53      0.46      1794

Train: 0.563768115942029
Test: 0.5317725752508361
Scores: [0.52842809 0.53177258 0.52619844 0.5206243  0.53177258]
Mean: 0.5277591973244147


(                 Close     Volume           RAV  volatility  \
 Date                                                          
 2024-01-04   47.973946  306535000  3.074876e+08    1.179219   
 2024-01-04  237.929993  102629300  1.051806e+08    7.480690   
 2024-01-05  179.658951   62379700  5.715374e+07    5.113708   
 2024-01-05  363.025604   21004600  1.974449e+07    2.724859   
 2024-01-05   49.072388  415039000  3.318931e+08    0.986367   
 ...                ...        ...           ...         ...   
 2025-10-16  428.750000   77189900  7.908717e+07   10.409824   
 2025-10-17  252.289993   48839918  4.235036e+07    3.932802   
 2025-10-17  513.580017   19205931  1.727230e+07    5.672357   
 2025-10-17  183.220001  170079787  1.964499e+08    4.430583   
 2025-10-17  439.309998   87848438  8.146665e+07   10.200710   
 
             Buy_Sell_Strength  Weighted_Strength     Trend   Returns  \
 Date                                                                   
 2024-01-04         

In [38]:
xgb_std = XGBClassifier(
    n_estimators=100,     
    learning_rate=0.05,    
    max_depth=2,         
    subsample=1.0,        
    colsample_bytree=0.8,
    gamma=0,              
    reg_alpha=1.4,          
    reg_lambda=0.5,         
    random_state=42
)
data_split(X_std,y,xgb_std)

              precision    recall  f1-score   support

           0       0.48      0.44      0.46       831
           1       0.55      0.58      0.56       963

    accuracy                           0.52      1794
   macro avg       0.51      0.51      0.51      1794
weighted avg       0.52      0.52      0.52      1794

Train: 0.5693422519509476
Test: 0.5178372352285395
Scores: [0.50557414 0.53232999 0.53065775 0.51114827 0.51783724]
Mean: 0.5195094760312151


(               Close    Volume       RAV  volatility  Buy_Sell_Strength  \
 Date                                                                      
 2024-01-04  0.388764 -0.575734 -0.736575   -0.137591          -0.147154   
 2024-01-04  0.841947 -0.129494 -0.113947   -0.122589          -1.555338   
 2024-01-05  1.059671 -0.737088 -0.974697    0.767476          -0.460527   
 2024-01-05  1.155130 -0.561301 -0.883081   -0.538584          -1.043880   
 2024-01-05  0.412784 -0.147318 -0.612487   -0.219023           0.318787   
 ...              ...       ...       ...         ...                ...   
 2025-10-16  2.369591 -0.482123 -0.553527    0.169567          -0.344939   
 2025-10-17  2.077000 -0.940870 -1.239838    0.319771           0.935279   
 2025-10-17  2.238852 -0.701556 -1.140122    0.157414           0.736392   
 2025-10-17  3.346303 -1.114512 -1.301137    1.235299           0.831905   
 2025-10-17  2.454131 -0.334380 -0.513442    0.148710           1.175178   
 
          

In [39]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=200, class_weight='balanced', max_depth=3, random_state=42, min_samples_split=8, bootstrap=False)

data_split(X,y,rf_classifier)

              precision    recall  f1-score   support

           0       0.47      0.56      0.51       831
           1       0.54      0.45      0.49       963

    accuracy                           0.50      1794
   macro avg       0.51      0.51      0.50      1794
weighted avg       0.51      0.50      0.50      1794

Train: 0.5575250836120401
Test: 0.5016722408026756
Scores: [0.52006689 0.50724638 0.51950948 0.48885173 0.50167224]
Mean: 0.507469342251951


(                 Close     Volume           RAV  volatility  \
 Date                                                          
 2024-01-04   47.973946  306535000  3.074876e+08    1.179219   
 2024-01-04  237.929993  102629300  1.051806e+08    7.480690   
 2024-01-05  179.658951   62379700  5.715374e+07    5.113708   
 2024-01-05  363.025604   21004600  1.974449e+07    2.724859   
 2024-01-05   49.072388  415039000  3.318931e+08    0.986367   
 ...                ...        ...           ...         ...   
 2025-10-16  428.750000   77189900  7.908717e+07   10.409824   
 2025-10-17  252.289993   48839918  4.235036e+07    3.932802   
 2025-10-17  513.580017   19205931  1.727230e+07    5.672357   
 2025-10-17  183.220001  170079787  1.964499e+08    4.430583   
 2025-10-17  439.309998   87848438  8.146665e+07   10.200710   
 
             Buy_Sell_Strength  Weighted_Strength     Trend   Returns  \
 Date                                                                   
 2024-01-04         

In [40]:
rf_std_classifier = RandomForestClassifier(n_estimators=200, class_weight='balanced', max_depth=3, random_state=42, min_samples_split=3, bootstrap=False)

data_split(X_std,y,rf_std_classifier)

              precision    recall  f1-score   support

           0       0.47      0.74      0.57       831
           1       0.54      0.27      0.36       963

    accuracy                           0.49      1794
   macro avg       0.50      0.50      0.46      1794
weighted avg       0.51      0.49      0.46      1794

Train: 0.558974358974359
Test: 0.4855072463768116
Scores: [0.48829431 0.51282051 0.52787068 0.49219621 0.48550725]
Mean: 0.5013377926421405


(               Close    Volume       RAV  volatility  Buy_Sell_Strength  \
 Date                                                                      
 2024-01-04  0.388764 -0.575734 -0.736575   -0.137591          -0.147154   
 2024-01-04  0.841947 -0.129494 -0.113947   -0.122589          -1.555338   
 2024-01-05  1.059671 -0.737088 -0.974697    0.767476          -0.460527   
 2024-01-05  1.155130 -0.561301 -0.883081   -0.538584          -1.043880   
 2024-01-05  0.412784 -0.147318 -0.612487   -0.219023           0.318787   
 ...              ...       ...       ...         ...                ...   
 2025-10-16  2.369591 -0.482123 -0.553527    0.169567          -0.344939   
 2025-10-17  2.077000 -0.940870 -1.239838    0.319771           0.935279   
 2025-10-17  2.238852 -0.701556 -1.140122    0.157414           0.736392   
 2025-10-17  3.346303 -1.114512 -1.301137    1.235299           0.831905   
 2025-10-17  2.454131 -0.334380 -0.513442    0.148710           1.175178   
 
          

In [41]:
feature_df = pd.DataFrame({
    'feature': X.columns,
    'importance_before': rf_classifier.feature_importances_,
    'importance_after': rf_std_classifier.feature_importances_,
}).sort_values('importance_before', ascending=False)
feature_df

Unnamed: 0,feature,importance_before,importance_after
5,Weighted_Strength,0.116942,0.101905
15,mean_return_others,0.108557,0.095117
26,market_mean_return,0.108505,0.080497
13,market_return,0.102726,0.089236
2,RAV,0.095791,0.047943
4,Buy_Sell_Strength,0.066529,0.050204
19,lag_market_return,0.058597,0.053793
20,market_std,0.050048,0.058445
25,close_z,0.0437,0.026503
21,zscore_vs_market,0.036914,0.019167


In [42]:
feature_df['feature'].unique()

array(['Weighted_Strength', 'mean_return_others', 'market_mean_return',
       'market_return', 'RAV', 'Buy_Sell_Strength', 'lag_market_return',
       'market_std', 'close_z', 'zscore_vs_market', 'Close', 'vol_3',
       'rel_return', 'Returns', 'volatility', 'Log_returns', 'return_2',
       'return_1', 'Volume', 'divergence', 'Trend', 'return_rank', 'TSLA',
       'NVDA', 'volume_rank', 'MSFT', 'AAPL'], dtype=object)

In [43]:
X_2_features = ['market_return', 'Weighted_Strength', 'mean_return_others','TSLA', 'AAPL', 'NVDA', 'MSFT']
X_2_std_features = ['market_return', 'Weighted_Strength', 'mean_return_others','RAV','TSLA', 'AAPL', 'NVDA', 'MSFT']
X_2 = X[X_2_features]
X_2_std = X_std[X_2_std_features]
X_2.head()

Unnamed: 0_level_0,market_return,Weighted_Strength,mean_return_others,TSLA,AAPL,NVDA,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-02-06,-0.007546,-0.162635,-0.007546,False,True,False,False
2015-02-06,-0.007546,-0.06367,-0.007546,False,False,False,True
2015-02-06,-0.007546,-0.268543,-0.007546,False,False,True,False
2015-02-06,-0.007546,-0.332912,-0.007546,True,False,False,False
2015-02-09,0.001259,0.28711,0.001259,False,True,False,False


In [44]:
X_2_std.head()

Unnamed: 0_level_0,market_return,Weighted_Strength,mean_return_others,RAV,TSLA,AAPL,NVDA,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-02-06,-0.007546,-0.57152,-0.455213,2.494078,False,True,False,False
2015-02-06,-0.007546,-0.304136,-0.455213,2.363753,False,False,False,True
2015-02-06,-0.007546,-0.919952,-0.455213,-1.203843,False,False,True,False
2015-02-06,-0.007546,-1.055959,-0.455213,-0.961618,True,False,False,False
2015-02-09,0.001259,0.788259,-0.02093,2.027918,False,True,False,False


In [None]:
rf_classifier_2 = RandomForestClassifier(n_estimators=200, class_weight='balanced', max_depth=3, random_state=42, min_samples_split=8, min_samples_leaf=4, bootstrap=False)
data_split(X_2,y,rf_classifier_2)
data_split(X_2_std,y,rf_classifier_2)

              precision    recall  f1-score   support

           0       0.48      0.50      0.49       831
           1       0.55      0.53      0.54       963

    accuracy                           0.52      1794
   macro avg       0.52      0.52      0.52      1794
weighted avg       0.52      0.52      0.52      1794

Train: 0.5363433667781494
Test: 0.5172798216276477
Scores: [0.50111483 0.49052397 0.50055741 0.50724638 0.51727982]
Mean: 0.5033444816053512


In [None]:
xgb_2 = XGBClassifier(
    n_estimators=200,     
    learning_rate=0.08,    
    max_depth=8,
    min_child_weight=30,
    subsample=0.1,        
    colsample_bytree=0.4,
    gamma=0,              
    reg_alpha=2.4,          
    reg_lambda=5.4,         
    random_state=42
)
data_split(X_2,y,xgb)
data_split(X_2,y,xgb_2)
# data_split(X_2_std,y,xgb)
X_test, y_test = data_split(X_2_std,y,xgb_2)

In [None]:
# X_new = pd.read_csv("../data/engineered_interleaved_features_multi_stock_data.csv")
# X_new.Date = pd.to_datetime(X_new["Date"], errors="coerce")
# X_new['dayofweek'] = X_new['Date'].dt.dayofweek
# X_new['month'] = X_new['Date'].dt.month
# X_new['quarter'] = X_new['Date'].dt.quarter
# X_new['is_month_start'] = X_new['Date'].dt.is_month_start.astype(int)
# X_new['is_month_end'] = X_new['Date'].dt.is_month_end.astype(int)
# X_new.drop(['Target','Date','Unnamed: 0'],axis="columns",inplace=True)

In [None]:
# data_split(X_new,y,xgb)

In [None]:
# data_split(X_new,y,rf_classifier_2)

In [None]:
import joblib
rf_file_path = "../models/rf_interleaved_stock_model.joblib"
joblib.dump(rf_classifier_2, rf_file_path)