In [1]:
#import xgboost
import sys
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
import yfinance as yf
from tqdm import tqdm

# The local helper module lives outside the notebook directory, so extend the
# import path before importing from it.
sys.path.append('../data_func')

from data_helper_functions import create_study_periods

In [2]:
# Read the adjusted return panel, discard the `Unnamed: 0` column (presumably
# a saved row index), parse `date` into datetimes, and keep only rows that
# have a return.
df = (
    pd.read_csv('../data/crsp_ff_adjusted.csv')
    .drop(columns=['Unnamed: 0'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .dropna(subset=['RET'])
)

In [3]:
# Build the sequence of (train, test) study periods consumed by the cells below.
study_periods = create_study_periods(
    df,
    n_periods=23,
    window_size=240,
    trade_size=250,
    train_size=750,
    forward_roll=250,
    start_date=datetime(1990, 1, 1),
    end_date=datetime(2015, 12, 31),
    target_type='buckets',
)

  0%|          | 0/38 [00:00<?, ?it/s]

 89%|████████▉ | 34/38 [00:48<00:05,  1.42s/it]

Reached the end of the dataset.





In [4]:
# Preview the first frame of the first study period; later cells unpack each
# period as (train_df, test_df), so this is the first training frame.
study_periods[0][0]

Unnamed: 0,date,TICKER,RET,rolling_mean,standardized_return,target
0,1990-02-01,SUNW,0.012903,0.012903,5.171781,9.0
1,1990-02-01,MYG,0.014085,0.014085,5.674265,9.0
2,1990-02-01,INTC,-0.012658,-0.012658,-5.694534,0.0
3,1990-02-01,CB,0.005634,0.005634,2.081634,9.0
4,1990-02-01,BUD,-0.026490,-0.026490,-11.574699,0.0
...,...,...,...,...,...,...
248617,1992-01-20,AEP,-0.011628,0.000949,0.089889,9.0
248618,1992-01-20,FSI,-0.025575,0.000171,-0.240946,4.0
248619,1992-01-20,BF,0.004484,0.000464,-0.116027,8.0
248620,1992-01-20,MBI,-0.018041,0.001900,0.494370,7.0


In [6]:
#EDIT CODE TO DO REGRESSION or CLASSIFICATION
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score

def train_and_evaluate_rf(study_periods):
    """Fit an XGBoost classifier per study period and average its accuracy.

    Despite the ``_rf`` suffix (kept so existing calls keep working), the model
    is ``xgb.XGBClassifier``, not a random forest.

    Args:
        study_periods: iterable of ``(train_df, test_df)`` DataFrame pairs.
            Each frame must have a ``standardized_return`` column (the single
            feature) and a ``target`` column (the class label).

    Returns:
        ``(avg_train_accuracy, avg_test_accuracy)``: the unweighted mean over
        all periods of the in-sample and out-of-sample accuracy.

    Raises:
        ValueError: if ``study_periods`` yields no periods (previously this
            surfaced as a bare ``ZeroDivisionError``).

    Note:
        Rows with a missing ``target`` are dropped from the passed frames
        *in place*, so the caller's frames are modified. This mutation is kept
        for backward compatibility with code that reuses the frames afterwards.
    """
    in_sample_accuracies = []
    out_of_sample_accuracies = []

    for train_df, test_df in tqdm(study_periods):
        # Side effect on the caller's frames (see Note in the docstring).
        train_df.dropna(subset=['target'], inplace=True)
        test_df.dropna(subset=['target'], inplace=True)

        # Select features/labels only after the NaN rows are gone.
        X_train, y_train = train_df[['standardized_return']], train_df['target']
        X_test, y_test = test_df[['standardized_return']], test_df['target']

        # num_class=10 presumably matches ten target buckets (0-9); this
        # assumes every training period contains all ten labels -- TODO confirm.
        clf = xgb.XGBClassifier(n_estimators=100, max_depth=20, num_class=10)
        clf.fit(X_train, y_train)

        in_sample_accuracies.append(accuracy_score(y_train, clf.predict(X_train)))
        out_of_sample_accuracies.append(accuracy_score(y_test, clf.predict(X_test)))

    # Checked after the loop so that one-shot iterables (generators) are
    # handled as well as empty lists.
    if not in_sample_accuracies:
        raise ValueError("study_periods is empty: no accuracies to average")

    avg_train_accuracy = sum(in_sample_accuracies) / len(in_sample_accuracies)
    avg_test_accuracy = sum(out_of_sample_accuracies) / len(out_of_sample_accuracies)

    return avg_train_accuracy, avg_test_accuracy

# Run the per-period evaluation over every (train, test) pair in study_periods
# and print the mean in-sample and out-of-sample accuracy.
avg_train_accuracy, avg_test_accuracy = train_and_evaluate_rf(study_periods)
print("Average Train Accuracy:", avg_train_accuracy)
print("Average Test Accuracy:", avg_test_accuracy)


  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [07:53<00:00, 13.93s/it]

Average Train Accuracy: 0.2646486534862535
Average Test Accuracy: 0.19489763615185296





In [13]:
# Build daily long/short portfolios, in-sample and out-of-sample.
#
# For every (train, test) study period an XGBoost regressor is fitted on the
# training frame (`standardized_return` -> `target`), both frames are scored,
# and on each date the k highest-scored tickers go long and the k
# lowest-scored tickers go short.
#
# Fixes relative to the previous version of this cell:
#   * Predictions now come from the `reg` fitted inside this loop. The old code
#     called `clf.predict_proba`, but no `clf` exists at notebook level, so it
#     raised NameError on a fresh kernel or silently used a stale model.
#   * The two-way unpack of `np.bincount(y_train)` (valid only for a binary
#     target) and the classifier-only arguments `scale_pos_weight`,
#     `use_label_encoder` and `eval_metric='logloss'` are removed; they have no
#     meaning for a regressor fitted on the bucket labels.
#   * Rows without a target are dropped here rather than relying on an earlier
#     cell having done so.
#   * Per-date selections are collected in lists and concatenated once instead
#     of re-copying the growing result for every date.

k = 10  # number of tickers on each side of the portfolio, per date

# Column name `predicted_prob` is kept so the exported CSV schema does not
# change; with the regressor it holds a predicted score, not a probability.
PORTFOLIO_COLUMNS = ['date', 'TICKER', 'predicted_prob']


def _select_long_short(scored_df, k):
    """Pick the k highest- and k lowest-scored rows for every date.

    Returns two lists of DataFrames (long picks, short picks), one entry per
    date, each restricted to PORTFOLIO_COLUMNS. If a date has fewer than 2*k
    rows, the two selections overlap.
    """
    long_frames, short_frames = [], []
    # sort=False keeps the dates in order of first appearance.
    for _, date_data in scored_df.groupby('date', sort=False):
        sorted_data = date_data.sort_values(by='predicted_prob', ascending=False)
        long_frames.append(sorted_data.head(k)[PORTFOLIO_COLUMNS])
        short_frames.append(sorted_data.tail(k)[PORTFOLIO_COLUMNS])
    return long_frames, short_frames


def _combine(frames):
    """Concatenate the collected frames once; return an empty table if there are none."""
    if not frames:
        return pd.DataFrame(columns=PORTFOLIO_COLUMNS)
    return pd.concat(frames)


in_sample_long_frames, in_sample_short_frames = [], []
out_of_sample_long_frames, out_of_sample_short_frames = [], []

for train_df, test_df in tqdm(study_periods):
    # Same in-place filtering the accuracy-evaluation cell applies; a no-op if
    # that cell has already run on these frames.
    train_df.dropna(subset=['target'], inplace=True)
    test_df.dropna(subset=['target'], inplace=True)

    X_train = train_df[['standardized_return']]
    y_train = train_df['target']
    X_test = test_df[['standardized_return']]

    reg = xgb.XGBRegressor(n_estimators=1000, max_depth=20, random_state=42)
    reg.fit(X_train, y_train)

    # Store the scores on the period frames, as the previous version did.
    train_df['predicted_prob'] = reg.predict(X_train)
    test_df['predicted_prob'] = reg.predict(X_test)

    long_frames, short_frames = _select_long_short(train_df, k)
    in_sample_long_frames.extend(long_frames)
    in_sample_short_frames.extend(short_frames)

    long_frames, short_frames = _select_long_short(test_df, k)
    out_of_sample_long_frames.extend(long_frames)
    out_of_sample_short_frames.extend(short_frames)

# Separate tables for in-sample and out-of-sample portfolios.
in_sample_long_portfolios = _combine(in_sample_long_frames)
out_of_sample_long_portfolios = _combine(out_of_sample_long_frames)

in_sample_short_portfolios = _combine(in_sample_short_frames)
out_of_sample_short_portfolios = _combine(out_of_sample_short_frames)

100%|██████████| 34/34 [02:05<00:00,  3.70s/it]


In [14]:
# Export the four portfolio tables as CSV files.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails here, after the
# long training loop has already run.
results_dir = Path('../data/xgb_results')
results_dir.mkdir(parents=True, exist_ok=True)

in_sample_long_portfolios.to_csv(results_dir / 'in_sample_long_portfolios.csv')
out_of_sample_long_portfolios.to_csv(results_dir / 'out_of_sample_long_portfolios.csv')
in_sample_short_portfolios.to_csv(results_dir / 'in_sample_short_portfolios.csv')
out_of_sample_short_portfolios.to_csv(results_dir / 'out_of_sample_short_portfolios.csv')