In [5]:
#import xgboost
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import yfinance as yf
import sys
from datetime import datetime
sys.path.append('../data_func')

from data_helper_functions import create_study_periods
import xgboost as xgb

In [6]:
# Load the panel from CSV and prepare it in one chain:
#   * drop the 'Unnamed: 0' column (presumably a row index saved by an earlier
#     export -- confirm against whatever wrote the file),
#   * parse 'date' into datetimes,
#   * discard rows that have no 'RET' value.
df = (
    pd.read_csv('../data/crsp_ff_adjusted.csv')
    .drop(columns=['Unnamed: 0'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .dropna(subset=['RET'])
)

In [7]:
# Settings handed to create_study_periods; collected in one place so they are
# easy to read and tweak. The meaning of each option is defined by
# data_helper_functions.create_study_periods.
study_period_settings = dict(
    n_periods=23,
    window_size=240,
    trade_size=250,
    train_size=750,
    forward_roll=250,
    start_date=datetime(1990, 1, 1),
    end_date=datetime(2015, 12, 31),
    target_type='cross_sectional_median',
)

# The loop further down unpacks each element of the result as a
# (train_df, test_df) pair.
study_periods = create_study_periods(df, **study_period_settings)

 89%|████████▉ | 34/38 [00:10<00:01,  3.38it/s]


Reached the end of the dataset.


In [9]:
# Train one XGBoost classifier per study period, score the training (in-sample)
# and test (out-of-sample) frames, and for every date keep the k rows with the
# highest predicted probability (long side) and the k rows with the lowest
# predicted probability (short side).
#
# Results are exposed as four DataFrames:
#   in_sample_long_portfolios,  in_sample_short_portfolios,
#   out_of_sample_long_portfolios, out_of_sample_short_portfolios

k = 10  # number of tickers kept on each side (long and short) per date
portfolio_columns = ['date', 'TICKER', 'predicted_prob']


def _top_bottom_by_date(frame, k, columns):
    """Return the per-date top-k and bottom-k rows of ``frame``.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain a 'date' column, a 'predicted_prob' column and every
        column named in ``columns``.
    k : int
        Number of rows to take from each end of the per-date ranking.
    columns : list of str
        Columns to keep in the returned slices.

    Returns
    -------
    (list of pd.DataFrame, list of pd.DataFrame)
        ``(long_parts, short_parts)`` with one slice per date, in order of the
        first appearance of each date in ``frame``.
    """
    long_parts, short_parts = [], []
    # A single groupby pass replaces one full boolean scan per date.
    # sort=False keeps dates in first-appearance order, matching the order
    # produced by iterating over frame['date'].unique().
    for _, date_data in frame.groupby('date', sort=False):
        ranked = date_data.sort_values(by='predicted_prob', ascending=False)
        long_parts.append(ranked.head(k)[columns])
        short_parts.append(ranked.tail(k)[columns])
    return long_parts, short_parts


def _concat_or_empty(parts):
    """Concatenate ``parts``; return an empty DataFrame when there are none.

    pd.concat raises ValueError on an empty list, so the empty case is handled
    explicitly (e.g. when there are no study periods at all).
    """
    return pd.concat(parts) if parts else pd.DataFrame()


# Per-date slices are collected in lists and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated frame every time.
in_sample_long_parts, in_sample_short_parts = [], []
out_of_sample_long_parts, out_of_sample_short_parts = [], []

for train_df, test_df in tqdm(study_periods):
    X_train = train_df[['standardized_return']]
    y_train = train_df['target'].astype(int)

    X_test = test_df[['standardized_return']]
    # Not used further in this cell; kept so the variable (and the integer
    # cast check on the test target) remains as before.
    y_test = test_df['target'].astype(int)

    # Class weights: ratio of negatives to positives, scaled up by 5.
    # minlength=2 lets us report a clear error instead of an opaque tuple
    # unpacking failure when the labels are not exactly {0, 1} or when there
    # are no positive samples.
    class_counts = np.bincount(y_train, minlength=2)
    if len(class_counts) != 2 or class_counts[1] == 0:
        raise ValueError(
            "Expected binary 0/1 targets with at least one positive sample; "
            f"got class counts {class_counts.tolist()}"
        )
    num_neg, num_pos = class_counts
    scale_pos_weight = (num_neg / num_pos) * 5  # Multiply by 5 as needed

    clf = xgb.XGBClassifier(
        n_estimators=1000,
        max_depth=20,
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        use_label_encoder=False,
        eval_metric='logloss',
    )

    # Train the classifier
    clf.fit(X_train, y_train)

    # Probability of class 1 for both the training and the test rows
    train_probs = clf.predict_proba(X_train)[:, 1]
    test_probs = clf.predict_proba(X_test)[:, 1]

    # Written back onto the study-period frames themselves, so the
    # 'predicted_prob' column stays available on them after this loop.
    train_df['predicted_prob'] = train_probs
    test_df['predicted_prob'] = test_probs

    # In-sample selection
    long_parts, short_parts = _top_bottom_by_date(train_df, k, portfolio_columns)
    in_sample_long_parts.extend(long_parts)
    in_sample_short_parts.extend(short_parts)

    # Out-of-sample selection
    long_parts, short_parts = _top_bottom_by_date(test_df, k, portfolio_columns)
    out_of_sample_long_parts.extend(long_parts)
    out_of_sample_short_parts.extend(short_parts)

in_sample_long_portfolios = _concat_or_empty(in_sample_long_parts)
out_of_sample_long_portfolios = _concat_or_empty(out_of_sample_long_parts)

in_sample_short_portfolios = _concat_or_empty(in_sample_short_parts)
out_of_sample_short_portfolios = _concat_or_empty(out_of_sample_short_parts)

  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [01:27<00:00,  2.58s/it]


In [10]:
# Export portfolios: each table is written to its own CSV file under
# ../data/xgb_results/ (the directory must already exist).
portfolio_exports = [
    (in_sample_long_portfolios, '../data/xgb_results/in_sample_long_portfolios.csv'),
    (out_of_sample_long_portfolios, '../data/xgb_results/out_of_sample_long_portfolios.csv'),
    (in_sample_short_portfolios, '../data/xgb_results/in_sample_short_portfolios.csv'),
    (out_of_sample_short_portfolios, '../data/xgb_results/out_of_sample_short_portfolios.csv'),
]

for portfolio_table, csv_path in portfolio_exports:
    portfolio_table.to_csv(csv_path)