In [1]:
import sys
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf
from tqdm import tqdm

# The helper module lives outside the notebook directory, so its folder has to
# be on sys.path before the import below can succeed.
sys.path.append('../data_func')

from data_helper_functions import create_study_periods

In [2]:
# Load the input data set.
# - 'Unnamed: 0' is dropped (presumably an index column written by an earlier
#   to_csv call -- confirm against whatever produced the file).
# - 'date' is parsed to datetime so it can be compared with datetime bounds.
# - Rows without a return ('RET') are discarded.
df = pd.read_csv('../data/crsp_ff_adjusted.csv').drop(columns=['Unnamed: 0'])
df['date'] = pd.to_datetime(df['date'])
df = df.dropna(subset=['RET'])

In [None]:
# Choose which return column to model. Whatever is chosen is exposed to the
# rest of the notebook under the fixed name 'RET'.
returns = 'RET'

# Keep only the date, ticker and chosen return column. The rename is a no-op
# when returns == 'RET'; the non-inplace form always produces a new frame
# instead of mutating a freshly sliced column subset in place.
df = df[['date', 'TICKER', returns]].rename(columns={returns: 'RET'})

In [3]:
# create_study_periods also takes an optional `target_type` argument:
# 'cross_sectional_median' (the default), 'buckets' (10 buckets) or 'raw_returns'.
study_periods = create_study_periods(
    df,
    n_periods=23,
    window_size=240,
    trade_size=250,
    train_size=750,
    forward_roll=250,
    start_date=datetime(1990, 1, 1),
    end_date=datetime(2015, 12, 31),
)

  0%|          | 0/38 [00:00<?, ?it/s]

 89%|████████▉ | 34/38 [00:05<00:00,  5.82it/s]

Reached the end of the dataset.





## Cross Sectional Median Model

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
from tqdm import tqdm
import numpy as np

# Unweighted random-forest configuration. No class weights are computed in this
# cell; the portfolio-construction loop further down rebinds `clf` to a
# class-weighted forest for each study period before fitting.
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=20,
    random_state=42,
)

In [11]:
# Build daily long/short portfolios from a class-weighted random-forest
# classifier, separately for the training window (in-sample) and the trading
# window (out-of-sample) of every study period.
#
# Outputs (same names and contents as before):
#   in_sample_long_portfolios / in_sample_short_portfolios
#   out_of_sample_long_portfolios / out_of_sample_short_portfolios
# Each holds the columns 'date', 'TICKER' and 'predicted_prob'.

# Number of positions taken on each side (long and short) per date.
k = 10
# Extra multiplier applied on top of the balanced weight of class 1.
POSITIVE_CLASS_WEIGHT_BOOST = 5

portfolio_columns = ['date', 'TICKER', 'predicted_prob']

# Per-date selections are collected in plain lists and concatenated once at the
# end; concatenating onto a growing DataFrame inside the loop copies all
# previously collected rows on every iteration.
in_sample_long_parts = []
in_sample_short_parts = []
out_of_sample_long_parts = []
out_of_sample_short_parts = []

# study_periods is expected to yield (train_df, test_df) pairs.
for train_df, test_df in tqdm(study_periods):
    X_train = train_df[['standardized_return']]
    y_train = train_df['target'].astype(int)

    X_test = test_df[['standardized_return']]
    # Not consumed in this cell; kept so the name remains available afterwards.
    y_test = test_df['target'].astype(int)

    # Balanced class weights, with class 1 up-weighted further.
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
    class_weights_dict = {0: class_weights[0], 1: class_weights[1] * POSITIVE_CLASS_WEIGHT_BOOST}

    clf = RandomForestClassifier(n_estimators=1000, max_depth=20, random_state=42, class_weight=class_weights_dict)

    # Train the classifier
    clf.fit(X_train, y_train)

    # Probability of class 1 for every row of the train and test windows.
    train_probs = clf.predict_proba(X_train)[:, 1]
    test_probs = clf.predict_proba(X_test)[:, 1]

    # Written back onto the study-period frames themselves (as before), so the
    # frames held in study_periods gain a 'predicted_prob' column.
    train_df['predicted_prob'] = train_probs
    test_df['predicted_prob'] = test_probs

    for frame, long_parts, short_parts in (
        (train_df, in_sample_long_parts, in_sample_short_parts),
        (test_df, out_of_sample_long_parts, out_of_sample_short_parts),
    ):
        # sort=False keeps dates in order of first appearance, matching the
        # previous iteration over frame['date'].unique(), while avoiding a
        # full-frame boolean mask per date.
        for _, date_data in frame.groupby('date', sort=False):
            sorted_data = date_data.sort_values(by='predicted_prob', ascending=False)
            # Highest k probabilities go long, lowest k go short.
            long_parts.append(sorted_data.head(k)[portfolio_columns])
            short_parts.append(sorted_data.tail(k)[portfolio_columns])

# pd.concat raises on an empty list, so fall back to an empty frame when no
# study period produced any rows.
in_sample_long_portfolios = pd.concat(in_sample_long_parts) if in_sample_long_parts else pd.DataFrame()
in_sample_short_portfolios = pd.concat(in_sample_short_parts) if in_sample_short_parts else pd.DataFrame()
out_of_sample_long_portfolios = pd.concat(out_of_sample_long_parts) if out_of_sample_long_parts else pd.DataFrame()
out_of_sample_short_portfolios = pd.concat(out_of_sample_short_parts) if out_of_sample_short_parts else pd.DataFrame()


100%|██████████| 34/34 [1:13:25<00:00, 129.57s/it]


# Portfolios

In [1]:
# Write the four portfolio tables to CSV under ../data/rf_results.
# This cell needs the portfolio DataFrames from a portfolio-construction cell
# run earlier in the same kernel session; on a fresh kernel it fails with a
# NameError.
results_dir = Path('../data/rf_results')
# Create the output folder if it is missing so to_csv does not fail on a clean
# checkout.
results_dir.mkdir(parents=True, exist_ok=True)

portfolios_to_save = {
    'in_sample_long_portfolios': in_sample_long_portfolios,
    'in_sample_short_portfolios': in_sample_short_portfolios,
    'out_of_sample_long_portfolios': out_of_sample_long_portfolios,
    'out_of_sample_short_portfolios': out_of_sample_short_portfolios,
}
for portfolio_name, portfolio in portfolios_to_save.items():
    # File names are unchanged: <results_dir>/<variable name>.csv
    portfolio.to_csv(results_dir / f'{portfolio_name}.csv')


NameError: name 'in_sample_long_portfolios' is not defined

# Raw Returns

In [29]:
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Build daily long/short portfolios from a random-forest regressor that
# predicts 'RET' from 'standardized_return', separately for the training window
# (in-sample) and the trading window (out-of-sample) of every study period.
#
# Outputs (same names and contents as before):
#   in_sample_long_portfolios / in_sample_short_portfolios
#   out_of_sample_long_portfolios / out_of_sample_short_portfolios
# Each holds the columns 'date', 'TICKER' and 'predicted_return'.
#
# NOTE(review): confirm that 'RET' on a row is a forward-looking return relative
# to 'standardized_return'. If both describe the same day, the regressor is
# being fit on a transform of its own label.

# Number of positions to take for both long and short portfolios per date.
k = 10

portfolio_columns = ['date', 'TICKER', 'predicted_return']

# Per-date selections are collected in plain lists and concatenated once at the
# end; concatenating onto a growing DataFrame inside the loop copies all
# previously collected rows on every iteration.
in_sample_long_parts = []
in_sample_short_parts = []
out_of_sample_long_parts = []
out_of_sample_short_parts = []

# study_periods is expected to yield (train_df, test_df) pairs.
for train_df, test_df in tqdm(study_periods):
    X_train = train_df[['standardized_return']]
    y_train = train_df['RET']  # regress on the return column itself

    X_test = test_df[['standardized_return']]
    # Not consumed in this cell; kept so the name remains available afterwards.
    y_test = test_df['RET']

    regr = RandomForestRegressor(n_estimators=1000, max_depth=20, random_state=42)

    # Train the regressor
    regr.fit(X_train, y_train)

    # Predicted return for every row of the train and test windows.
    train_preds = regr.predict(X_train)
    test_preds = regr.predict(X_test)

    # Written back onto the study-period frames themselves (as before), so the
    # frames held in study_periods gain a 'predicted_return' column.
    train_df['predicted_return'] = train_preds
    test_df['predicted_return'] = test_preds

    for frame, long_parts, short_parts in (
        (train_df, in_sample_long_parts, in_sample_short_parts),
        (test_df, out_of_sample_long_parts, out_of_sample_short_parts),
    ):
        # sort=False keeps dates in order of first appearance, matching the
        # previous iteration over frame['date'].unique(), while avoiding a
        # full-frame boolean mask per date.
        for _, date_data in frame.groupby('date', sort=False):
            # Top k predicted returns go long, bottom k go short.
            long_parts.append(date_data.nlargest(k, 'predicted_return')[portfolio_columns])
            short_parts.append(date_data.nsmallest(k, 'predicted_return')[portfolio_columns])

# pd.concat raises on an empty list, so fall back to an empty frame when no
# study period produced any rows.
in_sample_long_portfolios = pd.concat(in_sample_long_parts) if in_sample_long_parts else pd.DataFrame()
in_sample_short_portfolios = pd.concat(in_sample_short_parts) if in_sample_short_parts else pd.DataFrame()
out_of_sample_long_portfolios = pd.concat(out_of_sample_long_parts) if out_of_sample_long_parts else pd.DataFrame()
out_of_sample_short_portfolios = pd.concat(out_of_sample_short_parts) if out_of_sample_short_parts else pd.DataFrame()


100%|██████████| 34/34 [5:27:06<00:00, 577.25s/it]   


In [None]:
# create_study_periods also takes an optional `target_type` argument:
# 'cross_sectional_median' (the default), 'buckets' (10 buckets) or 'raw_returns'.
# NOTE(review): this repeats the create_study_periods call made near the top of
# the notebook with identical arguments -- confirm whether it is still needed.
study_periods = create_study_periods(
    df,
    n_periods=23,
    window_size=240,
    trade_size=250,
    train_size=750,
    forward_roll=250,
    start_date=datetime(1990, 1, 1),
    end_date=datetime(2015, 12, 31),
)