In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import yfinance as yf
import sys
from datetime import datetime
sys.path.append('../data_func')

from data_helper_functions import create_study_periods

In [48]:
def load_data(path, data_type, start=None):
    """Read a CSV of per-ticker observations and keep only the requested columns.

    Parameters
    ----------
    path : str or path-like
        CSV file to read. It must contain a ``date`` column.
    data_type : str or list of str
        Name(s) of the data column(s) to keep, e.g. ``['RET']``. A single
        column may be given as a plain string.
    start : optional
        Value to report back as ``start_date``. It is returned unchanged and
        is NOT used to filter rows. When ``None``, the earliest date left in
        the frame is reported instead.

    Returns
    -------
    tuple
        ``(df, start_date, end_date)``. ``df`` keeps the requested data
        columns plus ``TICKER`` and ``date`` (in file order), without rows
        where any requested data column is missing. ``end_date`` is the
        latest date left in ``df``.

    Raises
    ------
    KeyError
        If ``date`` or one of the requested data columns is not in the file.
    """
    # Accept a bare column name as well as a list of names.
    data_cols = [data_type] if isinstance(data_type, str) else list(data_type)

    df = pd.read_csv(path)

    missing = [col for col in data_cols + ['date'] if col not in df.columns]
    if missing:
        raise KeyError(f"{path!r} is missing required column(s): {missing}")

    df['date'] = pd.to_datetime(df['date'])

    # Boolean column mask keeps the original column order of the file.
    columns_to_keep = data_cols + ['TICKER', 'date']
    df = df.loc[:, df.columns.isin(columns_to_keep)]

    df = df.dropna(subset=data_cols)

    start_date = start if start is not None else df['date'].min()
    end_date = df['date'].max()

    return df, start_date, end_date

In [49]:
 # Data column(s) to load from the CSV; passed to load_data as `data_type`.
 features=['RET']

In [50]:
# Source CSV; alternative inputs are kept below for reference.
path = '../data/stock_data_with_factors.csv'
# path='data/spy_universe.csv'
# path='data/corrected_crsp_ff_adjusted.csv'

# Label scheme forwarded to create_study_periods as `target_type`.
target = 'buckets'

# `start` is only reported back as `start_date`; load_data does not filter rows on it.
start = datetime(2010, 1, 1)
df, start_date, end_date = load_data(path, features, start)

# Restrict the universe to five tickers.
selected_tickers = ['AAPL', 'MSFT', 'AMZN', 'GOOG', 'IBM']
df = df[df['TICKER'].isin(selected_tickers)]

In [51]:
# Optional: truncate the sample so it starts on 2012-01-01.
# NOTE: `start_date` (set when the data was loaded) is not updated by this step.
truncate_from = datetime(2012, 1, 1)
df = df.loc[df['date'] >= truncate_from]

In [52]:
# Display the filtered, truncated frame.
df

Unnamed: 0,date,TICKER,RET
2769569,2012-01-03,IBM,0.013161
2769635,2012-01-03,AAPL,0.015383
2769655,2012-01-03,AMZN,0.034258
2769691,2012-01-03,GOOG,0.030206
2769932,2012-01-03,MSFT,0.031009
...,...,...,...
3274734,2015-12-31,IBM,-0.012344
3274841,2015-12-31,MSFT,-0.014740
3274953,2015-12-31,GOOG,-0.015720
3274976,2015-12-31,AAPL,-0.019195


In [53]:
# Build the train/test study periods from the filtered frame.
# Optional parameter target_type: 'cross_sectional_median' (default), 'buckets' (10 buckets), 'raw_returns'.
study_period_kwargs = dict(
    window_size=240,
    trade_size=250,
    train_size=750,
    forward_roll=250,
    start_date=start_date,
    end_date=end_date,
    target_type=target,
    data_type='RET',
    apply_wavelet_transform=False,
)
study_periods = create_study_periods(df, **study_period_kwargs)

 56%|█████▌    | 5/9 [00:00<00:00, 23.76it/s]

Reached the end of the dataset.





In [54]:
# First frame of the first study period (the one unpacked as `train_df` in the loops below).
study_periods[0][0]

Unnamed: 0,date,TICKER,standardized_data,target
2769569,2012-01-03,IBM,0.500865,9.0
2769635,2012-01-03,AAPL,0.613043,9.0
2769655,2012-01-03,AMZN,1.565951,9.0
2769691,2012-01-03,GOOG,1.361385,9.0
2769932,2012-01-03,MSFT,1.401925,9.0
...,...,...,...,...
2775645,2012-01-20,AMZN,-1.077453,0.0
2775656,2012-01-20,MSFT,2.691013,9.0
2775858,2012-01-20,GOOG,-4.392967,0.0
2775969,2012-01-20,IBM,2.073731,9.0


In [55]:
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
from tqdm import tqdm
import numpy as np

# Calculate class weights

# clf = RandomForestClassifier(n_estimators=1000, max_depth=20, random_state=42)


# reg = RandomForestRegressor(n_estimators=1000, max_depth=20, random_state=42)


In [56]:
#EDIT CODE TO DO REGRESSION or CLASSIFICATION
from tqdm import tqdm
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def train_and_evaluate_rf(study_periods, feature_cols=('standardized_data',),
                          target_col='target', n_estimators=100, max_depth=20,
                          random_state=42):
    """Fit one random-forest classifier per study period and average the accuracies.

    Parameters
    ----------
    study_periods : iterable of (train_df, test_df)
        Each frame must contain the feature column(s) and the target column.
    feature_cols : str or sequence of str
        Column(s) used as model input. Defaults to ``'standardized_data'``.
    target_col : str
        Column holding the class label. Defaults to ``'target'``.
    n_estimators, max_depth, random_state
        Forwarded to ``RandomForestClassifier``.

    Returns
    -------
    tuple of float
        ``(avg_train_accuracy, avg_test_accuracy)``: the unweighted mean of the
        per-period in-sample and out-of-sample accuracies.

    Raises
    ------
    ValueError
        If ``study_periods`` contains no periods (there is nothing to average).
    """
    # Materialise once so emptiness can be checked before any training starts.
    periods = list(study_periods)
    if not periods:
        raise ValueError("study_periods is empty; nothing to train or evaluate")

    features = [feature_cols] if isinstance(feature_cols, str) else list(feature_cols)

    in_sample_accuracies = []
    out_of_sample_accuracies = []

    for train_df, test_df in tqdm(periods):
        X_train, y_train = train_df[features], train_df[target_col]
        X_test, y_test = test_df[features], test_df[target_col]

        # A fresh model per period so nothing carries over between periods.
        clf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state,
        )
        clf.fit(X_train, y_train)

        in_sample_accuracies.append(accuracy_score(y_train, clf.predict(X_train)))
        out_of_sample_accuracies.append(accuracy_score(y_test, clf.predict(X_test)))

    avg_train_accuracy = sum(in_sample_accuracies) / len(in_sample_accuracies)
    avg_test_accuracy = sum(out_of_sample_accuracies) / len(out_of_sample_accuracies)

    return avg_train_accuracy, avg_test_accuracy

# Evaluate over all study periods and report the mean accuracy for each side of the split.
avg_train_accuracy, avg_test_accuracy = train_and_evaluate_rf(study_periods)
for label, value in (
    ("Average Train Accuracy:", avg_train_accuracy),
    ("Average Test Accuracy:", avg_test_accuracy),
):
    print(label, value)


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:01<00:00,  4.02it/s]

Average Train Accuracy: 0.9936042292187579
Average Test Accuracy: 0.5347668492891671





In [None]:
import pandas as pd

def validate_study_periods(study_periods, outlier_columns=None, outlier_threshold=100):
    """Run basic sanity checks on every (train_df, test_df) pair.

    Parameters
    ----------
    study_periods : iterable of (train_df, test_df)
        Each frame must contain a ``date`` column.
    outlier_columns : sequence of str, optional
        Columns scanned for extreme values. Columns that are absent from a
        frame are skipped for that frame rather than raising ``KeyError``.
        Defaults to ``RET``, ``rolling_mean``, ``standardized_return`` and
        ``standardized_data``.
    outlier_threshold : number
        A column is flagged when its largest absolute value exceeds this.

    Returns
    -------
    list of dict
        One ``{'Period': <1-based index>, 'Issues': [<message>, ...]}`` entry
        per study period; ``Issues`` is empty when every check passes.
    """
    if outlier_columns is None:
        outlier_columns = ('RET', 'rolling_mean', 'standardized_return', 'standardized_data')

    validation_results = []

    for i, (train_df, test_df) in enumerate(study_periods):
        frames = (train_df, test_df)
        issues = []

        # Check for missing values
        if any(frame.isnull().any().any() for frame in frames):
            issues.append("Missing values detected")

        # Inspect for outliers, only in the columns a frame actually has
        for column in outlier_columns:
            if any(
                column in frame.columns and frame[column].abs().max() > outlier_threshold
                for frame in frames
            ):
                issues.append(f"Potential outliers in {column}")

        # Verify data types
        if not all(pd.api.types.is_datetime64_any_dtype(frame['date']) for frame in frames):
            issues.append("Incorrect date data type")

        # Ensure chronological order
        if not all(frame['date'].is_monotonic_increasing for frame in frames):
            issues.append("Data not in chronological order")

        # Validate no overlap between training and test sets
        if train_df['date'].max() >= test_df['date'].min():
            issues.append("Overlap between training and test sets")

        validation_results.append({'Period': i + 1, 'Issues': issues})

    return validation_results

# Print a one-line summary per study period.
validation_results = validate_study_periods(study_periods)
for result in validation_results:
    summary = ', '.join(result['Issues']) if result['Issues'] else 'No issues'
    print(f"Period {result['Period']}: {summary}")


In [None]:
# # Creating separate DataFrames for in-sample and out-of-sample portfolios
# in_sample_long_portfolios = pd.DataFrame()
# out_of_sample_long_portfolios = pd.DataFrame()

# in_sample_short_portfolios = pd.DataFrame()
# out_of_sample_short_portfolios = pd.DataFrame()

# # Assuming study_periods contains the train-test split dataframes
# for train_df, test_df in tqdm(study_periods):
#     X_train = train_df[['standardized_return']]
#     y_train = train_df['target']

#     X_test = test_df[['standardized_return']]
#     y_test = test_df['target']
#     # class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
#     # class_weights_dict = {0: class_weights[0], 1: class_weights[1] * 5}  # Increase the weight for class 1 as needed

#     # clf = RandomForestClassifier(n_estimators=1000, max_depth=20, random_state=42, class_weight=class_weights_dict)

#     # Train the classifier
#     # clf.fit(X_train, y_train)
#     reg.fit(X_train, y_train)
#     # Predict on both train and test data
#     # train_probs = clf.predict_proba(X_train)[:, 1]
#     # test_probs = clf.predict_proba(X_test)[:, 1]
#     train_preds = reg.predict(X_train)
#     test_preds = reg.predict(X_test)


#     # Assign predicted probabilities back to train and test dataframes
#     # train_df['predicted_prob'] = train_probs
#     # test_df['predicted_prob'] = test_probs
#     train_df['predicted_prob'] = train_preds
#     test_df['predicted_prob'] = test_preds

#     k = 10

#     # Iterate over unique dates in train_df for in-sample data
#     for date in train_df['date'].unique():
#         date_data = train_df[train_df['date'] == date]
#         sorted_data = date_data.sort_values(by='predicted_prob', ascending=False)

#         long_tickers = sorted_data.head(k)[['date', 'TICKER', 'predicted_prob']]
#         short_tickers = sorted_data.tail(k)[['date', 'TICKER', 'predicted_prob']]

#         in_sample_long_portfolios = pd.concat([in_sample_long_portfolios, long_tickers])
#         in_sample_short_portfolios = pd.concat([in_sample_short_portfolios, short_tickers])

#     # Iterate over unique dates in test_df for out-of-sample data
#     for date in test_df['date'].unique():
#         date_data = test_df[test_df['date'] == date]
#         sorted_data = date_data.sort_values(by='predicted_prob', ascending=False)

#         long_tickers = sorted_data.head(k)[['date', 'TICKER', 'predicted_prob']]
#         short_tickers = sorted_data.tail(k)[['date', 'TICKER', 'predicted_prob']]

#         out_of_sample_long_portfolios = pd.concat([out_of_sample_long_portfolios, long_tickers])
#         out_of_sample_short_portfolios = pd.concat([out_of_sample_short_portfolios, short_tickers])


# Accuracy

# Portfolios

In [None]:
# Save the long/short portfolio frames as CSV files.
# NOTE: the four frames must already exist in the kernel; run the cell that builds them first.
from pathlib import Path

results_dir = Path('../data/rf_results')
# to_csv does not create missing directories, so make sure the target exists.
results_dir.mkdir(parents=True, exist_ok=True)

portfolios_to_save = {
    'in_sample_long_portfolios': in_sample_long_portfolios,
    'in_sample_short_portfolios': in_sample_short_portfolios,
    'out_of_sample_long_portfolios': out_of_sample_long_portfolios,
    'out_of_sample_short_portfolios': out_of_sample_short_portfolios,
}
for portfolio_name, portfolio_df in portfolios_to_save.items():
    portfolio_df.to_csv(results_dir / f'{portfolio_name}.csv')


# Raw Returns

In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Build in-sample and out-of-sample long/short portfolios from a random-forest
# regression, fitted separately on each study period.

# Feature column(s): the same column the classifier cell in this notebook trains on.
FEATURE_COLS = ['standardized_data']
# NOTE(review): the study-period frame displayed earlier in this notebook shows the columns
# date, TICKER, standardized_data and target, with no 'RET'. Confirm which column holds the
# raw return for the study periods in use and adjust RETURN_COL if needed.
RETURN_COL = 'RET'

# Number of positions per side. When a date has k or fewer rows, nlargest and
# nsmallest both return every row, so the long and short books hold the same tickers.
k = 10
PORTFOLIO_COLS = ['date', 'TICKER', 'predicted_return']


def _select_long_short(frame, k):
    """Return two lists of per-date frames: the k highest and k lowest predicted returns."""
    longs, shorts = [], []
    # sort=False keeps dates in order of first appearance, matching frame['date'].unique().
    for _, date_data in frame.groupby('date', sort=False):
        longs.append(date_data.nlargest(k, 'predicted_return')[PORTFOLIO_COLS])
        shorts.append(date_data.nsmallest(k, 'predicted_return')[PORTFOLIO_COLS])
    return longs, shorts


def _stack(parts):
    """Concatenate the collected frames once; an empty list gives an empty DataFrame."""
    return pd.concat(parts) if parts else pd.DataFrame()


# Pieces are collected in lists and concatenated once at the end; calling
# pd.concat inside the loops re-copies the accumulated frame on every date.
in_sample_long_parts, in_sample_short_parts = [], []
out_of_sample_long_parts, out_of_sample_short_parts = [], []

for train_df, test_df in tqdm(study_periods):
    X_train, y_train = train_df[FEATURE_COLS], train_df[RETURN_COL]
    X_test, y_test = test_df[FEATURE_COLS], test_df[RETURN_COL]

    regr = RandomForestRegressor(n_estimators=1000, max_depth=20, random_state=42)

    # Train the regressor
    regr.fit(X_train, y_train)

    # Predict on both train and test data
    train_preds = regr.predict(X_train)
    test_preds = regr.predict(X_test)

    # Predictions are written onto the study-period frames themselves.
    train_df['predicted_return'] = train_preds
    test_df['predicted_return'] = test_preds

    longs, shorts = _select_long_short(train_df, k)
    in_sample_long_parts.extend(longs)
    in_sample_short_parts.extend(shorts)

    longs, shorts = _select_long_short(test_df, k)
    out_of_sample_long_parts.extend(longs)
    out_of_sample_short_parts.extend(shorts)

in_sample_long_portfolios = _stack(in_sample_long_parts)
in_sample_short_portfolios = _stack(in_sample_short_parts)
out_of_sample_long_portfolios = _stack(out_of_sample_long_parts)
out_of_sample_short_portfolios = _stack(out_of_sample_short_parts)


In [None]:
# Optional parameter target_type: 'cross_sectional_median' (default), 'buckets' (10 buckets), 'raw_returns'.
# NOTE(review): this call passes n_periods and omits target_type/data_type, unlike the
# earlier create_study_periods call in this notebook. Confirm it matches the helper's
# current signature.
study_periods = create_study_periods(
    df,
    n_periods=23,
    window_size=240,
    trade_size=250,
    train_size=750,
    forward_roll=250,
    start_date=datetime(1990, 1, 1),
    end_date=datetime(2015, 12, 31),
)