In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

In [5]:
# Load every persisted model found in PATH, keyed by ticker symbol.
# The ticker is the filename without its extension (e.g. "models/BA.joblib" -> "BA").
PATH = "models"
ext = ".joblib"
models = {}
stocks = []
for model in os.listdir(PATH):
    file_path = os.path.join(PATH, model)

    # Skip sub-directories and anything that is not a serialized model file.
    if os.path.isfile(file_path) and os.path.splitext(model)[1] == ext:
        # Derive the ticker from the bare filename rather than splitting the
        # joined path on "/": that only worked with POSIX separators and a
        # single-segment PATH.
        stock = os.path.splitext(model)[0]
        stocks.append(stock)
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code -- only load model files from a trusted source.
        models[stock] = joblib.load(file_path)

In [6]:
# Sanity check: show the model object loaded for each ticker file.
list(models.values())

[<xgboost.core.Booster at 0x105d1c910>,
 <xgboost.core.Booster at 0x103f74f70>,
 <xgboost.core.Booster at 0x103f74b20>,
 <xgboost.core.Booster at 0x281d0af70>,
 <xgboost.core.Booster at 0x281d0afd0>,
 <xgboost.core.Booster at 0x281d0af10>,
 <xgboost.core.Booster at 0x281d0aeb0>,
 <xgboost.core.Booster at 0x281d0ae50>,
 <xgboost.core.Booster at 0x281d0adc0>,
 <xgboost.core.Booster at 0x103e40310>]

In [7]:
# Tickers discovered in the models directory, in the order they were loaded.
stocks

['TROW', 'DVN', 'TPR', 'NVR', 'CSCO', 'CE', 'ISRG', 'UAL', 'BA', 'MRO']

In [23]:
# Date window (ISO "YYYY-MM-DD" strings) used when slicing each stock's data.
start_date, end_date = '2000-01-01', '2021-12-31'

In [27]:
# Columns kept by generate_data: the target ("Adj Close") plus the model inputs.
features = ["Adj Close", "ATR", "RSI", "Volume", "Adj Close_lag1", "Adj Close_lag2"]


def generate_data(df, start_date, end_date):
    """Build the lagged feature frame for one stock.

    Adds one- and two-row lags of "Adj Close", keeps only the columns listed
    in the module-level ``features``, drops rows containing any NaN (which
    includes the first two rows, whose lags are undefined) and finally
    restricts the result to the label slice ``start_date:end_date``.

    The input frame is NOT modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Adj Close", "ATR", "RSI" and "Volume" columns. The
        index is sliced by label with the two date arguments, so it is
        expected to be date-like (the caller in this notebook passes a
        DatetimeIndex).
    start_date, end_date : str or datetime-like
        Bounds of the label slice applied to the index.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly the columns in ``features``.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.
    """
    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns and re-running the cell stays idempotent.
    lagged = df.assign(
        **{
            "Adj Close_lag1": df["Adj Close"].shift(1),
            "Adj Close_lag2": df["Adj Close"].shift(2),
        }
    )
    # NaNs are dropped before slicing so that lags at the start of the window
    # can use rows that precede start_date.
    return lagged[features].dropna().loc[start_date:end_date]

In [34]:
# Directory holding one "<TICKER>.csv" per stock. The original absolute path
# is kept as the default; set the STOCK_DATA_DIR environment variable to run
# the notebook on another machine.
FILEPATH = os.environ.get(
    "STOCK_DATA_DIR",
    "/Users/alvin/Documents/GitHub/AY2324S1-ML-Empowered-Stat-Arb/Data/individual_data/",
)
truth = {}    # ticker -> actual "Adj Close" Series (indexed by Date)
pred = {}     # ticker -> array of model predictions, row-aligned with truth[ticker]
indices = None  # Date index of the first processed ticker
for stock in stocks:
    stock_df = pd.read_csv(os.path.join(FILEPATH, f"{stock}.csv"))
    # Build the Date index without mutating the frame in place.
    stock_df = stock_df.assign(Date=pd.to_datetime(stock_df["Date"])).set_index("Date")
    test = generate_data(stock_df, start_date=start_date, end_date=end_date)
    X_test = test.drop(columns="Adj Close")
    y_test = test["Adj Close"]
    if indices is None:
        indices = y_test.index
    trained_model = models[stock]
    y_pred = trained_model.predict(xgb.DMatrix(X_test))
    truth[stock] = y_test
    pred[stock] = y_pred


In [35]:
# Date index taken from the first processed ticker's test rows.
indices

DatetimeIndex(['2009-01-06', '2009-01-07', '2009-01-08', '2009-01-09',
               '2009-01-12', '2009-01-13', '2009-01-14', '2009-01-15',
               '2009-01-16', '2009-01-20',
               ...
               '2019-12-17', '2019-12-18', '2019-12-19', '2019-12-20',
               '2019-12-23', '2019-12-24', '2019-12-26', '2019-12-27',
               '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', name='Date', length=2766, freq=None)

In [36]:
# Label each ticker's predictions with the dates of its own ground-truth rows
# (predictions are produced row-for-row from the same test frame). Building
# both frames on real dates aligns tickers by date instead of by position, so
# a ticker with different date coverage shows up as NaN rather than being
# silently shifted or relabelled with another ticker's index.
pred_df = pd.DataFrame(
    {key: pd.Series(np.asarray(value), index=truth[key].index) for key, value in pred.items()}
)
true_df = pd.DataFrame({key: pd.Series(value) for key, value in truth.items()})
# Trim by position: truth without its last row and predictions without their
# first row have the same length, pairing each day's actual price with the
# next row's prediction.
true_df_excluding_last_day = true_df.iloc[:-1]
pred_df_excluding_first_day = pred_df.iloc[1:]

In [39]:
def generate_portfolio_weights(truth, pred, normalization="net"):
    """Turn predicted prices into per-row portfolio weights.

    For every cell the predicted relative change ``(pred - truth) / truth`` is
    computed positionally (row i of ``truth`` against row i of ``pred``; the
    labels of ``truth`` are ignored). Each row is then scaled by a row total
    so the weights of a row are comparable across rows.

    Parameters
    ----------
    truth : pandas.DataFrame or array-like
        Reference prices, same shape as ``pred``.
    pred : pandas.DataFrame
        Predicted prices. Its index and columns label the result.
    normalization : {"net", "gross"}, default "net"
        "net" divides by the signed sum of the row's predicted changes (the
        original behaviour: weights sum to 1, but every sign flips when the
        net predicted change is negative). "gross" divides by the sum of
        absolute predicted changes, which preserves each sign and makes the
        absolute weights sum to 1.

    Returns
    -------
    pandas.DataFrame
        Weights with the index and columns of ``pred``. Rows whose row total
        is exactly zero get all-zero weights (no position) instead of
        inf/NaN. NaNs in the inputs still propagate.

    Raises
    ------
    ValueError
        If ``normalization`` is not "net" or "gross".
    """
    if normalization not in ("net", "gross"):
        raise ValueError(
            f"normalization must be 'net' or 'gross', got {normalization!r}"
        )

    truth_values = np.asarray(truth, dtype=float)
    pred_values = np.asarray(pred, dtype=float)
    expected_change = (pred_values - truth_values) / truth_values

    if normalization == "gross":
        row_totals = np.sum(np.abs(expected_change), axis=1, keepdims=True)
    else:
        row_totals = np.sum(expected_change, axis=1, keepdims=True)

    # A zero row total would divide by zero; substitute 1 for the division
    # and overwrite those rows with zero weights afterwards.
    degenerate = row_totals == 0
    safe_totals = np.where(degenerate, 1.0, row_totals)
    scaled = np.where(degenerate, 0.0, expected_change / safe_totals)

    return pd.DataFrame(data=scaled, columns=pred.columns, index=pred.index)
    

In [46]:
# Truth without its last row is paired positionally with predictions without
# their first row, so the two frames have the same number of rows.
portfolio_weights = generate_portfolio_weights(
    truth=true_df_excluding_last_day,
    pred=pred_df_excluding_first_day,
)


In [43]:
# Actual prices with the first row removed (all columns kept).
true_df_excluding_first_day = true_df.iloc[1:, :]

In [48]:
# Persist the weights and the matching actual prices to the working directory.
portfolio_weights.to_csv(path_or_buf="weights.csv")
true_df_excluding_first_day.to_csv(path_or_buf="truth.csv")