In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from datetime import datetime
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.model_selection import train_test_split

from Strategy import *

In [2]:
# Load the raw table from a hard-coded local Dropbox location.
# NOTE(review): absolute machine-specific path; the notebook only runs where this drive exists.
csv_path = "Y:\\Dropbox\\Dropbox (MIT)\\Robinhood Trading\\Stock Data\\broader_stock.csv"
df = pd.read_csv(csv_path)

In [3]:
# Keep every row except the final one.
# NOTE(review): not idempotent - each re-run of this cell removes one more row.
df = df.head(-1)

In [4]:
# Fill gaps in every column after the first (presumably 'Date' - confirm against
# the CSV header) using pandas' default interpolate(), starting at each column's
# first valid observation so the leading NaNs stay untouched.
for tick in df.columns[1:]:
    # first_valid_index() returns an index *label* (or None), not a position.
    initpos = df[tick].first_valid_index()
    if initpos is None:
        # Column is entirely NaN: there is nothing to interpolate.
        continue
    # Assign through one .loc call on the frame itself. The previous chained form
    # (df[tick].iloc[...] = ...) may write to a temporary object and is a silent
    # no-op when pandas Copy-on-Write is active.
    df.loc[initpos:, tick] = df.loc[initpos:, tick].interpolate()

In [5]:
# Use the parsed 'Date' values as the index and remove the now-redundant column.
date_index = pd.to_datetime(df['Date'])
df = df.set_index(date_index).drop(columns=['Date'])

# Row-over-row percentage changes, pushed down one row by shift(1).
# pct_change and the shift each leave one empty leading row, hence iloc[2:].
pct_changes = df.pct_change()
pct_df = pct_changes.shift(1).iloc[2:]

In [6]:
# Ordered split (shuffle=False): rows are not permuted, and 1% of them are
# held out as the test set.
train_dta, test_dta = train_test_split(
    pct_df,
    test_size=0.01,
    shuffle=False,
)

### Partial out $R_m$ from individual return series
Using only the most recent 1000 complete observations for each ticker (the `.iloc[-1000:]` window in the loop below).

In [26]:
# Accumulator: the regression loop in the following cell appends one residual
# array per ticker to this list.
residual = []

In [27]:
# Regress each selected ticker column on SPY_Open (from train_dta) and the
# ^TNX_Open series (taken from df, on the same dates), and store the residual
# of each fit in `residual`.

# Start from an empty list so that re-running this cell does not append a
# second copy of every residual series.
residual = []

# The regressors are identical for every ticker, so build them once.
rm = train_dta['SPY_Open']
rint = df['^TNX_Open'].loc[train_dta.index]

# Every third column, first 50 of those.
for tick in train_dta.columns[::3][:50]:
    ri = train_dta[tick]
    # Keep only rows where all three series are present, then the last 1000 of them.
    # NOTE(review): a ticker with fewer than 1000 complete rows yields a shorter
    # residual vector, which np.column_stack cannot combine with the others.
    temp = pd.concat([ri, rm, rint], axis=1).dropna().iloc[-1000:]

    # Select by position rather than by name: if `tick` is itself one of the
    # regressors the frame has duplicate column names, and a name lookup would
    # return several columns (a 2-D y).
    y = temp.iloc[:, 0].values
    X = temp.iloc[:, 1:].values

    reg = ElasticNetCV(cv=10, n_jobs=-1).fit(X, y)
    yhat = reg.predict(X)
    eps = y - yhat
    residual.append(eps)

In [28]:
# Place the residual arrays side by side as columns of one 2-D array
# (one column per entry of `residual`); all entries must have equal length.
RES = np.column_stack(residual)

### Use different dimension reduction techniques to combine the features

In [14]:
from sklearn.decomposition import SparsePCA, PCA, FactorAnalysis

In [29]:
# Results of the reduction loop are stored here, keyed by the estimator object.
decom_dict = {}

# Dimension-reduction estimators to compare on the residual matrix.
decom_method = [
    SparsePCA(n_jobs=-1),
    PCA(),
    FactorAnalysis(),
]

In [30]:
# For every reduction method, keep the first `comp` transformed columns, where
# `comp` is the smallest count whose cumulative variance share exceeds 0.9.
# NOTE(review): columns are taken in the order the estimator returns them; this
# is only "largest first" if the estimator orders its components by variance.
for method in decom_method:
    decom = method.fit_transform(RES)

    # Sample variance of each transformed column, and its share of the total.
    col_var = np.var(decom, axis=0, ddof=1)
    exp_var = col_var / sum(col_var)

    # Accumulate shares column by column until the running total passes 0.9.
    ttl_var, comp = float(), 0
    while ttl_var <= 0.9:
        ttl_var, comp = ttl_var + exp_var[comp], comp + 1

    decom_dict[method] = decom[:, :comp]

### Ensemble method for Prediction

In [86]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, uniform, randint

In [96]:
# Sampling distributions handed to RandomizedSearchCV, one per hyper-parameter.
# scipy's randint draws integers from [low, high), so `high` itself is never drawn.
param_grid = {
    'n_estimators': randint(low=5, high=30),
    'max_depth': randint(low=5, high=20),
    'min_samples_split': randint(low=10, high=50),
    # Use the `expon` name imported from scipy.stats: the bare module name
    # `scipy` is never imported in this notebook, so `scipy.stats.expon`
    # raises NameError on a fresh kernel.
    'ccp_alpha': expon(scale=.1),
}

# Base classifiers that the randomized search tunes, one search per entry.
# NOTE(review): no random_state is set, so results vary between runs.
estimator = [
    GradientBoostingClassifier(),
    RandomForestClassifier(n_jobs=-1),
]

In [99]:
# Column of train_dta whose next-row direction is the classification target
# in the search cell.
tick = 'TQQQ_Open'
# The search cell appends each search's best estimator to this list.
# NOTE(review): the execution counts (this cell In [99], the search cell In [98])
# suggest this cell was last run after the search, which would have emptied the
# list - confirm the intended order.
candidate_models = []

In [98]:
# Binary target: 1 where the next row's value of `tick` is positive, else 0.
# shift(-1) moves the next row's value onto the current row; missing rows
# (including the trailing one created by the shift) are dropped before thresholding.
Y = train_dta[tick].shift(-1).dropna().values
Y = (Y > 0).astype(int)

# Run one randomized hyper-parameter search per (reduction method, classifier)
# pair and keep each search's best estimator in `candidate_models`.
for method in list(decom_dict):
    # Pair every factor row except the last with a label from the tail of Y.
    # NOTE(review): assumes the factor rows and Y end on consecutive final dates
    # of train_dta - confirm, since NaNs are dropped separately for each.
    lagged_factors = decom_dict[method][:-1]

    # Use ONE shared length for both slices. The previous pair of slices
    # (-hist_length-1:-1 and -hist_length+1:) only matched in size because
    # numpy clips an out-of-range start; they differed by one row whenever Y
    # was shorter than the factor matrix.
    hist_length = min(Y.shape[0], lagged_factors.shape[0])
    if hist_length == 0:
        # arr[-0:] would select everything rather than nothing.
        continue
    feature_X = lagged_factors[-hist_length:]
    feature_Y = Y[-hist_length:]

    for est in estimator:
        clf = RandomizedSearchCV(est, param_grid)
        search = clf.fit(feature_X, feature_Y)
        print("method:", method, "estimator:", est)
        print(search.best_score_, search.best_estimator_)
        print("\n")
        candidate_models.append(search.best_estimator_)

method: SparsePCA(n_jobs=-1) estimator: GradientBoostingClassifier()
0.5835829145728643 GradientBoostingClassifier(ccp_alpha=0.11161454530671927, max_depth=18,
                           min_samples_split=22, n_estimators=29)


method: SparsePCA(n_jobs=-1) estimator: RandomForestClassifier(n_jobs=-1)
0.5835829145728643 RandomForestClassifier(ccp_alpha=0.16729005578804138, max_depth=6,
                       min_samples_split=19, n_estimators=15, n_jobs=-1)


method: PCA() estimator: GradientBoostingClassifier()
0.5835829145728643 GradientBoostingClassifier(ccp_alpha=0.041783726615162935, max_depth=7,
                           min_samples_split=35, n_estimators=16)


method: PCA() estimator: RandomForestClassifier(n_jobs=-1)
0.5835829145728643 RandomForestClassifier(ccp_alpha=0.2186444370398454, max_depth=7,
                       min_samples_split=20, n_estimators=13, n_jobs=-1)


method: FactorAnalysis() estimator: GradientBoostingClassifier()
0.5835829145728643 GradientBoostingClass

In [61]:
# Inspection only: shape of whatever `feature_X` is currently in the kernel namespace.
feature_X.shape

(999, 10)

In [65]:
# Inspection only: shape of the current `feature_Y`; its length should match feature_X's row count.
feature_Y.shape

(999,)

In [51]:
# Inspection only: number of labels currently held in `Y`.
Y.shape[0]

2748

In [91]:
# Scratch check: builds a frozen scipy.stats.randint distribution and shows its repr;
# the object is not stored anywhere.
randint(low=5,high=50)

<scipy.stats._distn_infrastructure.rv_frozen at 0x18361f6bc08>

In [None]:
# NOTE(review): unfinished draft - this cell has no execution count and the
# inner loop computes feature_X without using it.
# Iterates over every third column of train_dta (first 50 of those).
for tick in train_dta.columns[::3][:50]:
    # Next-row value of the column; unlike the single-ticker search cell there is
    # no dropna(), so NaN entries compare False below and are labelled 0.
    Y = train_dta[tick].shift(-1).values
    Y = (Y > 0).astype(int)
    for method in list(decom_dict):
        hist_length = min((Y.shape[0], decom_dict[method].shape[0]))
        # All but the last of the final `hist_length` factor rows.
        feature_X = decom_dict[method][-hist_length:-1, :]
        

In [49]:
# Inspection only: rich display of the training frame (pandas truncates the view).
train_dta

Unnamed: 0_level_0,TQQQ_Open,TQQQ_Close,TQQQ_Volume,AME_Open,AME_Close,AME_Volume,NOW_Open,NOW_Close,NOW_Volume,XYL_Open,...,BABA_Volume,NIO_Open,NIO_Close,NIO_Volume,UVXY_Open,UVXY_Close,UVXY_Volume,^TNX_Open,^TNX_Close,^TNX_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-05,,,,0.000000,-0.004167,-0.558548,,,,,...,,,,,,,,0.004925,-0.009621,
2000-01-06,,,,-0.008333,0.012552,-0.445623,,,,,...,,,,,,,,-0.001378,0.017579,
2000-01-07,,,,0.016807,-0.008264,-0.076555,,,,,...,,,,,,,,0.005674,-0.007577,
2000-01-10,,,,-0.016529,0.012500,0.409326,,,,,...,,,,,,,,-0.001982,-0.006871,
2000-01-11,,,,0.029412,0.032922,-0.511029,,,,,...,,,,,,,,-0.000764,0.008303,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31,0.000439,-0.000222,-0.134843,-0.010488,0.009792,0.114227,0.001090,0.002564,-0.373610,-0.012092,...,-0.357215,0.064073,0.048548,-0.033128,0.068702,-0.051418,-0.264140,-0.007368,-0.009626,
2021-01-01,-0.011840,0.007872,0.018136,0.008580,0.009279,0.461556,-0.006917,0.005572,0.146182,0.008227,...,-0.477668,0.042151,0.007441,0.106545,-0.049107,-0.004673,0.090282,-0.016967,-0.009719,
2021-01-04,0.010539,-0.021837,0.425291,0.004170,-0.010560,0.403789,0.009077,-0.021438,0.920665,0.006220,...,0.031850,0.028271,0.048728,0.325210,-0.000469,0.064789,0.711341,0.004315,0.000000,
2021-01-05,0.010429,-0.022325,0.298389,0.004153,-0.010673,0.287642,0.008995,-0.021907,0.479347,0.006181,...,0.030866,0.027493,0.046464,0.245402,-0.000470,0.060847,0.415663,0.004296,0.000000,


In [243]:
# Plain least-squares fit for comparison with the ElasticNetCV model `reg`.
# NOTE(review): X and y are not defined in this cell; presumably they are the
# arrays left behind by the last iteration of the residual loop - confirm.
reglm = LinearRegression().fit(X,y)

In [244]:
# Assemble the hold-out frame in the same column order as the training frame:
# target first, then the two regressors.
# NOTE(review): the target is fixed to 'V_Open'; presumably that is the ticker
# the leaked training arrays X, y belong to - confirm.
ri = test_dta['V_Open']
rm = test_dta['SPY_Open']
rint = df['^TNX_Open'].loc[test_dta.index]

# Keep only rows where all three series are present.
temp_test = pd.concat([ri, rm, rint], axis=1)
temp_test = temp_test.dropna()

In [245]:
# First column is the target, the remaining columns are the regressors.
target_col, *regressor_cols = temp_test.columns
y_test = temp_test[target_col].values
X_test = temp_test[regressor_cols].values

In [246]:
# In-sample score of `reg` on the arrays it was (presumably) fitted on;
# for scikit-learn regressors .score returns R^2.
reg.score(X,y)

0.752122173966991

In [247]:
# In-sample R^2 of the plain linear regression on the same X, y.
reglm.score(X, y)

0.7521238460816104

In [248]:
# Hold-out R^2 of `reg` on the test arrays built from temp_test.
reg.score(X_test, y_test)

0.2583277874678218

In [249]:
# Hold-out R^2 of the plain linear regression on the same test arrays.
reglm.score(X_test, y_test)

0.2582859063212264