In [3]:
import yfinance as yf
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import coint
import statsmodels.api as sm
import pandas as pd
import numpy as np
import xgboost as xgb

In [4]:
START = datetime(2000, 1, 1)
END = datetime(2021, 1, 8)
tqqq = yf.Ticker("TQQQ")
hist_tqqq = tqqq.history(start=START, end=END)

In [5]:
START = datetime(2000, 1, 1)
END = datetime(2021, 1, 8)
uvxy = yf.Ticker("UVXY")
hist_uvxy = uvxy.history(start=START, end=END)

In [6]:
dta = pd.read_csv('broader_stock.csv')

In [7]:
def data_preprocess(dta):
    dta['Date'] = pd.to_datetime(dta['Date'], format='%Y-%m-%d')
    dta = dta.set_index(dta['Date'])
    # NHLI not traded
    dta.drop(['Date', 'NHLI'], axis=1, inplace=True)
    dta.dropna(how='all', inplace=True)
    for tick in dta.columns:
        tick_series = dta[tick]
        start_pos = tick_series.first_valid_index()
        valid_series = tick_series.loc[start_pos:]
        if valid_series.isna().sum() > 0:
            dta.drop(tick, axis=1, inplace=True)

    for tick in dta.columns:
        dta[tick] = dta[tick].mask(dta[tick] == 0).ffill(downcast='infer')

    return dta[dta.index >= dta['SPY'].first_valid_index()]

In [8]:
dta = data_preprocess(dta)

In [9]:
pct_tqqq = hist_tqqq.pct_change()
pct_uvxy = hist_uvxy.pct_change()
pct_dta = dta.pct_change()

### Specify subject of interest

In [375]:
temp_dta = pd.concat([pct_tqqq.Close, pct_dta], axis=1)
temp_dta = temp_dta[temp_dta['Close'].notnull()]

In [376]:
temp_dta['Close_LAG'] = temp_dta['Close'].shift(-5)
temp_dta.dropna(inplace=True)

In [124]:
cointegrat = {}
correlat = {}

In [125]:
for col in temp_dta.columns[:-1]:
    x = temp_dta[col]
    score, pval, _ = coint(x, temp_dta['Close_LAG'], autolag='t-stat')
    corr = abs(x.corr(temp_dta['Close_LAG']))
    cointegrat[col] = pval
    correlat[col] = corr

In [126]:
best_coint = sorted(cointegrat, key=cointegrat.get)[:50]
best_corr = sorted(correlat, key=correlat.get, reverse=True)[:50]

In [127]:
intersect = list(set(best_coint) & set(best_corr))
union_X = list(set(best_coint) | set(best_corr))

# Testing XGB Regressor

In [442]:
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score

In [443]:
X = temp_dta[union_X].values
y = temp_dta['Close_LAG'].values

In [444]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [445]:
params = {
    # Parameters that we are going to tune.
    'max_depth':3,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
    'eval_metric': 'rmse'
}

In [446]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-rmse:0.350954
Will train until Test-rmse hasn't improved in 10 rounds.
[1]	Test-rmse:0.247838
[2]	Test-rmse:0.176371
[3]	Test-rmse:0.128233
[4]	Test-rmse:0.09637
[5]	Test-rmse:0.075991
[6]	Test-rmse:0.064069
[7]	Test-rmse:0.05724
[8]	Test-rmse:0.053927
[9]	Test-rmse:0.051903
[10]	Test-rmse:0.051572
[11]	Test-rmse:0.050648
[12]	Test-rmse:0.050874
[13]	Test-rmse:0.050885
[14]	Test-rmse:0.050639
[15]	Test-rmse:0.050347
[16]	Test-rmse:0.050037
[17]	Test-rmse:0.049864
[18]	Test-rmse:0.04968
[19]	Test-rmse:0.049569
[20]	Test-rmse:0.049611
[21]	Test-rmse:0.049798
[22]	Test-rmse:0.04975
[23]	Test-rmse:0.049393
[24]	Test-rmse:0.049462
[25]	Test-rmse:0.049481
[26]	Test-rmse:0.049526
[27]	Test-rmse:0.049436
[28]	Test-rmse:0.049559
[29]	Test-rmse:0.049697
[30]	Test-rmse:0.049632
[31]	Test-rmse:0.049449
[32]	Test-rmse:0.04939
[33]	Test-rmse:0.049275
[34]	Test-rmse:0.049418
[35]	Test-rmse:0.049424
[36]	Test-rmse:0.049489
[37]	Test-rmse:0.049515
[38]	Test-rmse:0.050007
[39]	Test-rmse:0.04996

In [447]:
is_pred = model.predict(dtrain)
r2_score(y_train, is_pred)

0.9397318756405516

In [448]:
y_pred = model.predict(dtest)
r2_score(y_test, y_pred)

-0.03227657181892862

# Testing XGB Classifier

In [459]:
X = temp_dta[union_X].values
y = temp_dta['Close_LAG'].values
y = (y > 0).astype(int)

In [460]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [461]:
params = {
    # Parameters that we are going to tune.
    'max_depth':2,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'binary:logistic',
    'eval_metric': 'auc'
}

In [462]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-auc:0.470323
Will train until Test-auc hasn't improved in 10 rounds.
[1]	Test-auc:0.496703
[2]	Test-auc:0.509719
[3]	Test-auc:0.490802
[4]	Test-auc:0.511801
[5]	Test-auc:0.560569
[6]	Test-auc:0.552586
[7]	Test-auc:0.544776
[8]	Test-auc:0.593891
[9]	Test-auc:0.57428
[10]	Test-auc:0.586602
[11]	Test-auc:0.578619
[12]	Test-auc:0.570288
[13]	Test-auc:0.577751
[14]	Test-auc:0.579139
[15]	Test-auc:0.591982
[16]	Test-auc:0.607254
[17]	Test-auc:0.609684
[18]	Test-auc:0.597536
[19]	Test-auc:0.61194
[20]	Test-auc:0.590073
[21]	Test-auc:0.584519
[22]	Test-auc:0.583131
[23]	Test-auc:0.57723
[24]	Test-auc:0.572024
[25]	Test-auc:0.595627
[26]	Test-auc:0.596668
[27]	Test-auc:0.600833
[28]	Test-auc:0.605692
[29]	Test-auc:0.608122
Stopping. Best iteration:
[19]	Test-auc:0.61194



In [463]:
is_pred = model.predict(dtrain)
is_pred = (is_pred > 0.5).astype(int)
accuracy_score(y_train, is_pred)

0.9151376146788991

In [464]:
y_pred = model.predict(dtest)
y_pred = (y_pred > 0.5).astype(int)
accuracy_score(y_test, y_pred)

0.6

# Measure P&L

In [467]:
ttl_dtest = xgb.DMatrix(X, np.zeros(X.shape[0]))
ttl_pred = model.predict(ttl_dtest)
ttl_pred = (ttl_pred > 0.5).astype(int)

In [468]:
inventory = 0
asset = 0
record = [asset]

In [469]:
for i, dt in enumerate(temp_dta[union_X].index):
    price = hist_tqqq.loc[dt]['Close']
    trend_good = ttl_pred[i] == 1
    if trend_good and inventory == 0 and i != len(ttl_pred) - 1:
        # buy
        asset -= price
        inventory += 1
    elif not trend_good and inventory == 1:
        # sell
        asset += price
        inventory -= 1
    elif i == len(ttl_pred) - 1 and inventory == 1:
        # liquidate in the end
        print('lit', i)
        asset += price
        inventory -= 1
    else:
        asset = record[-1]
    record.append(asset)



In [470]:
sub_hist = hist_tqqq.loc[temp_dta[union_X].index]

In [471]:
ttl_ret = asset / sub_hist.iloc[0].Close
net_ret = (asset - sub_hist.iloc[-1].Close + sub_hist.iloc[0].Close) / sub_hist.iloc[0].Close
pct_record = np.array(record[1:]) / np.array(sub_hist.Close)
sharpe = net_ret / np.var(pct_record) + 1e-10

In [472]:
ttl_ret

2.2714607662364252

In [473]:
net_ret

-0.4242982995287864