In [1]:
import matplotlib.pyplot as plt

import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

from statsmodels.tsa.stattools import coint
import warnings
import json

from pandas.core.common import SettingWithCopyWarning

In [7]:
def data_preprocess(dta):
    """
    Clean a wide table that has one row per date and one column per ticker.

    Steps, in order:
      1. Parse the ``Date`` column (``%Y-%m-%d``) and use it as the index.
      2. Drop the ``NHLI`` column (not traded) and rows that are entirely NaN.
      3. Drop every column that is all NaN or that has a NaN anywhere after
         its first valid observation.
      4. Replace zeros with NaN and forward-fill them.
      5. Keep only the rows on or after the first valid ``SPY`` observation.

    The caller's frame is never modified; all work happens on a copy.

    :param dta: DataFrame with a ``Date`` column, an ``NHLI`` column and a
                ``SPY`` column alongside the other ticker columns
    :return: a new DataFrame indexed by the parsed dates
    :raises KeyError: if ``Date`` or ``NHLI`` is missing, or if ``SPY`` is
                      missing or was removed in step 3
    """
    # Work on a copy so the parsed dates are not written back to the caller.
    frame = dta.copy()
    frame['Date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')
    # NHLI not traded
    frame = frame.set_index('Date').drop(columns=['NHLI']).dropna(how='all')

    # A NaN that still has a value to forward-fill from sits after the
    # column's first valid observation, i.e. it is a gap in the history.
    missing = frame.isna()
    has_gap = (missing & frame.ffill().notna()).any()
    has_data = (~missing).any()
    frame = frame.loc[:, has_data & ~has_gap]

    # Zeros are treated as missing quotes and replaced by the previous value.
    # The deprecated ``downcast`` keyword of ``ffill`` is intentionally not
    # used, so dtypes are left as pandas produces them.
    frame = frame.mask(frame == 0).ffill()

    return frame[frame.index >= frame['SPY'].first_valid_index()]


def coint_group(tick, dta, top_n=50, fallback_n=10):
    """
    Rank the other columns of ``dta`` against the lagged target column.

    Every column other than ``tick`` and ``'<tick>_LAG'`` is compared with
    ``dta['<tick>_LAG']`` in two ways: the p-value returned by ``coint``
    (called with ``trend='ct'``) and the pandas correlation. The ``top_n``
    columns with the smallest p-values and the ``top_n`` columns with the
    largest correlations are intersected.

    :param tick: string for the target column; ``'<tick>_LAG'`` must exist
    :param dta: DataFrame holding the candidate columns, the target column
                and its ``_LAG`` column
    :param top_n: how many columns to keep from each ranking (default 50)
    :param fallback_n: how many of the best-p-value columns to return when
                       the two rankings do not overlap (default 10)
    :return: list of column names; the intersection ordered by ascending
             p-value, or the first ``fallback_n`` names of the p-value
             ranking when the intersection is empty
    """
    lag_col = '%s_LAG' % tick
    y = dta[lag_col]
    cointegrat = {}
    correlat = {}

    # Exclude the target and its lag by name rather than by position, so the
    # result does not depend on where those two columns sit in the frame.
    candidates = [col for col in dta.columns if col not in (tick, lag_col)]

    for col in candidates:
        x = dta[col]
        _, pval, _ = coint(x, y, trend='ct')
        corr = x.corr(y)

        # NaN values have no defined sort position, so keep them out of the
        # rankings instead of letting them scramble the order.
        if np.isfinite(pval):
            cointegrat[col] = pval
        if np.isfinite(corr):
            correlat[col] = corr

    best_coint = sorted(cointegrat, key=cointegrat.get)[:top_n]
    best_corr = set(sorted(correlat, key=correlat.get, reverse=True)[:top_n])

    # Walk the p-value ranking so the returned order is reproducible; a raw
    # set intersection would come back in arbitrary order.
    intersect = [col for col in best_coint if col in best_corr]
    if len(intersect) > 0:
        print("There are {} cointegrated stocks.".format(len(intersect)))
        return intersect

    print("Intersection is empty.")
    return best_coint[:fallback_n]

In [6]:
# The three regression result files, numbered 0 through 2.
file_name = ['Regression_Prediction_%s.csv' % i for i in range(3)]

# Read every file and stack the tables row-wise into a single frame.
dta_list = [pd.read_csv(path) for path in file_name]
ttl = pd.concat(dta_list, axis=0)

# Keep a copy of the combined table on disk.
ttl.to_csv("Prediction Result/temp2.csv")

In [8]:
# Stage 1: keep rows where profit figures, variance and predicted return are all positive.
filter1 = ttl[(ttl['NetProfit'] > 0) & (ttl['GrossProfit'] > 0) & (ttl['Var'] > 0) & (ttl['PredRet'] > 0)]
# Stage 2: keep rows where all three models have an MSE of at most 0.02.
filter2 = filter1[(filter1.L1_MSE <= 0.02) & (filter1.L2_MSE <= 0.02) & (filter1.OLS_MSE <= 0.02)]
# Stage 3: keep rows where net profit exceeds gross profit.
# .copy() makes filter3 an independent frame, so the column assignments below
# do not write into a slice of filter2 (which raises SettingWithCopyWarning).
filter3 = filter2[filter2.NetProfit > filter2.GrossProfit].copy()

# Combined score: the existing Sharpe column plus gross profit per unit of variance.
filter3['Sharpe_2'] = filter3.GrossProfit / filter3.Var
filter3['SP_ttl'] = filter3.Sharpe + filter3.Sharpe_2
# Identifiers (column 'Unnamed: 0') of the ten highest-scoring rows.
target_list = filter3.sort_values(['SP_ttl'], ascending=False).iloc[:10]['Unnamed: 0'].to_list()

In [10]:
# Show the surviving candidates, highest combined score first.
filter3.sort_values(by='SP_ttl', ascending=False)

Unnamed: 0.1,Unnamed: 0,PredRet,NetProfit,GrossProfit,Var,Sharpe,L1_MSE,L2_MSE,OLS_MSE,Sharpe_2,SP_ttl
472,TSN,0.53568,0.296465,0.146985,0.00623,47.58759,0.018279,0.018117,0.018282,23.593648,71.181238
119,AIF,0.117434,0.097487,0.095202,0.007884,12.365339,0.014561,0.014382,0.014565,12.075527,24.440867
369,SCHL,0.396917,0.547324,0.118376,0.057413,9.533171,0.012685,0.016635,0.016716,2.061847,11.595018
437,UVV,0.072678,0.348324,0.284973,0.058316,5.973017,0.015261,0.015235,0.015261,4.886676,10.859693
90,OMC,0.24525,0.670001,0.389064,0.099979,6.701401,0.015215,0.014495,0.014487,3.891446,10.592847
362,OGS,0.064473,0.555328,0.394581,0.096168,5.77456,0.017626,0.017582,0.017626,4.103039,9.877599
204,DISCK,0.222351,0.541093,0.428254,0.113757,4.756569,0.018521,0.018701,0.018703,3.764641,8.521211
83,BRKL,0.646902,0.287537,0.200196,0.061134,4.703386,0.016249,0.016247,0.016249,3.274712,7.978098
471,ASB,0.550609,0.389051,0.208503,0.077075,5.047703,0.017005,0.018051,0.018083,2.705206,7.752909
184,BCOR,0.458672,0.485714,0.452679,0.131089,3.705231,0.019349,0.019591,0.019592,3.453221,7.158452


In [5]:
# Read the raw table and clean it in one step.
data = data_preprocess(pd.read_csv('broader_stock.csv'))

# Filled in by the loop below: target ticker -> list of selected tickers.
result = dict()

In [6]:
# Number of rows the target is shifted forward to build the '<tick>_LAG' column.
LAG_ROWS = 120

for tick in target_list:
    # Check membership before indexing: looking the column up first would
    # raise KeyError and make any handling of missing tickers unreachable.
    if tick not in data.columns:
        print("Skipping {}: not present in the price data.".format(tick))
        continue

    # Put the target last so that, once the lag column is appended, the target
    # and its lag are the final two columns. Then keep only rows where the
    # target is observed and drop every column with a NaN in those rows.
    other_cols = [col for col in data.columns if col != tick]
    original_data = data[other_cols + [tick]]
    original_data = original_data[original_data[tick].notnull()].dropna(axis=1)

    # Skip targets with no usable rows, or whose last usable row is not the
    # last row of the full data set.
    if original_data.empty or original_data.index[-1] != data.index[-1]:
        continue

    # Append the target shifted LAG_ROWS rows forward; the final LAG_ROWS rows
    # have no lagged value and are removed by dropna().
    original_data = original_data.assign(
        **{'%s_LAG' % tick: original_data[tick].shift(-LAG_ROWS)}
    )
    model_data = original_data.dropna()

    coint_corr = coint_group(tick, model_data)
    result[tick] = coint_corr

There are 6 cointegrated stocks.
There are 7 cointegrated stocks.
There are 15 cointegrated stocks.
There are 7 cointegrated stocks.
There are 17 cointegrated stocks.
There are 9 cointegrated stocks.
There are 6 cointegrated stocks.
There are 13 cointegrated stocks.
There are 9 cointegrated stocks.
There are 11 cointegrated stocks.


In [8]:
# Serialise the ticker -> selected-tickers mapping and write it to disk.
json_file = json.dumps(result)
# The context manager closes the file even if the write raises.
with open("Prediction Result/dict.json", "w") as f:
    f.write(json_file)

# Actual Directional Prediction

In [22]:
# Scratch dictionary with integer keys, used to check key ordering below.
test = {
    1: 'a',
    2: 'b',
    -1: 'c',
}

In [24]:
# Position (not value) of the smallest key in insertion order.
np.argmin([*test])

2

In [21]:
# First key in insertion order.
list(test)[0]

1

In [4]:
# Reload the ticker -> selected-tickers mapping saved earlier.
with open("Prediction Result/dict.json") as json_file:
    result = json.loads(json_file.read())

In [8]:
# Read the raw table and clean it in one step.
data = data_preprocess(pd.read_csv('broader_stock.csv'))

# Regularisation strengths tried by the cross-validated models:
# 300 evenly spaced values from 0.001 to 1000.
alphas = np.linspace(start=0.001, stop=1000, num=300)

In [9]:
# Load the saved score table and index it by its 'Unnamed: 0.1' column.
ttl = pd.read_csv("Prediction Result/temp1.csv").set_index('Unnamed: 0.1')

In [10]:
# Filled in by the prediction loop: ticker -> predicted value.
trade_prediction = dict()

In [1]:
# Number of rows ahead that each model is trained to predict.
HORIZON = 120
# Column order defines the model codes: 0 = Lasso (L1), 1 = Ridge (L2), 2 = OLS.
MSE_COLUMNS = ['L1_MSE', 'L2_MSE', 'OLS_MSE']

for key in result:
    # Pick the model with the smallest recorded MSE. Converting to a plain
    # array first guarantees argmin returns a position, never a label.
    model_type = int(np.argmin(ttl.loc[key, MSE_COLUMNS].to_numpy(dtype=float)))

    # Target: the series shifted HORIZON rows forward, without the NaN rows.
    y = data[key].shift(-HORIZON).dropna().values
    n = y.shape[0]

    # Features: the n predictor rows that end HORIZON rows before the last row.
    predictors = data[result[key]]
    x = predictors.iloc[-n - HORIZON:-HORIZON].values
    # The most recent row is the input for the forecast.
    x_test = predictors.iloc[-1].values
    x_row = x_test.reshape(1, -1)

    if model_type == 0:
        model = LassoCV(alphas=alphas, max_iter=5000, fit_intercept=True, cv=10, n_jobs=-1).fit(x, y)
        pred = model.predict(x_row)
    elif model_type == 1:
        model = RidgeCV(alphas=alphas, fit_intercept=True, cv=10).fit(x, y)
        pred = model.predict(x_row)
    elif model_type == 2:
        # has_constant='add' forces the intercept column in both design
        # matrices. With the default ('skip'), a single test row counts as
        # already constant, so no intercept is added and the shape no longer
        # matches the fitted parameters. The row must also be 2-D: a 1-D
        # array is treated as one column of observations.
        model = sm.OLS(y, sm.add_constant(x, has_constant='add')).fit()
        pred = model.predict(sm.add_constant(x_row, has_constant='add'))
    else:
        raise ValueError("Unexpected model index {} for {}".format(model_type, key))

    trade_prediction[key] = pred[0]

NameError: name 'result' is not defined

In [14]:
# Display the predictions collected so far, keyed by ticker.
trade_prediction

{'BHE': 25.935497641926574,
 'MSTR': 178.064517178035,
 'FCF': 8.019808447394992,
 'KAMN': 52.28936159046932,
 'STBA': 26.434043804371317,
 'BMY': 67.38971115600006,
 'POST': 142.50049475081406,
 'VLY': 8.06837320105426,
 'SWX': 73.23632871815617,
 'AIV': 39.56509122636441}