In [1]:
import numpy as np
import pandas as pd
import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC, SVC
data=pd.read_csv("spy_data.csv",index_col='Date')

In [2]:
def model_variables(prices,lags):
    prices=prices.apply(pd.to_numeric)
    inputs=pd.DataFrame(index=prices.index)
    
    inputs["Close"]=prices["Close"]
    inputs["Volume"]=prices["Volume"]
    for i in range(0,lags):
        tsret=pd.DataFrame(index=inputs.index)
        inputs["Lag%s" % str(i+1)]=prices["Close"].shift(i+1)
    tsret["VolumeChange"]=inputs["Volume"].pct_change()
    tsret["returns"]=inputs["Close"].pct_change()*100.0
    for i,x in enumerate(tsret["returns"]):
        if (abs(x)<0.0001):
            tsret["returns"][i]=0.0001
            
    for i in range(0,lags):
        tsret["Lag%s" % str(i+1)]=\
          inputs["Lag%s" % str(i+1)].pct_change()*100.0
        
    tsret=tsret.dropna()
    tsret["Direction"]=np.sign(tsret["returns"])
    
    tsret.index=pd.to_datetime(tsret.index)
    return tsret


In [3]:
variables_data=model_variables(data,2)
dataset=variables_data[["Lag1","Lag2","VolumeChange","Direction"]]
dataset=dataset.dropna()

X=dataset[["Lag1","Lag2","VolumeChange"]]
y=dataset["Direction"]

In [6]:
y.value_counts()

 1.0    640
-1.0    543
Name: Direction, dtype: int64

In [7]:
date_split=datetime.datetime(2019,1,1)

X_train=X[X.index<=date_split]
X_test=X[X.index>date_split]
y_train=y[y.index<=date_split]
y_test=y[y.index>date_split]

In [8]:
X_test.shape

(180, 3)

In [9]:
print("Hit Rates/Confusion Matrices:\n")
models=[("LR",LogisticRegression()),("LDA",LDA()), ("LSVC",LinearSVC()),("RSVM",SVC(C=1000000.0,cache_size=200,class_weight=None,coef0=0.0,degree=3,gamma=0.0001,kernel='rbf',max_iter=-1,probability=False,random_state=None,shrinking=True,tol=0.001,verbose=False)),("RF",RandomForestClassifier(n_estimators=1000,criterion='gini',max_depth=None, min_samples_split=2,min_samples_leaf=30,max_features='auto',bootstrap=True,oob_score=False,n_jobs=1,random_state=None,verbose=0))]
for m in models:
    m[1].fit(X_train,y_train)
    pred=m[1].predict(X_test)
    
    print("%s:\n%0.3f" % (m[0],m[1].score(X_test,y_test)))
    print("%s\n" % confusion_matrix(pred,y_test))

Hit Rates/Confusion Matrices:

LR:
0.583
[[28 29]
 [46 77]]

LDA:
0.589




[[28 28]
 [46 78]]

LSVC:
0.583
[[28 29]
 [46 77]]

RSVM:
0.572
[[20 23]
 [54 83]]

RF:
0.600
[[33 31]
 [41 75]]

