In [1]:
import fix_yahoo_finance as yf

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from pandas_datareader import data as pdr

import fix_yahoo_finance as yf
yf.pdr_override() # <== that's all it takes :-)


# download dataframe
data = pdr.get_data_yahoo("^GSPC", start="2011-01-01", end="2018-01-01")


[*********************100%***********************]  1 of 1 downloaded


In [4]:
data=data.drop(['Adj Close','Volume'], axis=1)


In [5]:
# calculate momentum for each day
# 5-day momentum

def momentum(df):
    n = len(df)
    arr = []
    for i in range(0,5):
        arr.append('N')
    for j in range(5,n):
        momentum = df.Close[j] - df.Close[j-5] #Equation for momentum
        arr.append(momentum)
    return arr

momentum = momentum(data)

# add momentum to data
data['Momentum'] = momentum

In [7]:
#Use pct_change() function to add the one day returns to the dataframe 

data_pctchange=data.Close.pct_change()
data['Return'] = data_pctchange

In [9]:
#ROI function

def ROI(df,n):
    m = len(df)
    arr = []
    for i in range(0,n):
        arr.append('N')
    for j in range(n,m):
        roi= (df.Close[j] - df.Close[j-n])/df.Close[j-n] #Equation for ROI
        arr.append(roi)
    return arr

#Run the ROI function for 10, 20, and 30 day periods

ROI10=ROI(data,10)
ROI20=ROI(data,20)
ROI30=ROI(data,30)


#Add all 3 ROI results to dataframe 

data['10 Day ROI']=ROI10
data['20 Day ROI']=ROI20
data['30 Day ROI']=ROI30

In [11]:
# calculate RSI for each day


def RSI(df,period):
    # get average of upwards of last 14 days: Ct - Ct-1
    # get average of downwards of last 14 days: Ct-1 - Ct
    n = len(df)
    arr = []
    for i in range(0,period):
        arr.append('N')
    for j in range(period,n):
        total_upwards = 0
        total_downwards = 0
        # this will find average of upwards
        for k in range(j,j-period,-1):
            if(df.Close[k-1] > df.Close[k]):
                total_downwards = total_downwards + (df.Close[k-1] - df.Close[k])    
        avg_down = total_downwards / period
        for l in range(j,j-period,-1):
            if(df.Close[l] > df.Close[l-1]):
                total_upwards = total_upwards + (df.Close[l] - df.Close[l-1])
        avg_up = total_upwards / period
        RS = avg_up / avg_down
        RSI  = 100 - (100/(1+RS))
        arr.append(RSI)
    return arr


#Run RSI for 10, 14, and 30 day periods

RSI_14 = RSI(data,14)
RSI_10 = RSI(data,10)
RSI_30 = RSI(data,30)

# add RSI to data

data['10_day_RSI'] = RSI_10
data['14_day_RSI'] = RSI_14
data['30_day_RSI'] = RSI_30

In [12]:
# calculate EMA for each day
# formula: EMA = (2/(n+1))*ClosePrice + (1-(2/(n+1)))*previousEMA

def EMA(df, n):
    m = len(df)
    arr = []
    arr.append('N')
    prevEMA = df.Close[0]
    for i in range(1,m):
        close = df.Close[i]
        EMA = ((2/(n+1))*close) + ((1-(2/(n+1)))*prevEMA)
        arr.append(EMA)
        prevEMA = EMA
    return arr

#Calculate EMA with n=12 and n=26

EMA_12 = EMA(data, 12)
EMA_26 = EMA(data, 26)

#add EMA to dataframe 

data['EMA_12'] = EMA_12
data['EMA_26'] = EMA_26

In [14]:
#Function to Classify each day as a 1 or a 0

def clas(df):
    n = len(df)
    arr = []
    for i in range(0,len(df)-1):
        if (100*((df.Close[i+1]-df.Open[i+1])/df.Open[i+1]))>=.3:
            arr.append(1)
        else:
            arr.append(0)
    arr.append('N')
    return arr

clas=clas(data)

#Add Class to our dataframe
data['Class'] = clas

In [15]:
#MACD
# Moving Average of EMA(n) - EMA(m2) for each row
# where n = 12 and m2 = 26
def MACD(df):
    n = 12
    m2 = 26
    arr = []
    arr.append('N')
    ema_12 = EMA(df,n)
    ema_26 = EMA(df,m2)
    m = len(df)
    for i in range(1,m):
        arr.append(ema_12[i] - ema_26[i])
    return arr

MACD = MACD(data)

#Add MACD to our dataframe 
data['MACD_12_26'] = MACD

In [16]:
#SRSI: Stochastic RSI
#SRSI = (RSI_today - min(RSI_past_n)) / (max(RSI_past_n) - min(RSI_past_n))
def SRSI(df,n):
    m = len(df)
    arr = []
    list_RSI = RSI(df,n)
    for i in range(0,n):
        arr.append('N')
    for j in range(n,n+n):
        last_n = list_RSI[n:j]
        if(not(last_n == []) and not(max(last_n) == min(last_n))):
            SRSI = (list_RSI[j] - min(last_n)) / (max(last_n)- min(last_n))
            if SRSI > 1:
                arr.append(1)
            else:
                arr.append(SRSI)
        else:
            arr.append(0)
    for j in range(n+n,m):
        last_n = list_RSI[2*n:j]
        if(not(last_n == []) and not(max(last_n) == min(last_n))):
            SRSI = (list_RSI[j] - min(last_n)) / (max(last_n)- min(last_n))
            if SRSI > 1:
                arr.append(1)
            else:
                arr.append(SRSI)
        else:
            arr.append(0)
    return arr

#Run SRSI for 10, 14, and 30 day periods
SRSI_10 = SRSI(data,10)
SRSI_14 = SRSI(data,14)
SRSI_30 = SRSI(data,30)

#Add SRSI to our dataframe
data['SRSI_10'] = SRSI_10
data['SRSI_14'] = SRSI_14
data['SRSI_30'] = SRSI_30

In [17]:
# calculate Williams %R oscillator for each day

def Williams(df,n):
    m = len(df)
    arr = []
    for i in range(0,n-1):
        arr.append('N')
    for j in range(n-1,m):
        maximum = max(data.High[(j-n+1):j+1])
        minimum = min(data.Low[(j-n+1):j+1])
        val = (-100)*(maximum-df.Close[j])/(maximum-minimum)
        arr.append(val)
    return arr


williams = Williams(data,14)

#Add Williams%R to our dataframe
data['Williams'] = williams

In [18]:
# True Range
# TR = MAX(high[today] - close[yesterday]) - MIN(low[today] - close[yesterday])
def TR(df,n):
    high = df.High[n]
    low = df.Low[n]
    close = df.Close[n-1]
    l_max = list()
    l_max.append(high)
    l_max.append(close)
    l_min = list()
    l_min.append(low)
    l_min.append(close)
    return (max(l_max) - min(l_min))

# Average True Range
# Same as EMA except use TR in lieu of close (prevEMA = TR(dataframe,14days))
def ATR(df,n):
    m = len(df)
    arr = []
    prevEMA = TR(df,n+1)
    for i in range(0,n):
        arr.append('N')
    for j in range(n,m):
        TR_ = TR(df,j)
        EMA = ((2/(n+1))*TR_) + ((1-(2/(n+1)))*prevEMA)
        arr.append(EMA)
        prevEMA = EMA
    return arr

ATR = ATR(data,14)  

#Add ATR to our dataframe
data['ATR_14'] = ATR

In [19]:
# calculate Commodity Channel Index (CCI) for each day

import numpy as np
def CCI(df,n):
    m = len(df)
    arr = []
    tparr = []
    for i in range(0,n-1):
        arr.append('N')
        tp = (df.High[i]+df.Low[i]+df.Close[i])/3
        tparr.append(tp)
    for j in range(n-1,m):
        tp = (df.High[j]+df.Low[j]+df.Close[j])/3
        tparr.append(tp) 
        tps = np.array(tparr[(j-n+1):(j+1)])
        val = (tp-tps.mean())/(0.015*tps.std())
        arr.append(val)
    return arr

cci = CCI(data,20) 

#Add CCI to our dataframe
data['CCI'] = cci

In [48]:
#double check that the dataframe has all 22 features
data.shape

(1761, 22)

In [22]:
#def normalization function to clean data
def normalize(df):
    for column in df:
        df[column]=((df[column]-df[column].mean())/df[column].std())

In [23]:
#def positive values for running Multinomial Naive Bayes
def positivevalues(df):
    for column in df:
        if (df[column].min())<0:
            df[column]=(df[column]-df[column].min())

In [24]:
#Remove the first 30 index which could have a value 'N'
newdata=data.drop(data.index[0:30])

#Remove the last row of data because class has value 'N'
newdata=newdata.drop(newdata.index[-1])

#Remove 'High' and 'Low' columns to improve the algorithm
newdata=newdata.drop(['High','Low'], axis=1)

#Remove our 'Class' column because it acts as y in our algorithms 
newdata=newdata.drop(['Class'], axis=1)

#check the features that remain in our algorithm 
newdata.head()

Unnamed: 0_level_0,Open,Close,Momentum,Return,10 Day ROI,20 Day ROI,30 Day ROI,10_day_RSI,14_day_RSI,30_day_RSI,EMA_12,EMA_26,MACD_12_26,SRSI_10,SRSI_14,SRSI_30,Williams,ATR_14,CCI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2011-02-15,1330.430054,1328.01001,3.44006,-0.003235,0.0156165,0.0254745,0.0441397,73.4499,65.5501,65.9001,1317.11,1304.25,12.865,0.508845,-4.41442,0.0,-8.55505,10.5278,97.6909
2011-02-16,1329.51001,1336.319946,15.4399,0.006257,0.0247616,0.0424363,0.0520548,83.4336,67.3001,68.0478,1320.07,1306.62,13.4445,0.819604,0.379723,0.0,-2.06373,10.4041,110.107
2011-02-17,1334.369995,1340.430054,18.5601,0.003076,0.0254993,0.0469983,0.0500329,83.7829,86.7448,67.6505,1323.2,1309.13,14.0731,0.830477,1.0,0.814993,-1.64607,10.4169,110.195
2011-02-18,1340.380005,1343.01001,13.86,0.001925,0.0245181,0.0464877,0.0542921,83.382,85.552,69.1261,1326.25,1311.64,14.611,0.817998,0.94372,1.0,-1.92961,9.82131,111.64
2011-02-22,1338.910034,1315.439941,-16.88,-0.020529,-0.0027369,0.0190573,0.0345576,47.327,54.5581,60.664,1324.58,1311.92,12.6667,-0.304278,-0.518618,-1.62306,-58.1438,12.6025,34.2949


In [35]:
#Normalize the data that we have filtered
normalize(newdata)



In [37]:
#Put the dataframe with our relevant features into X and our class into our y
X=newdata
y=clas[30:-1]


Unnamed: 0_level_0,Open,Close,Momentum,Return,10 Day ROI,20 Day ROI,30 Day ROI,10_day_RSI,14_day_RSI,30_day_RSI,EMA_12,EMA_26,MACD_12_26,SRSI_10,SRSI_14,SRSI_30,Williams,ATR_14,CCI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-12-21,2.125781,2.127841,0.909635,0.171395,0.558415,0.759923,0.564631,1.01168,0.770048,1.15718,2.10277,2.068,1.44315,0.928754,0.595416,0.885422,0.694538,-0.29385,0.810284
2017-12-22,2.12881,2.124737,0.114527,-0.100672,0.311088,0.680604,0.6544,0.71149,0.829642,1.38103,2.10982,2.07601,1.41581,0.652097,0.640877,1.04987,0.637315,-0.469877,0.6176
2017-12-26,2.11586,2.117569,-0.431532,-0.167497,0.135068,0.659338,0.65003,0.385107,1.12185,1.36695,2.11468,2.08289,1.35615,0.351301,0.863785,1.03953,0.505185,-0.659017,0.514219
2017-12-27,2.123459,2.12292,-0.0881379,0.03839,0.103834,0.376702,0.644802,0.331589,1.16723,1.36198,2.11962,2.08965,1.30216,0.301979,0.898405,1.03587,0.588876,-0.796101,0.515302
2017-12-28,2.133556,2.135336,0.138669,0.154512,0.199019,0.445532,0.757562,0.515906,1.12479,1.58274,2.12572,2.09685,1.27037,0.471846,0.866032,1.19805,0.701225,-0.948511,0.569311


In [38]:
#Split up our test and train by splitting 70%/30%

X_train=X.drop(X.index[1211:])
X_test=X.drop(X.index[0:1211])
y_train=y[0:1211]
y_test=y[1211:]

Unnamed: 0_level_0,Open,Close,Momentum,Return,10 Day ROI,20 Day ROI,30 Day ROI,10_day_RSI,14_day_RSI,30_day_RSI,EMA_12,EMA_26,MACD_12_26,SRSI_10,SRSI_14,SRSI_30,Williams,ATR_14,CCI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-12-21,2.125781,2.127841,0.909635,0.171395,0.558415,0.759923,0.564631,1.01168,0.770048,1.15718,2.10277,2.068,1.44315,0.928754,0.595416,0.885422,0.694538,-0.29385,0.810284
2017-12-22,2.12881,2.124737,0.114527,-0.100672,0.311088,0.680604,0.6544,0.71149,0.829642,1.38103,2.10982,2.07601,1.41581,0.652097,0.640877,1.04987,0.637315,-0.469877,0.6176
2017-12-26,2.11586,2.117569,-0.431532,-0.167497,0.135068,0.659338,0.65003,0.385107,1.12185,1.36695,2.11468,2.08289,1.35615,0.351301,0.863785,1.03953,0.505185,-0.659017,0.514219
2017-12-27,2.123459,2.12292,-0.0881379,0.03839,0.103834,0.376702,0.644802,0.331589,1.16723,1.36198,2.11962,2.08965,1.30216,0.301979,0.898405,1.03587,0.588876,-0.796101,0.515302
2017-12-28,2.133556,2.135336,0.138669,0.154512,0.199019,0.445532,0.757562,0.515906,1.12479,1.58274,2.12572,2.09685,1.27037,0.471846,0.866032,1.19805,0.701225,-0.948511,0.569311


In [39]:
#Import and run Logistic Regression and run a fit to train the model
from sklearn.linear_model import LogisticRegression

LR=LogisticRegression()
LR.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [40]:
#Predict the y test 
y_pred_LR=LR.predict(X_test)

In [41]:
#Print the accuracy score of our predicted y using metrics from sklearn
from sklearn import metrics
print (metrics.accuracy_score(y_test, y_pred_LR)) 

0.7533718689788054


In [43]:
#Import and run Gaussian Naive Bayes and run a fit to train the model
from sklearn.naive_bayes import GaussianNB

GNB = GaussianNB()
GNB.fit(X_train,y_train)

GaussianNB(priors=None)

In [44]:
#Predict the y test
y_pred=GNB.predict(X_test)

In [45]:
#Print the accuracy score of our predicted y using metrics from sklearn
from sklearn import metrics
print (metrics.accuracy_score(y_test, y_pred)) 

0.7263969171483622
