In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [17]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier # Stochastic Gradient descent using a range of linear classifiers
from sklearn.svm import LinearSVC #Support Vector Classifier with linear kernal to scale better to larger sampel sizes
from sklearn.naive_bayes import GaussianNB #Gaussian Naive Bayes classifier
from sklearn.neighbors import KNeighborsClassifier #Classifier implementing the k-nearest neighbors vote

from sklearn.metrics import precision_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the price data CSV from the Kaggle input directory and preview it.
df = pd.read_csv("../input/snp500-max/GSPC latest snp.csv")
print(df.head(10))
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

In [18]:
# a) Cleaning: keep only rows with non-zero Volume and drop the 'Adj Close' column.
df = df[df['Volume'] != 0]
df = df.drop('Adj Close', axis=1)

# b) Calendar features: weekday and month names replace the raw Date column.
df['Date'] = pd.to_datetime(df['Date'])
df['Day'] = df['Date'].dt.day_name()
df['Month'] = df['Date'].dt.month_name()
df = df.drop('Date', axis=1)

# c) Technical features, each expressed relative to the current Close, for
#    look-back windows of 5, 10, ..., 200 rows:
#      MA<n>       rolling mean of Close
#      UpperBB<n>  Bollinger upper band (mean + 2 standard deviations of Close)
#      LowerBB<n>  Bollinger lower band (mean - 2 standard deviations of Close)
#      Range<n>    highest High minus lowest Low over the window
#      HH<n>       Donchian upper channel (highest High)
#      LL<n>       Donchian lower channel (lowest Low)
close, high, low = df['Close'], df['High'], df['Low']
features = {}
for n in range(5, 205, 5):
    # Each rolling statistic is computed once per window and reused below.
    rolling_mean = close.rolling(n).mean()
    band_width = 2 * close.rolling(n).std()
    highest_high = high.rolling(n).max()
    lowest_low = low.rolling(n).min()

    features["MA" + str(n)] = rolling_mean - close
    # The bands are built from the raw rolling mean and the standard deviation of
    # Close. Building them from the MA<n> column (which already has Close
    # subtracted) removed Close twice and measured the spread of the wrong series.
    features["UpperBB" + str(n)] = rolling_mean + band_width - close
    features["LowerBB" + str(n)] = rolling_mean - band_width - close
    # NOTE(review): Range is already a price difference, so subtracting Close
    # mixes in the absolute price level; kept as written - confirm intent.
    features["Range" + str(n)] = highest_high - lowest_low - close
    features["HH" + str(n)] = highest_high - close
    # Stored under its own LL<n> name; writing it to HH<n> overwrote the
    # highest-high feature and left the lowest-low feature missing.
    features["LL" + str(n)] = lowest_low - close

# Attach all new columns in one step instead of inserting them one by one,
# which fragments the frame and triggers pandas' PerformanceWarning.
df = pd.concat([df, pd.DataFrame(features, index=df.index)], axis=1)

In [19]:
# One-hot encode the nominal categorical columns. get_dummies with `columns=`
# swaps each listed column for indicator columns named "<column>_<value>",
# appended after the remaining columns.
categorical_columns = ['Day', 'Month']
df = pd.get_dummies(df, columns=categorical_columns)

In [20]:
# Drop rows containing NaN (the warm-up rows of the rolling features); the
# copy also yields a freshly laid-out (defragmented) frame.
df = df.dropna().copy()

# Target: whether the NEXT bar closes above its open (1 = positive/bull,
# 0 = negative/bear).
next_bar_move = (df['Close'] - df['Open']).shift(-1)
df['Y'] = (next_bar_move > 0).astype(int)

# The final row has no next bar: its move is NaN and `NaN > 0` evaluates to
# False, which would record a made-up bear label. Drop it instead.
df = df[next_bar_move.notna()]

X = df.drop('Y', axis=1)
y = df['Y']

In [21]:
# Hold out 25% of the rows as a test set (fixed seed for a repeatable split).
# NOTE(review): the split shuffles rows; the features are rolling-window
# statistics over consecutive rows, so neighbouring rows may end up on both
# sides of the split - confirm whether a chronological split is wanted.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Standardise the features (independent variables): the scaler learns its
# statistics from the training rows only and is then applied to both sets.
sc = StandardScaler().fit(xtrain)
xtrain, xtest = sc.transform(xtrain), sc.transform(xtest)

In [None]:
# Build and fit every candidate classifier on the scaled training data.
# The estimators that use randomness get a fixed random_state so a
# Restart & Run All reproduces the same fitted models, matching the seeded
# train/test split.
RF = RandomForestClassifier(random_state=0)
SGD = SGDClassifier(random_state=0)
SVM = LinearSVC(max_iter=100000, random_state=0)
NB = GaussianNB()
KNN = KNeighborsClassifier()

for clf in (RF, SGD, SVM, NB, KNN):
    clf.fit(xtrain, ytrain)

In [None]:
# Score every fitted model on the held-out test set and tabulate the metrics.
models = ['Random Forest', 'Stochastic Gradient Descent', 'Support Vector Machine', 'Naive Bayes', 'K-Nearest Neighbor']
# Same order as `models`, so each metric lines up with its model name.
classifiers = [RF, SGD, SVM, NB, KNN]

Acc = []
prec = []
for clf in classifiers:
    y_pred = clf.predict(xtest)
    # append() has to be called; assigning to `Acc.append` raises
    # AttributeError because list methods are read-only.
    Acc.append(accuracy_score(ytest, y_pred))
    prec.append(precision_score(ytest, y_pred))

# One row per model. A dict of columns is used because passing
# [models, Acc, prec] as data creates three rows of five values, which the
# three column names cannot label.
output = pd.DataFrame({'Model': models, 'Accuracy': Acc, 'Precision': prec})
output