In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the price table, using the parsed 'Date' column as a datetime index.
df0 = pd.read_csv(
    'Datasets/sp500sub.csv',
    index_col='Date',
    parse_dates=True,
)

In [4]:
# Peek at the first five rows to check the columns and the Date index.
df0.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,9.22,9.51,9.17,9.5,9.5,1865400.0,INCY
2010-01-05,9.51,10.29,9.45,10.27,10.27,7608900.0,INCY
2010-01-06,10.38,11.09,10.35,11.0,11.0,8046700.0,INCY
2010-01-07,11.0,11.06,10.62,10.82,10.82,3680300.0,INCY
2010-01-08,10.82,11.0,10.75,10.94,10.94,1529300.0,INCY


In [5]:
# Keep only IBM's rows and only the 'Close' column; copy so that the
# columns added later do not touch (or warn about) the parent frame.
df = df0.loc[df0['Name'] == 'IBM', ['Close']].copy()

In [6]:
# Natural log of the closing price.
df['LogClose'] = df['Close'].pipe(np.log)

In [7]:
# One-step log return: log(close_t) - log(close_{t-1}).
# The first row has no predecessor, so it is NaN.
df['LogReturn'] = df['LogClose'] - df['LogClose'].shift(1)

In [8]:
# Chronological hold-out: the final Ntest rows form the test frame and
# everything before them is the training frame (no shuffling).
Ntest = 252  # presumably ~one year of trading days -- confirm
train = df.head(-Ntest)
test = df.tail(Ntest)

In [9]:
# Build a supervised classification dataset from the return series:
# each sample is T consecutive log returns, and its label is the direction
# of the return that immediately follows (1 = up, 0 = flat or down).

T = 21  # number of past returns used as features

# skip the first entry, which is NaN (no previous close to difference against)
series = df['LogReturn'].to_numpy()[1:]
target = (series > 0) * 1

# A window starting at t is labelled by index t + T, so the last usable
# start is len(series) - T - 1; the final T returns have no following label.
starts = range(len(series) - T)

X = np.array([series[t:t+T] for t in starts]).reshape(-1, T)
Y = np.array([target[t+T] for t in starts])
N = len(X)
print("X.shape", X.shape, "Y.shape", Y.shape)

X.shape (2241, 21) Y.shape (2241,)


In [10]:
# Chronological split of the windowed samples: the last Ntest windows are
# held out for testing, everything earlier is used for fitting.
train_rows, test_rows = slice(None, -Ntest), slice(-Ntest, None)
Xtrain, Ytrain = X[train_rows], Y[train_rows]
Xtest, Ytest = X[test_rows], Y[test_rows]

In [11]:
# Logistic regression with default settings; fit() returns the estimator,
# so construct and train in one step, then report training accuracy.
lr = LogisticRegression().fit(Xtrain, Ytrain)
lr.score(X=Xtrain, y=Ytrain)

0.5093011563599799

In [12]:
# Accuracy of the logistic regression on the held-out windows.
lr.score(X=Xtest, y=Ytest)

0.49603174603174605

In [13]:
# Support-vector classifier with default settings; report training accuracy.
svc = SVC().fit(Xtrain, Ytrain)
svc.score(X=Xtrain, y=Ytrain)

0.7551533433886375

In [14]:
# Accuracy of the SVC on the held-out windows.
svc.score(X=Xtest, y=Ytest)

0.49603174603174605

In [15]:
# Random forest with default hyper-parameters.
# random_state pins the forest's randomness (bootstrap sampling of rows and
# the feature subsets tried at each split) so that a fresh
# Restart & Run All reproduces the same model and the same scores.
# NOTE: the scores recorded in this notebook came from an unseeded run, so
# the held-out accuracy may differ slightly after re-execution.
rf = RandomForestClassifier(random_state=0)
rf.fit(Xtrain, Ytrain)

# Training accuracy.
rf.score(Xtrain, Ytrain)

1.0

In [16]:
# Accuracy of the random forest on the held-out windows.
rf.score(X=Xtest, y=Ytest)

0.48412698412698413

In [17]:
# Exercise: maybe you believe walk-forward validation will be
# more realistic - will it lead to better results?

In [None]:
# Exercise: do you think using raw (log) prices would have worked?