In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, KFold, LeaveOneOut

In [3]:
# Load the weekly returns data set.
data = pd.read_csv('data/Weekly.csv')
# Encode the response as an integer 0/1 indicator (1 = 'Up').
# An explicit comparison is used instead of pd.get_dummies(...)['Up'] because
# newer pandas releases return boolean dummy columns by default, and a boolean
# response may be treated as categorical by formula-based model fitting.
# It also avoids a KeyError if no row happens to be labelled 'Up'.
data['Direc_Up'] = (data['Direction'] == 'Up').astype(int)
data

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction,Direc_Up
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.270,Down,0
1,1990,-0.270,0.816,1.572,-3.936,-0.229,0.148574,-2.576,Down,0
2,1990,-2.576,-0.270,0.816,1.572,-3.936,0.159837,3.514,Up,1
3,1990,3.514,-2.576,-0.270,0.816,1.572,0.161630,0.712,Up,1
4,1990,0.712,3.514,-2.576,-0.270,0.816,0.153728,1.178,Up,1
...,...,...,...,...,...,...,...,...,...,...
1084,2010,-0.861,0.043,-2.173,3.599,0.015,3.205160,2.969,Up,1
1085,2010,2.969,-0.861,0.043,-2.173,3.599,4.242568,1.281,Up,1
1086,2010,1.281,2.969,-0.861,0.043,-2.173,4.835082,0.283,Up,1
1087,2010,0.283,1.281,2.969,-0.861,0.043,4.454044,1.034,Up,1


In [4]:
# Fit a logistic regression of Direc_Up on Lag1 and Lag2 through the
# statsmodels formula API (a GLM with a Binomial family), then show the
# fitted model's summary table.
stats_model = (
    smf.glm(
        formula='Direc_Up ~ Lag1 + Lag2',
        data=data,
        family=sm.families.Binomial(),
    )
    .fit()
)
print(stats_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               Direc_Up   No. Observations:                 1089
Model:                            GLM   Df Residuals:                     1086
Model Family:                Binomial   Df Model:                            2
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -744.11
Date:                Sun, 11 Oct 2020   Deviance:                       1488.2
Time:                        19:19:41   Pearson chi2:                 1.09e+03
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.2212      0.061      3.599      0.0

In [5]:
# Turn the model's predicted probabilities into class labels:
# 1 when the probability is above 0.5, otherwise 0.
stats_model_pred = [int(prob > 0.5) for prob in stats_model.predict()]
# Misclassification rate = 1 - share of predictions equal to the observed label.
train_accuracy = np.mean(stats_model_pred == data['Direc_Up'])
print('Training test error: %.4f' % (1 - train_accuracy))

Training test error: 0.4444


In [6]:
# Perform LOOCV using KFold with n_splits = n (one fold per observation).
# The fold count is derived from the data instead of being hard-coded, so the
# split remains a true leave-one-out scheme if the number of rows changes.
crossvalidation = KFold(n_splits=len(data), shuffle=False)
model = LogisticRegression()
scores = cross_val_score(model, data[['Lag1', 'Lag2']], data['Direc_Up'], cv=crossvalidation)
# Every fold holds out a single row, so 1 - mean(score) is the LOOCV error rate.
print('Test error: %.5f' %(1-np.mean(scores)))

Test error: 0.44995


In [7]:
# Perform LOOCV using an explicit for loop.
# For each observation: fit on all other rows, predict the held-out row, and
# record whether the prediction was wrong (1.0) or right (0.0).
predictors = ['Lag1', 'Lag2']
positions = np.arange(len(data))  # row positions; avoids hard-coding the sample size
result = []
for i in positions:
    # Select both the training rows and the held-out row by position so the
    # two always agree, regardless of what the DataFrame index labels are.
    data_sample = data.iloc[positions != i]
    # Double brackets keep the held-out row as a one-row DataFrame, preserving
    # the numeric dtypes and feature names seen during fitting.
    test_sample = data.iloc[[i]]
    lm = LogisticRegression().fit(data_sample[predictors], data_sample['Direc_Up'])
    lm_pred = lm.predict(test_sample[predictors])
    error = 1 - np.mean(lm_pred == test_sample['Direc_Up'].to_numpy())
    result.append(error)
print('Test error: %.5f' %(np.mean(result)))

Test error: 0.44995
