In [79]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
%matplotlib inline

In [13]:
# Read data
smarket = pd.read_csv('Data/Smarket.csv', usecols = range(1,10))

# Convert direction to dummy variables 'Up' and 'Down' (binary)
dummy_directions = pd.get_dummies(smarket['Direction'])

# Remove original 'Direction' column and 'Down' dummy variable to avoid the >>dummy variable trap<<
smarket = smarket.ix[:,: 'Today'].join(dummy_directions['Up'])

# Add intercept
smarket['Intercept'] = 1.0

# Verify dataframe
smarket.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Up,Intercept
0,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,1,1.0
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,1,1.0
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,0,1.0
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,1,1.0
4,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,1,1.0


In [44]:
# We now perform logistic regression of 'Up' onto the 'Lag's and 'Volume'
log_reg = sm.Logit(smarket.Up, smarket.drop(['Up','Year','Today'], axis=1))
fit_model = log_reg.fit()
print(fit_model.summary())

Optimization terminated successfully.
         Current function value: 2.197001
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                     Up   No. Observations:                 1250
Model:                          Logit   Df Residuals:                     1243
Method:                           MLE   Df Model:                            6
Date:                Wed, 26 Apr 2017   Pseudo R-squ.:                  -5.451
Time:                        17:46:47   Log-Likelihood:                -2746.3
converged:                       True   LL-Null:                       -425.74
                                        LLR p-value:                     1.000
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Lag1          -0.0731      0.050     -1.457      0.145        -0.171     0.025
Lag2          -0.0423      0.

In [83]:
# We calculate the confusion matrix
confusion_data = fit_model.pred_table(threshold=0.5).T

confusion_matrix = pd.DataFrame(confusion_data, columns=['Down', 'Up'], index = ['Down', 'Up'], dtype=int)
print("Confusion matrix:\n", confusion_matrix, "\n")
# The diagonal elements are the number of correct predictions
# The off-diagonal elements are the number of wrong predictions

correct = sum(np.diagonal(confusion_data))
ratio = correct/np.sum(confusion_data)
print("The fraction of correct guesses is: %0.2f" % ratio)
print("The error rate is: %0.2f" % (1-ratio))

Confusion matrix:
       Down   Up
Down   145  141
Up     457  507 

The fraction of correct guesses is: 0.52
The error rate is: 0.48
