In [23]:
#imports
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline

from sklearn.metrics import confusion_matrix, classification_report



dataset = pd.read_csv("Smarket.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,1,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
1,2,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2,3,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
3,4,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
4,5,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [11]:
formula = "Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
model = smf.glm(formula=formula, data=dataset, family=sm.families.Binomial())
result = model.fit()
print(result.summary())

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                 1250
Model:                                              GLM   Df Residuals:                     1243
Model Family:                                  Binomial   Df Model:                            6
Link Function:                                    Logit   Scale:                          1.0000
Method:                                            IRLS   Log-Likelihood:                -863.79
Date:                                  Wed, 11 Oct 2023   Deviance:                       1727.6
Time:                                          14:17:34   Pearson chi2:                 1.25e+03
No. Iterations:                                       4   Pseudo R-squ. (CS):           0.002868
Covariance Type:                              nonrobust                                         
                 coef    std e

In [16]:
print(f"Coefficient:\n{result.params}")
print(f"p-values:\n{result.pvalues}")
print(f"Dependant variables:\n{result.model.endog_names}")

Coefficient:
Intercept    0.126000
Lag1         0.073074
Lag2         0.042301
Lag3        -0.011085
Lag4        -0.009359
Lag5        -0.010313
Volume      -0.135441
dtype: float64
p-values:
Intercept    0.600700
Lag1         0.145232
Lag2         0.398352
Lag3         0.824334
Lag4         0.851445
Lag5         0.834998
Volume       0.392404
dtype: float64
Dependant variables:
['Direction[Down]', 'Direction[Up]']


In [17]:
predictions = result.predict()
print(predictions[:10])

[0.49291587 0.51853212 0.51886117 0.48477764 0.48921884 0.49304354
 0.50734913 0.49077084 0.48238647 0.51116222]


In [19]:
print(np.column_stack((dataset['Direction'].to_numpy().flatten(), result.model.endog)))

[['Up' 0.0]
 ['Up' 0.0]
 ['Down' 1.0]
 ...
 ['Up' 0.0]
 ['Down' 1.0]
 ['Down' 1.0]]


In [21]:
predictions_nominal = ["Up" if x < 0.5 else "Down" for x in predictions]
# predictions_nominal

In [28]:
print(confusion_matrix(dataset['Direction'], predictions_nominal))

[[145 457]
 [141 507]]


In [27]:
print(classification_report(dataset['Direction'], predictions_nominal, digits=3))

              precision    recall  f1-score   support

        Down      0.507     0.241     0.327       602
          Up      0.526     0.782     0.629       648

    accuracy                          0.522      1250
   macro avg      0.516     0.512     0.478      1250
weighted avg      0.517     0.522     0.483      1250

