In [60]:
import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

from vis_func import make_confusion_matrix

In [5]:
## Import Stock Market Data

## Load the Smarket dataset; the first CSV column ('Unnamed: 0') holds the row labels
## and is used as the index.
## Forward slashes are used so the relative path resolves on Windows, macOS and Linux,
## and so the literal contains no backslash sequences such as '\.', '\d' or '\S'
## that Python would try to read as string escapes.
smarket = pd.read_csv('../../datasets/Smarket.csv', index_col='Unnamed: 0')
display(smarket)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
1,2001,0.381,-0.192,-2.624,-1.055,5.010,1.19130,0.959,Up
2,2001,0.959,0.381,-0.192,-2.624,-1.055,1.29650,1.032,Up
3,2001,1.032,0.959,0.381,-0.192,-2.624,1.41120,-0.623,Down
4,2001,-0.623,1.032,0.959,0.381,-0.192,1.27600,0.614,Up
5,2001,0.614,-0.623,1.032,0.959,0.381,1.20570,0.213,Up
...,...,...,...,...,...,...,...,...,...
1246,2005,0.422,0.252,-0.024,-0.584,-0.285,1.88850,0.043,Up
1247,2005,0.043,0.422,0.252,-0.024,-0.584,1.28581,-0.955,Down
1248,2005,-0.955,0.043,0.422,0.252,-0.024,1.54047,0.130,Up
1249,2005,0.130,-0.955,0.043,0.422,0.252,1.42236,-0.298,Down


In [7]:
## Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
## Note: describe() does not report dtypes; use smarket.dtypes or smarket.info() for those.

smarket.describe()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
count,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0
mean,2003.016,0.003834,0.003919,0.001716,0.001636,0.00561,1.478305,0.003138
std,1.409018,1.136299,1.13628,1.138703,1.138774,1.14755,0.360357,1.136334
min,2001.0,-4.922,-4.922,-4.922,-4.922,-4.922,0.35607,-4.922
25%,2002.0,-0.6395,-0.6395,-0.64,-0.64,-0.64,1.2574,-0.6395
50%,2003.0,0.039,0.039,0.0385,0.0385,0.0385,1.42295,0.0385
75%,2004.0,0.59675,0.59675,0.59675,0.59675,0.597,1.641675,0.59675
max,2005.0,5.733,5.733,5.733,5.733,5.733,3.15247,5.733


In [9]:
## Pairwise correlations between the numeric columns.
## numeric_only=True leaves out the non-numeric Direction column ('Up'/'Down').
smarket.corr(numeric_only=True)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
Year,1.0,0.0297,0.030596,0.033195,0.035689,0.029788,0.539006,0.030095
Lag1,0.0297,1.0,-0.026294,-0.010803,-0.002986,-0.005675,0.04091,-0.026155
Lag2,0.030596,-0.026294,1.0,-0.025897,-0.010854,-0.003558,-0.043383,-0.01025
Lag3,0.033195,-0.010803,-0.025897,1.0,-0.024051,-0.018808,-0.041824,-0.002448
Lag4,0.035689,-0.002986,-0.010854,-0.024051,1.0,-0.027084,-0.048414,-0.0069
Lag5,0.029788,-0.005675,-0.003558,-0.018808,-0.027084,1.0,-0.022002,-0.03486
Volume,0.539006,0.04091,-0.043383,-0.041824,-0.048414,-0.022002,1.0,0.014592
Today,0.030095,-0.026155,-0.01025,-0.002448,-0.0069,-0.03486,0.014592,1.0


## Logistic Regression

#### First, use statsmodels to reproduce the output shown in the book

In [48]:
## Predictors: every column except Year, Today and the response (Direction)
X = smarket.drop(columns=['Year', 'Direction', 'Today'])
## Statsmodels requires you add your own constant (intercept) column
X = sm.add_constant(X)

## Binarize the categorical response (Up -> 1, Down -> 0).
## .map is used rather than .replace: swapping strings for ints via .replace relies on
## implicit downcasting of the object column, which newer pandas versions flag with a
## FutureWarning. With .map, any label other than 'Up'/'Down' becomes NaN.
y = smarket['Direction'].map({'Up': 1, 'Down': 0})

## Fit Logistic Regression model
log_reg = sm.Logit(y, X).fit()
print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.691034
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:              Direction   No. Observations:                 1250
Model:                          Logit   Df Residuals:                     1243
Method:                           MLE   Df Model:                            6
Date:                Wed, 05 Apr 2023   Pseudo R-squ.:                0.002074
Time:                        16:56:32   Log-Likelihood:                -863.79
converged:                       True   LL-Null:                       -865.59
Covariance Type:            nonrobust   LLR p-value:                    0.7319
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1260      0.241     -0.523      0.601      -0.598       0.346
Lag1          -0.0731      0.

#### Next, use scikit-learn for the actual implementation

In [80]:
## Train/Test Split
## Following the book, the split is by date: rows before 2005 are used for fitting and
## rows from 2005 onward are held out. sklearn's train_test_split is not used because
## the split here is on Year.

in_train_years = smarket['Year'] < 2005
in_test_years = smarket['Year'] >= 2005
train = smarket[in_train_years]
test = smarket[in_test_years]

## Columns excluded from the predictors (Direction is the response)
non_predictors = ['Year', 'Direction', 'Today']
X_train, y_train = train.drop(columns=non_predictors), train['Direction']
X_test, y_test = test.drop(columns=non_predictors), test['Direction']

## NOTE(review): per the scikit-learn docs LogisticRegression is regularized by default,
## so these coefficients need not match an unpenalized fit -- confirm this is intended.
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

## Per the scikit-learn docs: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)

print(clf.coef_)

print(cm)

[[-0.05410202 -0.04559333  0.00727805  0.00653897 -0.00415829 -0.10995391]]
[[74 37]
 [93 48]]


In [72]:
## Refit without Lag3, Lag4 and Lag5 (everything else stays the same as the previous fit)

in_train_years = smarket['Year'] < 2005
in_test_years = smarket['Year'] >= 2005
train = smarket[in_train_years]
test = smarket[in_test_years]

## Same exclusions as before, plus the three longer lags
non_predictors = ['Year', 'Direction', 'Today', 'Lag3', 'Lag4', 'Lag5']
X_train, y_train = train.drop(columns=non_predictors), train['Direction']
X_test, y_test = test.drop(columns=non_predictors), test['Direction']

clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

## Per the scikit-learn docs: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[77 34]
 [97 44]]
