In [17]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS , summarize)

from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
(LinearDiscriminantAnalysis as LDA ,
QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [18]:
Smarket = load_data('Smarket')
Smarket

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.010,1.19130,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.29650,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.41120,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.27600,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.20570,0.213,Up
...,...,...,...,...,...,...,...,...,...
1245,2005,0.422,0.252,-0.024,-0.584,-0.285,1.88850,0.043,Up
1246,2005,0.043,0.422,0.252,-0.024,-0.584,1.28581,-0.955,Down
1247,2005,-0.955,0.043,0.422,0.252,-0.024,1.54047,0.130,Up
1248,2005,0.130,-0.955,0.043,0.422,0.252,1.42236,-0.298,Down


In [19]:
allvars = Smarket.columns.drop(['Today', 'Direction', 'Year'])
design = MS(allvars)
X = design.fit_transform(Smarket)
Y = Smarket.Direction == 'Up'
# glm = sm.GLM(Y, X, family=sm.families.Binomial())
# results = glm.fit()
# summarize(results)
# results.params

In [20]:
train = (Smarket.Year < 2005)
Smarket_train = Smarket.loc[train]
Smarket_test = Smarket.loc[~train]
# Smarket_test.shape

X_train, X_test = X.loc[train], X.loc[~train]
Y_train, Y_test = Y.loc[train], Y.loc[~train]

glm_train = sm.GLM(
    Y_train,
    X_train,
    family=sm.families.Binomial()
)
results = glm_train.fit()
probs = results.predict(X_test)

In [21]:
D = Smarket.Direction
L_train, L_test = D.loc[train], D.loc[~train]

labels = np.array(["Down"] * 252)
labels[probs > 0.5] = "Up"
confusion_table(labels, L_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,77,97
Up,34,44


In [22]:
"""
    Since all 5 vars have had underwhelming p-values, we could try and drop some of them to obtain a more effective model.

    Predictors that have no relationship with the response tend to deteriorate the test error rate since they cause an increase in variance without a corresponding decrease in bias.
"""

'\n    Since all 5 vars have had underwhelming p-values, we could try and drop some of them to obtain a more effective model.\n\n    Predictors that have no relationship with the response tend to deteriorate the test error rate since they cause an increase in variance without a corresponding decrease in bias.\n'

In [23]:
model = MS(['Lag1', 'Lag2']).fit(Smarket)
X = model.transform(Smarket)
X_train , X_test = X.loc[train], X.loc[~train]
glm_train = sm.GLM(Y_train , X_train , family=sm.families.Binomial ())
results = glm_train.fit()
probs = results.predict(exog=X_test)
labels = np.array (['Down ']*252)
labels[probs > 0.5] = 'Up'
confusion_table(labels , L_test)

Truth,Down,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Down,0,0,0
Down,35,0,35
Up,76,0,106


In [24]:
newdata = pd.DataFrame ({'Lag1':[1.2 , 1.5],
                        'Lag2':[1.1 , -0.8]});
newX = model.transform(newdata)
results.predict(newX)

0    0.479146
1    0.496094
dtype: float64

In [25]:
"""

LDA, ....

"""

'\n\nLDA, ....\n\n'

In [26]:
lda = LDA(store_covariance=True)
X_train, X_test = [M.drop(columns=['intercept']) for M in (X_train, X_test)]
lda.fit(X_train, L_train)
# lda.classes_
# lda.means_
# lda.priors_
# lda.scalings_
lda_pred = lda.predict(X_test)
confusion_table(lda_pred, L_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,35,35
Up,76,106


In [27]:
lda_prob = lda.predict_proba(X_test)
# [:,1] is the second column, which corresponds to the "Up" class
np.all(
    np.where(lda_prob[:,1] >= 0.5, 'Up', 'Down') == lda_pred
)

np.True_

In [28]:
"""

QDA

"""

'\n\nQDA\n\n'

In [29]:
qda = QDA(store_covariance=True)
qda.fit(X_train, L_train)

qda_pred = qda.predict(X_test)
confusion_table(qda_pred, L_test)
np.mean(qda_pred == L_test)

np.float64(0.5992063492063492)

In [30]:
"""

Naive Bayes

"""

'\n\nNaive Bayes\n\n'

In [31]:
NB = GaussianNB()
NB.fit(X_train, L_train)
NB.theta_
NB.var_
X_train[L_train == "Down"].mean()
X_train[L_train == "Down"].var(ddof=0)
nb_labels = NB.predict(X_test)
confusion_table(nb_labels, L_test)
NB.predict_proba(X_test)[:5]

array([[0.4873288 , 0.5126712 ],
       [0.47623584, 0.52376416],
       [0.46529531, 0.53470469],
       [0.47484469, 0.52515531],
       [0.49020587, 0.50979413]])

In [32]:
"""

KNN

"""

'\n\nKNN\n\n'

In [34]:
knn1 = KNeighborsClassifier(n_neighbors=3)
knn1.fit(X_train, L_train)
knn1_pred = knn1.predict(X_test)
confusion_table(knn1_pred, L_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,48,55
Up,63,86
