In [30]:
import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
## Import Stock Market Data (Smarket dataset)

## Forward slashes keep the relative path portable and avoid the invalid
## escape sequences ('\.', '\d', '\S') that the backslash form produced.
## The unnamed first CSV column holds the row number and is used as the index.
smarket = pd.read_csv('../../datasets/Smarket.csv', index_col='Unnamed: 0')
display(smarket)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
1,2001,0.381,-0.192,-2.624,-1.055,5.010,1.19130,0.959,Up
2,2001,0.959,0.381,-0.192,-2.624,-1.055,1.29650,1.032,Up
3,2001,1.032,0.959,0.381,-0.192,-2.624,1.41120,-0.623,Down
4,2001,-0.623,1.032,0.959,0.381,-0.192,1.27600,0.614,Up
5,2001,0.614,-0.623,1.032,0.959,0.381,1.20570,0.213,Up
...,...,...,...,...,...,...,...,...,...
1246,2005,0.422,0.252,-0.024,-0.584,-0.285,1.88850,0.043,Up
1247,2005,0.043,0.422,0.252,-0.024,-0.584,1.28581,-0.955,Down
1248,2005,-0.955,0.043,0.422,0.252,-0.024,1.54047,0.130,Up
1249,2005,0.130,-0.955,0.043,0.422,0.252,1.42236,-0.298,Down


In [7]:
## Rerun while removing all Lags >= 3 (and Volume), leaving Lag1 and Lag2 as features
## k=1. High Variance

## Split by year: fit on observations before 2005, evaluate on 2005 onward
train = smarket.loc[smarket['Year'] < 2005]
test = smarket.loc[smarket['Year'] >= 2005]

## Columns removed from the feature matrix, listed once so train and test cannot drift apart
dropped_cols = ['Year', 'Direction', 'Today', 'Lag3', 'Lag4', 'Lag5', 'Volume']
X_train = train.drop(columns=dropped_cols)
X_test = test.drop(columns=dropped_cols)

y_train = train['Direction']
y_test = test['Direction']

clf = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_pred = clf.predict(X_test)

## Confirm which columns the classifier was actually fitted on
print('Feature Name Order:\n', clf.feature_names_in_, '\n')

## Rows are the true classes, columns are the predicted classes
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm, '\n')

print('Accuracy:\n', accuracy_score(y_test, y_pred))

Feature Name Order:
 ['Lag1' 'Lag2'] 

Confusion Matrix:
 [[43 68]
 [58 83]] 

Accuracy:
 0.5


In [8]:
## Rerun while removing all Lags >= 3 (and Volume), leaving Lag1 and Lag2 as features
## k=3. High Variance

## Split by year: fit on observations before 2005, evaluate on 2005 onward
train = smarket.loc[smarket['Year'] < 2005]
test = smarket.loc[smarket['Year'] >= 2005]

## Columns removed from the feature matrix, listed once so train and test cannot drift apart
dropped_cols = ['Year', 'Direction', 'Today', 'Lag3', 'Lag4', 'Lag5', 'Volume']
X_train = train.drop(columns=dropped_cols)
X_test = test.drop(columns=dropped_cols)

y_train = train['Direction']
y_test = test['Direction']

clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = clf.predict(X_test)

## Confirm which columns the classifier was actually fitted on
print('Feature Name Order:\n', clf.feature_names_in_, '\n')

## Rows are the true classes, columns are the predicted classes
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm, '\n')

print('Accuracy:\n', accuracy_score(y_test, y_pred))

Feature Name Order:
 ['Lag1' 'Lag2'] 

Confusion Matrix:
 [[48 63]
 [55 86]] 

Accuracy:
 0.5317460317460317


## We will now use the Caravan dataset

In [9]:
## Import the Caravan insurance dataset

## Forward slashes keep the relative path portable and avoid the invalid
## escape sequences ('\.', '\d', '\C') that the backslash form produced.
## The unnamed first CSV column holds the row number and is used as the index.
caravan = pd.read_csv('../../datasets/Caravan.csv', index_col='Unnamed: 0')
display(caravan)

Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,...,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,Purchase
1,33,1,3,2,8,0,5,1,3,7,...,0,0,0,1,0,0,0,0,0,No
2,37,1,2,2,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,No
3,37,1,2,2,8,0,4,2,4,3,...,0,0,0,1,0,0,0,0,0,No
4,9,1,3,3,3,2,3,2,4,5,...,0,0,0,1,0,0,0,0,0,No
5,40,1,4,2,10,1,4,1,4,7,...,0,0,0,1,0,0,0,0,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5818,36,1,1,2,8,0,6,1,2,1,...,0,0,0,1,0,0,0,0,0,No
5819,35,1,4,4,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,No
5820,33,1,3,4,8,0,6,0,3,5,...,0,0,0,1,0,0,0,0,0,Yes
5821,34,1,3,2,8,0,7,0,2,7,...,0,0,0,0,0,0,0,0,0,No


In [14]:
## Class balance of the target: only about 6% of people purchased caravan insurance,
## so this data is highly imbalanced
caravan['Purchase'].value_counts()

No     5474
Yes     348
Name: Purchase, dtype: int64

In [31]:
## When performing KNN, it is important to standardize your variables
## k = 1

X = caravan.drop(columns=['Purchase'])
y = caravan.Purchase

scaler = StandardScaler()
encoder = LabelEncoder()

## NOTE: the scaler is fitted on every row (train and test together) to follow the book;
## fit it on the training rows only if leakage from the test set matters
X = scaler.fit_transform(X)
## LabelEncoder sorts the class labels, so 'No' -> 0 and 'Yes' -> 1
y = encoder.fit_transform(y)

## Following the book to get same results. The first 1000 observations are the
## test set; every remaining observation is the training set
n_test = 1000
X_test = X[:n_test]
X_train = X[n_test:]  ## was X[1001:], which silently dropped the row at index 1000

y_test = y[:n_test]
y_train = y[n_test:]

clf = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_pred = clf.predict(X_test)

## Rows are the true classes, columns are the predicted classes
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm, '\n')

print('Accuracy:\n', accuracy_score(y_test, y_pred), '\n')

## Precision for the positive class (1 = 'Yes'): share of predicted purchasers who did purchase
print('Precision:\n', precision_score(y_test, y_pred))

Confusion Matrix:
 [[873  68]
 [ 50   9]] 

Accuracy:
 0.882 

Precision:
 0.11688311688311688


In [32]:
## When performing KNN, it is important to standardize your variables
## k = 3

X = caravan.drop(columns=['Purchase'])
y = caravan.Purchase

scaler = StandardScaler()
encoder = LabelEncoder()

## NOTE: the scaler is fitted on every row (train and test together) to follow the book;
## fit it on the training rows only if leakage from the test set matters
X = scaler.fit_transform(X)
## LabelEncoder sorts the class labels, so 'No' -> 0 and 'Yes' -> 1
y = encoder.fit_transform(y)

## Following the book to get same results. The first 1000 observations are the
## test set; every remaining observation is the training set
n_test = 1000
X_test = X[:n_test]
X_train = X[n_test:]  ## was X[1001:], which silently dropped the row at index 1000

y_test = y[:n_test]
y_train = y[n_test:]

clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = clf.predict(X_test)

## Rows are the true classes, columns are the predicted classes
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm, '\n')

print('Accuracy:\n', accuracy_score(y_test, y_pred), '\n')

## Precision for the positive class (1 = 'Yes'): share of predicted purchasers who did purchase
print('Precision:\n', precision_score(y_test, y_pred))

Confusion Matrix:
 [[921  20]
 [ 54   5]] 

Accuracy:
 0.926 

Precision:
 0.2


In [33]:
## When performing KNN, it is important to standardize your variables
## k = 5

X = caravan.drop(columns=['Purchase'])
y = caravan.Purchase

scaler = StandardScaler()
encoder = LabelEncoder()

## NOTE: the scaler is fitted on every row (train and test together) to follow the book;
## fit it on the training rows only if leakage from the test set matters
X = scaler.fit_transform(X)
## LabelEncoder sorts the class labels, so 'No' -> 0 and 'Yes' -> 1
y = encoder.fit_transform(y)

## Following the book to get same results. The first 1000 observations are the
## test set; every remaining observation is the training set
n_test = 1000
X_test = X[:n_test]
X_train = X[n_test:]  ## was X[1001:], which silently dropped the row at index 1000

y_test = y[:n_test]
y_train = y[n_test:]

clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = clf.predict(X_test)

## Rows are the true classes, columns are the predicted classes
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm, '\n')

print('Accuracy:\n', accuracy_score(y_test, y_pred), '\n')

## Precision for the positive class (1 = 'Yes'): share of predicted purchasers who did purchase
print('Precision:\n', precision_score(y_test, y_pred))

Confusion Matrix:
 [[930  11]
 [ 55   4]] 

Accuracy:
 0.934 

Precision:
 0.26666666666666666
