In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from prepare import titanic_pipeline

<h1 class="alert alert-block alert-success" style="font-size: 20px;"> Methodologies I > Classification > Modeling > KNN</h1>  

<h2 class="alert alert-block alert-info" style="font-size: 20px;">1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)</h2>


In [6]:
# Acquire and prepare the Titanic data, unpacked into train / validate / test frames
train, val, test = titanic_pipeline()
# Sanity-check the size of each split, then preview the first training rows
print(*(split.shape for split in (train, val, test)))
train.head()


(623, 9) (134, 9) (134, 9)


Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,embark_town,alone
748,0,male,19.0,1,0,53.1,First,Southampton,0
45,0,male,29.0,0,0,8.05,Third,Southampton,1
28,1,female,29.0,0,0,7.8792,Third,Queenstown,1
633,0,male,29.0,0,0,0.0,First,Southampton,1
403,0,male,28.0,1,0,15.85,Third,Southampton,0


In [7]:
# Separate the predictors from the target column ('survived') for the train split
X_train, y_train = train.drop(columns='survived'), train['survived']

# Same separation for the validate split
X_val, y_val = val.drop(columns='survived'), val['survived']

In [8]:
# One-hot encode the categorical columns of the training features
X_train = pd.get_dummies(X_train)
# Encode validate the same way, then align it to the training columns:
# a category missing from validate becomes an all-zero column and a category
# seen only in validate is dropped, so both frames share one feature layout.
X_val = pd.get_dummies(X_val).reindex(columns=X_train.columns, fill_value=0)
#check results
X_train.head()

Unnamed: 0,age,sibsp,parch,fare,alone,sex_female,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
748,19.0,1,0,53.1,0,False,True,True,False,False,False,False,True
45,29.0,0,0,8.05,1,False,True,False,False,True,False,False,True
28,29.0,0,0,7.8792,1,True,False,False,False,True,False,True,False
633,29.0,0,0,0.0,1,False,True,True,False,False,False,False,True
403,28.0,1,0,15.85,0,False,True,False,False,True,False,False,True


In [9]:
# Build a KNN classifier with scikit-learn's default settings (no arguments passed)
# NOTE(review): the features are passed in unscaled even though MinMaxScaler is
# imported at the top of the notebook; KNN is distance-based, so confirm whether
# scaling was intended.
knn = KNeighborsClassifier()

# Fit on the training features and target
knn.fit(X_train, y_train)

<h2 class="alert alert-block alert-info" style="font-size: 20px;">2. Evaluate your results using the model score, confusion matrix, and classification report.</h2>


In [11]:
# Predict on the same rows the model was fitted on (in-sample predictions)
train_preds = knn.predict(X_train)
# Preview the first ten predicted labels
train_preds[:10]

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [10]:
# Mean accuracy on train and on validate, displayed as a (train, validate) tuple
knn.score(X_train, y_train), knn.score(X_val, y_val)

(0.8089887640449438, 0.7014925373134329)

In [12]:
# Classification report: per-class precision, recall, f1-score and support
# for the in-sample predictions
print(classification_report(y_train, train_preds))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       384
           1       0.78      0.70      0.74       239

    accuracy                           0.81       623
   macro avg       0.80      0.79      0.79       623
weighted avg       0.81      0.81      0.81       623



In [13]:
# Build the confusion matrix from y_train (actual target variable) and the train predictions
conf_matrix = confusion_matrix(y_train, train_preds)
print(conf_matrix)
# Unpack the 2x2 matrix row by row (rows = actual class, columns = predicted class)
# so each outcome count can be used in later calculations
(tn, fp), (fn, tp) = conf_matrix
print(tn, fp, fn, tp)

[[337  47]
 [ 72 167]]
337 47 72 167


<h2 class="alert alert-block alert-info" style="font-size: 20px;">3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.</h2>


In [14]:
# Derive the four confusion-matrix rates; each pair shares a denominator
actual_pos = tp + fn  # all rows whose actual class is positive
actual_neg = tn + fp  # all rows whose actual class is negative
tpr, fnr = tp / actual_pos, fn / actual_pos
tnr, fpr = tn / actual_neg, fp / actual_neg
# Display the rates, one labelled line each
print(
    f"True Positive Rate    {tpr:.4f}",
    f"False Positive Rate   {fpr:.4f}",
    f"True Negative Rate    {tnr:.4f}",
    f"False Negative Rate   {fnr:.4f}",
    sep="\n",
)

True Positive Rate    0.6987
False Positive Rate   0.1224
True Negative Rate    0.8776
False Negative Rate   0.3013


<h2 class="alert alert-block alert-info" style="font-size: 20px;">4. Run through steps 1-3 setting k to 10</h2>


In [15]:
# Reload the prepared train / validate / test splits before repeating the steps with k = 10
train, val, test = titanic_pipeline()

In [16]:
# Separate the predictors from the target column ('survived') for the train split
X_train, y_train = train.drop(columns='survived'), train['survived']

# Same separation for the validate split
X_val, y_val = val.drop(columns='survived'), val['survived']

In [17]:
# One-hot encode the categorical columns of the training features
X_train = pd.get_dummies(X_train)
# Encode validate the same way, then align it to the training columns:
# a category missing from validate becomes an all-zero column and a category
# seen only in validate is dropped, so both frames share one feature layout.
X_val = pd.get_dummies(X_val).reindex(columns=X_train.columns, fill_value=0)
#check results
X_train.head()

Unnamed: 0,age,sibsp,parch,fare,alone,sex_female,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
748,19.0,1,0,53.1,0,False,True,True,False,False,False,False,True
45,29.0,0,0,8.05,1,False,True,False,False,True,False,False,True
28,29.0,0,0,7.8792,1,True,False,False,False,True,False,True,False
633,29.0,0,0,0.0,1,False,True,True,False,False,False,False,True
403,28.0,1,0,15.85,0,False,True,False,False,True,False,False,True


In [18]:
# KNN classifier that votes among the 10 nearest neighbors
knn = KNeighborsClassifier(n_neighbors= 10)

# Fit on the training features and target
knn.fit(X_train, y_train)

In [19]:
# Predict on the same rows the k = 10 model was fitted on (in-sample predictions)
train_preds = knn.predict(X_train)
# Preview the first ten predicted labels
train_preds[:10]

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [20]:
# Mean accuracy on train and on validate, displayed as a (train, validate) tuple
knn.score(X_train, y_train), knn.score(X_val, y_val)

(0.7592295345104334, 0.7238805970149254)

In [21]:
# Classification report: per-class precision, recall, f1-score and support
# for the in-sample predictions of the k = 10 model
print(classification_report(y_train, train_preds))

              precision    recall  f1-score   support

           0       0.76      0.88      0.82       384
           1       0.75      0.56      0.64       239

    accuracy                           0.76       623
   macro avg       0.76      0.72      0.73       623
weighted avg       0.76      0.76      0.75       623



In [22]:
# Build the confusion matrix from y_train (actual target variable) and the train predictions
conf_matrix = confusion_matrix(y_train, train_preds)
print(conf_matrix)
# Unpack the 2x2 matrix row by row (rows = actual class, columns = predicted class)
# so each outcome count can be used in later calculations
(tn, fp), (fn, tp) = conf_matrix
print(tn, fp, fn, tp)

[[339  45]
 [105 134]]
339 45 105 134


In [23]:
# Derive the four confusion-matrix rates; each pair shares a denominator
actual_pos = tp + fn  # all rows whose actual class is positive
actual_neg = tn + fp  # all rows whose actual class is negative
tpr, fnr = tp / actual_pos, fn / actual_pos
tnr, fpr = tn / actual_neg, fp / actual_neg
# Display the rates, one labelled line each
print(
    f"True Positive Rate    {tpr:.4f}",
    f"False Positive Rate   {fpr:.4f}",
    f"True Negative Rate    {tnr:.4f}",
    f"False Negative Rate   {fnr:.4f}",
    sep="\n",
)

True Positive Rate    0.5607
False Positive Rate   0.1172
True Negative Rate    0.8828
False Negative Rate   0.4393


<h2 class="alert alert-block alert-info" style="font-size: 20px;">5. Run through steps 1-3 setting k to 20</h2>


In [24]:
# Reload the prepared train / validate / test splits before repeating the steps with k = 20
train, val, test = titanic_pipeline()

In [25]:
# Separate the predictors from the target column ('survived') for the train split
X_train, y_train = train.drop(columns='survived'), train['survived']

# Same separation for the validate split
X_val, y_val = val.drop(columns='survived'), val['survived']

In [26]:
# One-hot encode the categorical columns of the training features
X_train = pd.get_dummies(X_train)
# Encode validate the same way, then align it to the training columns:
# a category missing from validate becomes an all-zero column and a category
# seen only in validate is dropped, so both frames share one feature layout.
X_val = pd.get_dummies(X_val).reindex(columns=X_train.columns, fill_value=0)
#check results
X_train.head()

Unnamed: 0,age,sibsp,parch,fare,alone,sex_female,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
748,19.0,1,0,53.1,0,False,True,True,False,False,False,False,True
45,29.0,0,0,8.05,1,False,True,False,False,True,False,False,True
28,29.0,0,0,7.8792,1,True,False,False,False,True,False,True,False
633,29.0,0,0,0.0,1,False,True,True,False,False,False,False,True
403,28.0,1,0,15.85,0,False,True,False,False,True,False,False,True


In [27]:
# KNN classifier that votes among the 20 nearest neighbors
knn = KNeighborsClassifier(n_neighbors= 20)

# Fit on the training features and target
knn.fit(X_train, y_train)

In [28]:
# Predict on the same rows the k = 20 model was fitted on (in-sample predictions)
train_preds = knn.predict(X_train)
# Preview the first ten predicted labels
train_preds[:10]

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [29]:
# Mean accuracy on train and on validate, displayed as a (train, validate) tuple
knn.score(X_train, y_train), knn.score(X_val, y_val)

(0.7383627608346709, 0.746268656716418)

In [30]:
# Classification report: per-class precision, recall, f1-score and support
# for the in-sample predictions of the k = 20 model
print(classification_report(y_train, train_preds))

              precision    recall  f1-score   support

           0       0.73      0.90      0.81       384
           1       0.75      0.48      0.58       239

    accuracy                           0.74       623
   macro avg       0.74      0.69      0.70       623
weighted avg       0.74      0.74      0.72       623



In [31]:
# Build the confusion matrix from y_train (actual target variable) and the train predictions
conf_matrix = confusion_matrix(y_train, train_preds)
print(conf_matrix)
# Unpack the 2x2 matrix row by row (rows = actual class, columns = predicted class)
# so each outcome count can be used in later calculations
(tn, fp), (fn, tp) = conf_matrix
print(tn, fp, fn, tp)

[[346  38]
 [125 114]]
346 38 125 114


In [32]:
# Derive the four confusion-matrix rates; each pair shares a denominator
actual_pos = tp + fn  # all rows whose actual class is positive
actual_neg = tn + fp  # all rows whose actual class is negative
tpr, fnr = tp / actual_pos, fn / actual_pos
tnr, fpr = tn / actual_neg, fp / actual_neg
# Display the rates, one labelled line each
print(
    f"True Positive Rate    {tpr:.4f}",
    f"False Positive Rate   {fpr:.4f}",
    f"True Negative Rate    {tnr:.4f}",
    f"False Negative Rate   {fnr:.4f}",
    sep="\n",
)

True Positive Rate    0.4770
False Positive Rate   0.0990
True Negative Rate    0.9010
False Negative Rate   0.5230


<h2 class="alert alert-block alert-info" style="font-size: 20px;">6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?</h2>


<h2 style='background-color: #F5F5F5; padding-left: 40px; padding-top: 10px; padding-bottom: 10px; padding-right: 10px; color: Green; font-size: 18px; box-sizing: border-box;'>Neighbors set to default (5)

(0.8089887640449438, 0.7014925373134329)

Neighbors set to 10
<br> 
(0.7592295345104334, 0.7238805970149254)

Neighbors set to 20
<br> 
(0.7383627608346709, 0.746268656716418).

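As the number of neighbors increases, accuracy on the train data goes down (0.81 → 0.76 → 0.74) while accuracy on the validation data goes up (0.70 → 0.72 → 0.75). The default model (k = 5) performs best on the in-sample data: with fewer neighbors each prediction follows the training points more closely, which raises train accuracy but generalizes worse to validate.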

<h2 class="alert alert-block alert-info" style="font-size: 20px;">7. Which model performs best on our out-of-sample data from validate?</h2>


<h2 style='background-color: #F5F5F5; padding-left: 40px; padding-top: 10px; padding-bottom: 10px; padding-right: 10px; color: Green; font-size: 18px; box-sizing: border-box;'>Neighbors set to 20
<br> 
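(0.7383627608346709, 0.746268656716418) — (train accuracy, validation accuracy). This is the highest validation accuracy of the three models.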

In [34]:
# Sweep n_neighbors from 1 through 19, recording train and validate accuracy for each k.
# NOTE(review): range(1, 20) stops at 19, so the k = 20 model from section 5 is not
# part of this sweep -- confirm whether that is intended.
neighboor = list(range(1, 20))
train_acc, val_acc = [], []

for nb_range in neighboor:
    # fit() returns the estimator itself, so knn is the fitted model for this k
    knn = KNeighborsClassifier(n_neighbors=nb_range).fit(X_train, y_train)

    train_acc.append(knn.score(X_train, y_train))
    val_acc.append(knn.score(X_val, y_val))


In [35]:
# Collect the sweep results into one table: one row per k, with its train and validate accuracy
sweep_columns = {
    'neighboor': neighboor,
    'train_acc': train_acc,
    'val_acc': val_acc,
}
knn_df = pd.DataFrame(sweep_columns)

knn_df

Unnamed: 0,neighboor,train_acc,val_acc
0,1,0.969502,0.69403
1,2,0.831461,0.664179
2,3,0.82825,0.686567
3,4,0.791332,0.686567
4,5,0.808989,0.701493
5,6,0.783307,0.701493
6,7,0.797753,0.686567
7,8,0.792937,0.708955
8,9,0.784912,0.708955
9,10,0.75923,0.723881
