In [9]:
# DS Libraries
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# knn submodules from scikit learn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 (replaced by
# ConfusionMatrixDisplay) and is not called in the visible cells; on newer
# scikit-learn versions this import line raises ImportError.
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

# Data Acquisition
# NOTE(review): star import hides where names come from; the visible cells call
# prep_titanic and train_test_validate, which presumably come from here.
from prepare import *

import warnings
# Silences every warning for the rest of the session, including deprecation notices.
warnings.filterwarnings("ignore")

In [13]:
# Load the Titanic data via prep_titanic (presumably provided by the star import
# of `prepare` — verify) and preview the first rows.
titanic = prep_titanic()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,22.0,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,38.0,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,35.0,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,35.0,0,0,8.05,Southampton,1,1,0,1


In [31]:
# Split the data three ways, then separate features (x) from the target (y).
train, validate, test = train_test_validate(titanic, 'survived')

# Columns left out of the features: the target, the row identifier, and the raw
# string columns that already have encoded counterparts (sex_male, embark_town_*).
non_feature_cols = ['sex', 'survived', 'embark_town', 'passenger_id']

x_train, y_train = train.drop(columns=non_feature_cols), train.survived
x_val, y_val = validate.drop(columns=non_feature_cols), validate.survived
x_test, y_test = test.drop(columns=non_feature_cols), test.survived

In [32]:
# Display the training target (the `survived` labels).
y_train

583    0
22     1
878    0
15     1
101    0
      ..
573    1
95     0
396    0
245    0
707    1
Name: survived, Length: 498, dtype: int64

#### 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [34]:
# k = 1: each prediction is decided by the single closest training point.
knn = KNeighborsClassifier(n_neighbors=1)

In [35]:
# Fit on the training features and labels only.
knn.fit(x_train, y_train)

In [36]:
# In-sample predictions: the model is scored on the same rows it was fit on.
y_pred = knn.predict(x_train)

In [37]:
# Per-class probability estimates for the training rows (not referenced again
# in the visible cells).
y_pred_probability = knn.predict_proba(x_train)

In [41]:
# Display the training features the model was fit on.
# NOTE(review): this renders the whole frame; x_train.head() would keep the output short.
x_train

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,1,36.000000,0,0,40.1250,1,1,0,0
22,3,15.000000,0,0,8.0292,1,0,1,0
878,3,29.699118,0,0,7.8958,1,1,0,1
15,2,55.000000,0,0,16.0000,1,0,0,1
101,3,29.699118,0,0,7.8958,1,1,0,1
...,...,...,...,...,...,...,...,...,...
573,3,29.699118,0,0,7.7500,1,0,1,0
95,3,29.699118,0,0,8.0500,1,1,0,1
396,3,31.000000,0,0,7.8542,1,0,0,1
245,1,44.000000,2,0,90.0000,0,1,1,0


#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [58]:
# Confusion matrix of actual (y_train) vs. predicted (y_pred) labels on the training sample.
conf_knn = confusion_matrix(y_train, y_pred)

In [39]:
# Same counts as the confusion matrix, with actual labels as rows and predictions as columns.
pd.crosstab(y_train, y_pred)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,303,4
1,1,190


#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [51]:
# Request the report as a dict so it can be rendered as a table rather than plain text.
report = classification_report(y_train, y_pred, output_dict=True)
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.996711,0.979381,0.98996,0.988046,0.990064
recall,0.986971,0.994764,0.98996,0.990868,0.98996
f1-score,0.991817,0.987013,0.98996,0.989415,0.989974
support,307.0,191.0,0.98996,498.0,498.0


In [59]:
# Flatten the 2x2 confusion matrix into a 1-D array of its four counts.
conf_knn.ravel()

array([303,   4,   1, 190])

In [61]:
# Rows are actual labels and columns are predictions (0 then 1), so the
# flattened order is TN, FP, FN, TP with survived == 1 as the positive class.
TN, FP, FN, TP = conf_knn.ravel()

In [62]:
# Shared denominators: how many training rows are actually positive / negative.
actual_positives = TP + FN
actual_negatives = FP + TN

# Share of all predictions that were correct.
accuracy = (TP + TN) / (actual_positives + actual_negatives)

# Rates over the actual positives (recall is another name for the true positive rate).
TPR = TP / actual_positives
recall = TPR
FNR = FN / actual_positives

# Rates over the actual negatives.
FPR = FP / actual_negatives
TNR = TN / actual_negatives

# Share of positive predictions that were correct, and its harmonic mean with recall.
precision = TP / (TP + FP)
f1 = 2 * ((precision * recall) / (precision + recall))

# Support: number of training rows in each actual class.
support_0 = len(train.loc[train.survived == 0])
support_1 = len(train.loc[train.survived == 1])

In [129]:
# Step 3 asks for every metric to be printed with a clear label, so each value
# is paired with its name instead of being left anonymous in a bare list.
my_list = [accuracy, TPR, FPR, TNR, FNR, precision, recall, f1, support_0, support_1]
metric_labels = [
    'Accuracy',
    'True positive rate',
    'False positive rate',
    'True negative rate',
    'False negative rate',
    'Precision',
    'Recall',
    'F1-score',
    'Support (survived = 0)',
    'Support (survived = 1)',
]
for label, value in zip(metric_labels, my_list):
    # Rounded for display only; the underlying variables keep full precision.
    print(f"{label}: {round(value, 4)}")

# The former `accuracy.astype(float)` line was dropped: its result was discarded,
# so it had no effect. The type check is kept as the cell's displayed value.
type(accuracy)

numpy.float64

#### 4. Run through steps 1-3 setting k to 10

In [100]:
# Sweep k from 1 to 10, recording train and validate accuracy for each model.
model_accuracies = {}

for i in range(1, 11):
    # fit() returns the estimator itself, so construction and fitting can be chained.
    nknn = KNeighborsClassifier(n_neighbors=i).fit(x_train, y_train)
    train_score = round(nknn.score(x_train, y_train), 2)
    validate_score = round(nknn.score(x_val, y_val), 2)
    model_accuracies[f"{i}-Neighbors"] = {
        'Train Score': train_score,
        'Validate Score': validate_score,
    }
model_accuracies

{'1-Neighbors': {'Train Score': 0.99, 'Validate Score': 0.73},
 '2-Neighbors': {'Train Score': 0.84, 'Validate Score': 0.71},
 '3-Neighbors': {'Train Score': 0.85, 'Validate Score': 0.74},
 '4-Neighbors': {'Train Score': 0.78, 'Validate Score': 0.72},
 '5-Neighbors': {'Train Score': 0.79, 'Validate Score': 0.71},
 '6-Neighbors': {'Train Score': 0.77, 'Validate Score': 0.72},
 '7-Neighbors': {'Train Score': 0.77, 'Validate Score': 0.71},
 '8-Neighbors': {'Train Score': 0.74, 'Validate Score': 0.71},
 '9-Neighbors': {'Train Score': 0.75, 'Validate Score': 0.71},
 '10-Neighbors': {'Train Score': 0.74, 'Validate Score': 0.72}}

#### 5. Run through steps 1-3 setting k to 20

In [101]:
# Sweep k from 1 to 20, recording train and validate accuracy for each model.
# Note: this rebinds model_accuracies, replacing the results of any earlier sweep.
model_accuracies = {}

for i in range(1, 21):
    # fit() returns the estimator itself, so construction and fitting can be chained.
    nknn = KNeighborsClassifier(n_neighbors=i).fit(x_train, y_train)
    train_score = round(nknn.score(x_train, y_train), 2)
    validate_score = round(nknn.score(x_val, y_val), 2)
    model_accuracies[f"{i}-Neighbors"] = {
        'Train Score': train_score,
        'Validate Score': validate_score,
    }
model_accuracies

{'1-Neighbors': {'Train Score': 0.99, 'Validate Score': 0.73},
 '2-Neighbors': {'Train Score': 0.84, 'Validate Score': 0.71},
 '3-Neighbors': {'Train Score': 0.85, 'Validate Score': 0.74},
 '4-Neighbors': {'Train Score': 0.78, 'Validate Score': 0.72},
 '5-Neighbors': {'Train Score': 0.79, 'Validate Score': 0.71},
 '6-Neighbors': {'Train Score': 0.77, 'Validate Score': 0.72},
 '7-Neighbors': {'Train Score': 0.77, 'Validate Score': 0.71},
 '8-Neighbors': {'Train Score': 0.74, 'Validate Score': 0.71},
 '9-Neighbors': {'Train Score': 0.75, 'Validate Score': 0.71},
 '10-Neighbors': {'Train Score': 0.74, 'Validate Score': 0.72},
 '11-Neighbors': {'Train Score': 0.75, 'Validate Score': 0.74},
 '12-Neighbors': {'Train Score': 0.73, 'Validate Score': 0.71},
 '13-Neighbors': {'Train Score': 0.74, 'Validate Score': 0.72},
 '14-Neighbors': {'Train Score': 0.75, 'Validate Score': 0.72},
 '15-Neighbors': {'Train Score': 0.74, 'Validate Score': 0.71},
 '16-Neighbors': {'Train Score': 0.74, 'Validate 

#### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

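On the in-sample (train) data, 1 neighbor performs best (0.99 train score): with k = 1 each training point is its own nearest neighbor, so the model essentially memorizes the training set. That advantage does not carry over to validate (0.73), which points to overfitting. Overall, 3 neighbors seems to be the best: it ties for the highest validate score shown (0.74) while still scoring 0.85 on train.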

#### 7. Which model performs best on our out-of-sample data from validate?

In [None]:
# k=3 and k=11 tie for the best validate score (0.74) in the results shown; k=1 scores 0.73