# K-Nearest Neighbor

In [6]:
#Tabular data imports:
import numpy as np
import pandas as pd

# Imports we need for assessing the performance of our model:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler

# User-Defined Functions
import acquire
import prepare
import model

## Acquire & Prepare Data

In [7]:
# Acquire data
titanic = prepare.clean_titanic(acquire.get_titanic_data())
titanic.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,0,3,male,1,0,7.25,Southampton,0
1,1,1,female,1,0,71.2833,Cherbourg,0
2,1,3,female,0,0,7.925,Southampton,1
3,1,1,female,1,0,53.1,Southampton,0
4,0,3,male,0,0,8.05,Southampton,1


In [8]:
titanic = model.preprocess_titanic(titanic)
titanic.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,0,1
3,1,1,1,0,53.1,0,0,0,1
4,0,3,0,0,8.05,1,1,0,1


In [9]:
# Train, validate, split data
train, validate, test = prepare.split_data(titanic, 'survived')

train: 534 (60.0% of 891)
validate: 178 (20.0% of 891)
test: 179 (20.0% of 891)


In [10]:
train.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
776,0,3,0,0,7.75,1,1,1,0
829,1,1,0,0,80.0,1,0,0,1
215,1,1,1,0,113.275,0,0,0,0
258,1,1,0,0,512.3292,1,0,0,0
129,0,3,0,0,6.975,1,1,0,1


## Isolate the target

In [12]:
# create two variables from train, one with only features and one for target. 
X_train = train.drop(columns=['survived'])
y_train = train.survived

# create two variables from validate, one with only features and one for target. 
X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

# create two variables from test, one with only features and one for target. 
X_test = test.drop(columns=['survived'])
y_test = test.survived

### 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [13]:
#Create KNN Object
knn = KNeighborsClassifier()
knn

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [14]:
#Fit the Model to the Training Data
knn.fit(X_train, y_train)

In [15]:
#Make Predictions
y_preds = knn.predict(X_train)
y_preds

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,

### 2. Evaluate your results using the model score, confusion matrix, and classification report.



In [16]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.83


In [17]:
#confusion matrix
cm = confusion_matrix(y_train, y_preds)
cm

array([[286,  43],
       [ 50, 155]])

In [18]:
#classification report
pd.DataFrame(classification_report(y_train, y_preds, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.85119,0.782828,0.825843,0.817009,0.824947
recall,0.869301,0.756098,0.825843,0.812699,0.825843
f1-score,0.86015,0.769231,0.825843,0.814691,0.825247
support,329.0,205.0,0.825843,534.0,534.0


In [19]:
# OR

In [20]:
print(classification_report(y_train, y_preds))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86       329
           1       0.78      0.76      0.77       205

    accuracy                           0.83       534
   macro avg       0.82      0.81      0.81       534
weighted avg       0.82      0.83      0.83       534



### 3.Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [22]:
# function to print classification metrics
def print_classification_metrics(actuals, predictions):
    '''
    Takes in acutal results and predicted results
    Retruns: classification metrics.
    '''
    TN, FP, FN, TP = confusion_matrix(actuals, predictions).ravel()
    ALL = TP + TN + FP + FN
    
    accuracy = (TP + TN)/ALL
    print(f"Accuracy: {accuracy}")

    true_positive_rate = TP/(TP+FN)
    print(f"True Positive Rate: {true_positive_rate}")

    false_positive_rate = FP/(FP+TN)
    print(f"False Positive Rate: {false_positive_rate}")

    true_negative_rate = TN/(TN+FP)
    print(f"True Negative Rate: {true_negative_rate}")

    false_negative_rate = FN/(FN+TP)
    print(f"False Negative Rate: {false_negative_rate}")

    precision = TP/(TP+FP)
    print(f"Precision: {precision}")

    recall = TP/(TP+FN)
    print(f"Recall: {recall}")

    f1_score = 2*(precision*recall)/(precision+recall)
    print(f"F1 Score: {f1_score}")

    support_pos = TP + FN
    print(f"Support (0): {support_pos}")

    support_neg = FP + TN
    print(f"Support (1): {support_neg}")

In [23]:
# use function
print_classification_metrics(y_train, y_preds)

Accuracy: 0.8258426966292135
True Positive Rate: 0.7560975609756098
False Positive Rate: 0.13069908814589665
True Negative Rate: 0.8693009118541033
False Negative Rate: 0.24390243902439024
Precision: 0.7828282828282829
Recall: 0.7560975609756098
F1 Score: 0.7692307692307692
Support (0): 205
Support (1): 329


### 4.Run through steps 1-3 setting k to 10



In [24]:
knn = KNeighborsClassifier(n_neighbors=10)

In [25]:
knn.fit(X_train, y_train)

In [28]:
y_preds = knn.predict(X_train)
y_preds

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

In [29]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.79


In [30]:
confusion_matrix(y_train, y_preds)

array([[287,  42],
       [ 71, 134]])

In [31]:
print(classification_report(y_train, y_preds))

              precision    recall  f1-score   support

           0       0.80      0.87      0.84       329
           1       0.76      0.65      0.70       205

    accuracy                           0.79       534
   macro avg       0.78      0.76      0.77       534
weighted avg       0.79      0.79      0.78       534



In [33]:
print_classification_metrics(y_train, y_preds)

Accuracy: 0.7883895131086143
True Positive Rate: 0.6536585365853659
False Positive Rate: 0.1276595744680851
True Negative Rate: 0.8723404255319149
False Negative Rate: 0.3463414634146341
Precision: 0.7613636363636364
Recall: 0.6536585365853659
F1 Score: 0.7034120734908137
Support (0): 205
Support (1): 329


### 5.Run through steps 1-3 setting k to 20




In [34]:
knn = KNeighborsClassifier(n_neighbors=20)

In [35]:
knn.fit(X_train, y_train)

In [36]:
y_preds = knn.predict(X_train)
y_preds


array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,

In [37]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.73


In [38]:
confusion_matrix(y_train, y_preds)

array([[283,  46],
       [ 96, 109]])

In [39]:
print(classification_report(y_train, y_preds))

              precision    recall  f1-score   support

           0       0.75      0.86      0.80       329
           1       0.70      0.53      0.61       205

    accuracy                           0.73       534
   macro avg       0.72      0.70      0.70       534
weighted avg       0.73      0.73      0.73       534



In [41]:
print_classification_metrics(y_train, y_preds)

Accuracy: 0.7340823970037453
True Positive Rate: 0.5317073170731708
False Positive Rate: 0.1398176291793313
True Negative Rate: 0.8601823708206687
False Negative Rate: 0.4682926829268293
Precision: 0.7032258064516129
Recall: 0.5317073170731708
F1 Score: 0.6055555555555556
Support (0): 205
Support (1): 329


### 6.What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



In [42]:
pred_dict = {
    'model': [],
    'accuracy': [],
    'true_positive_rate': [],
    'false_positive_rate': [],
    'true_negative_rate': [],
    'false_negative_rate': [],
    'precision': [],
    'recall': [],
    'f1_score': [],
    'support_0': [],
    'support_1': []
}

# show evaluation metrics for three models with three different hyperparameters
for n in [5, 10, 20]:
    knn = KNeighborsClassifier(n_neighbors=n)
    
    knn.fit(X_train, y_train)
    
    y_preds = knn.predict(X_train)
    
    TN, FP, FN, TP = confusion_matrix(y_train, y_preds).ravel()
    ALL = TP + TN + FP + FN
    
    accuracy = (TP + TN)/ALL
    true_positive_rate = TP/(TP+FN)
    false_positive_rate = FP/(FP+TN)
    true_negative_rate = TN/(TN+FP)
    false_negative_rate = FN/(FN+TP)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1_score = 2*(precision*recall)/(precision+recall)
    support_pos = TP + FN
    support_neg = FP + TN
    
    pred_dict['model'].append(f'knn_n_{n}')
    pred_dict['accuracy'].append(accuracy)
    pred_dict['true_positive_rate'].append(true_positive_rate)
    pred_dict['false_positive_rate'].append(false_positive_rate)
    pred_dict['true_negative_rate'].append(true_negative_rate)
    pred_dict['false_negative_rate'].append(false_negative_rate)
    pred_dict['precision'].append(precision)
    pred_dict['recall'].append(recall)
    pred_dict['f1_score'].append(f1_score)
    pred_dict['support_0'].append(support_pos)
    pred_dict['support_1'].append(support_neg)


In [43]:
pd.DataFrame(pred_dict)


Unnamed: 0,model,accuracy,true_positive_rate,false_positive_rate,true_negative_rate,false_negative_rate,precision,recall,f1_score,support_0,support_1
0,knn_n_5,0.825843,0.756098,0.130699,0.869301,0.243902,0.782828,0.756098,0.769231,205,329
1,knn_n_10,0.78839,0.653659,0.12766,0.87234,0.346341,0.761364,0.653659,0.703412,205,329
2,knn_n_20,0.734082,0.531707,0.139818,0.860182,0.468293,0.703226,0.531707,0.605556,205,329


In [44]:
pd.DataFrame(pred_dict).T

Unnamed: 0,0,1,2
model,knn_n_5,knn_n_10,knn_n_20
accuracy,0.825843,0.78839,0.734082
true_positive_rate,0.756098,0.653659,0.531707
false_positive_rate,0.130699,0.12766,0.139818
true_negative_rate,0.869301,0.87234,0.860182
false_negative_rate,0.243902,0.346341,0.468293
precision,0.782828,0.761364,0.703226
recall,0.756098,0.653659,0.531707
f1_score,0.769231,0.703412,0.605556
support_0,205,205,205


In [45]:
# Compare models
train_pred_df = pd.DataFrame(pred_dict).T
train_pred_df.columns = train_pred_df.iloc[0]
train_pred_df = train_pred_df.drop(train_pred_df.index[0])
train_pred_df

model,knn_n_5,knn_n_10,knn_n_20
accuracy,0.825843,0.78839,0.734082
true_positive_rate,0.756098,0.653659,0.531707
false_positive_rate,0.130699,0.12766,0.139818
true_negative_rate,0.869301,0.87234,0.860182
false_negative_rate,0.243902,0.346341,0.468293
precision,0.782828,0.761364,0.703226
recall,0.756098,0.653659,0.531707
f1_score,0.769231,0.703412,0.605556
support_0,205.0,205.0,205.0
support_1,329.0,329.0,329.0


> model knn_n_5 performs better becuase smaller numbers for n_neighbors are able to capture details within the training set, but this carries a risk of overfitting.

### 7.Which model performs best on our out-of-sample data from validate?


In [48]:
pred_dict = {
    'model': [],
    'accuracy': [],
    'true_positive_rate': [],
    'false_positive_rate': [],
    'true_negative_rate': [],
    'false_negative_rate': [],
    'precision': [],
    'recall': [],
    'f1_score': [],
    'support_0': [],
    'support_1': []
}

# for loop to set up model comparision
for n in [5, 10, 20]:
    knn = KNeighborsClassifier(n_neighbors=n)
    
    knn.fit(X_train, y_train)
    
    y_preds = knn.predict(X_validate)
    
    TN, FP, FN, TP = confusion_matrix(y_validate, y_preds).ravel()
    ALL = TP + TN + FP + FN
    
    accuracy = (TP + TN)/ALL
    true_positive_rate = TP/(TP+FN)
    false_positive_rate = FP/(FP+TN)
    true_negative_rate = TN/(TN+FP)
    false_negative_rate = FN/(FN+TP)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1_score = 2*(precision*recall)/(precision+recall)
    support_pos = TP + FN
    support_neg = FP + TN
    
    pred_dict['model'].append(f'knn_n_{n}')
    pred_dict['accuracy'].append(accuracy)
    pred_dict['true_positive_rate'].append(true_positive_rate)
    pred_dict['false_positive_rate'].append(false_positive_rate)
    pred_dict['true_negative_rate'].append(true_negative_rate)
    pred_dict['false_negative_rate'].append(false_negative_rate)
    pred_dict['precision'].append(precision)
    pred_dict['recall'].append(recall)
    pred_dict['f1_score'].append(f1_score)
    pred_dict['support_0'].append(support_pos)
    pred_dict['support_1'].append(support_neg)

In [49]:
# compare models
val_pred_df = pd.DataFrame(pred_dict).T
val_pred_df.columns = val_pred_df.iloc[0]
val_pred_df = val_pred_df.drop(val_pred_df.index[0])
val_pred_df


model,knn_n_5,knn_n_10,knn_n_20
accuracy,0.707865,0.668539,0.662921
true_positive_rate,0.676471,0.632353,0.544118
false_positive_rate,0.272727,0.309091,0.263636
true_negative_rate,0.727273,0.690909,0.736364
false_negative_rate,0.323529,0.367647,0.455882
precision,0.605263,0.558442,0.560606
recall,0.676471,0.632353,0.544118
f1_score,0.638889,0.593103,0.552239
support_0,68.0,68.0,68.0
support_1,110.0,110.0,110.0


>  knn_n_5 performs best on accuracy