# K nearest neighbors

- [wiki](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm)

The example below uses KNN for binary classification: predicting whether a delivery will be late.

In [63]:
import numpy as np

# One row per delivery. Columns, in order (they are named when the DataFrame
# is built further down): was_raining, was_bad_traffic, distance_miles,
# rural_address, late. The last column, "late", is the label to predict.
late_data = np.array([[0, 1, 5, 1, 0],
                [1, 0, 7, 0, 1],
                [0, 1, 2, 1, 0],
                [1, 1, 4.2, 1, 0],
                [0, 0, 7.8, 0, 1],
                [1, 0, 3.9, 1, 0],
                [0, 1, 4, 1, 0],
                [1, 1, 2, 0, 0],
                [0, 0, 3.5, 0, 1],
                [1, 0, 2.6, 1, 0],
                [0, 0, 4.1, 0, 1],
                [0, 1, 1.5, 0, 1],
                [1, 1, 1.75, 1, 0],
                [1, 0, 1.3, 0, 0],
                [1, 1, 2.1, 0, 0],
                [1, 1, 0.2, 1, 0],
                [1, 1, 5.2, 0, 1],
                [0, 1, 2, 1, 0],
                [1, 0, 5.5, 0, 1],
                [0, 0, 2, 1, 0],
                [1, 1, 1.7, 0, 0],
                [0, 1, 3, 1, 1],
                [1, 1, 1.9, 1, 0],
                [0, 1, 3.1, 0, 1],
                [0, 1, 2.3, 0, 0],
                [0, 0, 1.1, 1, 0],
                [1, 1, 2.5, 1, 1],
                [1, 1, 5, 0, 1],
                [1, 0, 7.5, 1, 1],
                [0, 0, 0.5, 1, 0],
                [0, 0, 1.5, 1, 0],
                [1, 0, 3.2, 1, 0],
                [0, 0, 2.15, 1, 0],
                [1, 1, 4.2, 0, 1],
                [1, 0, 6.5, 0, 1],
                [1, 0, 0.5, 0, 0],
                [0, 0, 3.5, 0, 1],
                [0, 0, 1.75, 0, 0],
                [1, 1, 5, 0, 1],
                [0, 0, 2, 1, 0],
                [0, 1, 1.3, 1, 1],
                [0, 1, 0.2, 0, 0],
                [1, 1, 2.2, 0, 0],
                [0, 1, 1.2, 1, 0],
                [1, 1, 4.2, 0, 1]])

# Peek at the second row (index 1) to sanity-check the array contents.
late_data[1]

array([1., 0., 7., 0., 1.])

In [64]:
# Confirm the array dimensions as (rows, columns).
late_data.shape

(45, 5)

In [65]:
import pandas as pd

# Wrap the raw array in a DataFrame so each column can be addressed by name.
column_names = ["was_raining", "was_bad_traffic", "distance_miles", "rural_address", "late"]
table = pd.DataFrame(data=late_data, columns=column_names)

# Show the first five rows.
table.head()

Unnamed: 0,was_raining,was_bad_traffic,distance_miles,rural_address,late
0,0.0,1.0,5.0,1.0,0.0
1,1.0,0.0,7.0,0.0,1.0
2,0.0,1.0,2.0,1.0,0.0
3,1.0,1.0,4.2,1.0,0.0
4,0.0,0.0,7.8,0.0,1.0


In [66]:
# The DataFrame should have the same (rows, columns) shape as the source array.
table.shape

(45, 5)

In [67]:
# Separate the four feature columns (model input) from the "late" label column.
feature_columns = ["was_raining", "was_bad_traffic", "distance_miles", "rural_address"]
input_data = table[feature_columns].values
target = table["late"].tolist()

In [68]:
# Show the container types handed to the classifier: features first, then labels.
for container in (input_data, target):
    print(type(container))

<class 'numpy.ndarray'>
<class 'list'>


In [69]:
from sklearn.neighbors import KNeighborsClassifier

# Classify each point from the labels of its 2 closest training records.
# NOTE(review): with an even k and only two classes the neighbors' vote can
# tie; an odd k avoids that. Confirm how scikit-learn resolves ties before
# relying on k=2.
knn_classifier = KNeighborsClassifier(n_neighbors = 2)

# Fit on the full dataset (all rows of input_data / target).
knn_classifier.fit(input_data, target)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [70]:
# One unseen delivery, in feature order:
# was_raining, was_bad_traffic, distance_miles, rural_address.
test_data = np.array([[0, 0, 2.1, 1]])

single_prediction = knn_classifier.predict(test_data)
print("prediction:", single_prediction)

prediction: [0.]


In [71]:
# Batch predict: one row per delivery, one predicted label per row.
test_data = np.array([
    [0, 0, 2.1, 1],
    [0, 1, 5, 0],
    [1, 1, 3.1, 1],
])

batch_predictions = knn_classifier.predict(test_data)
print(batch_predictions)

[0. 0. 1.]


The full dataset was used for training. Let's check the accuracy of the model's predictions on that same training data.

In [72]:
from sklearn.metrics import accuracy_score

# Predict on the very rows the model was trained on, then compare those
# predictions against the known labels.
preds = knn_classifier.predict(input_data)

accuracy_score(y_true=target, y_pred=preds)

0.8888888888888888

In [73]:
# Repeat the experiment with k=1. Each training row finds itself (distance 0)
# as its nearest neighbor, so accuracy on the training data comes out at 100%.

knn_classifier = KNeighborsClassifier(n_neighbors=1).fit(input_data, target)
preds = knn_classifier.predict(input_data)
accuracy_score(target, preds)

1.0

## Use a held-out test set to detect overfitting

In [74]:
# Split the rows in their original order (no shuffling):
# the first 38 rows (~85%) train the model, the remaining 7 (~15%) test it.
split_at = 38
training_data = table.iloc[:split_at]
test_data = table.iloc[split_at:]

# Show the first 5 rows of the training split.
training_data.head()

Unnamed: 0,was_raining,was_bad_traffic,distance_miles,rural_address,late
0,0.0,1.0,5.0,1.0,0.0
1,1.0,0.0,7.0,0.0,1.0
2,0.0,1.0,2.0,1.0,0.0
3,1.0,1.0,4.2,1.0,0.0
4,0.0,0.0,7.8,0.0,1.0


In [75]:
# Rebuild the features and labels from the training split only, so the test
# rows stay unseen by the model.
training_features = training_data[["was_raining", "was_bad_traffic", "distance_miles", "rural_address"]]
input_data = training_features.values
target = training_data["late"].tolist()

In [76]:
# Refit the 1-nearest-neighbor model on the training split, then score it on
# those same training rows.
knn_classifier = KNeighborsClassifier(n_neighbors=1)
knn_classifier.fit(X=input_data, y=target)
preds = knn_classifier.predict(X=input_data)
accuracy_score(y_true=target, y_pred=preds)

1.0

In [78]:
# Evaluate on the held-out rows: extract their features and labels, predict,
# and measure how often the predictions match the true labels.
held_out_columns = ["was_raining", "was_bad_traffic", "distance_miles", "rural_address"]
test_input_data = test_data[held_out_columns].values
test_target = test_data["late"].tolist()
test_preds = knn_classifier.predict(test_input_data)
accuracy_score(test_target, test_preds)

0.7142857142857143

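The model reaches 71% accuracy on the 7 rows of test data, compared with 100% on the training data. Can we trust this model? Probably not: the gap between training and test accuracy suggests overfitting.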

For KNN, when the K parameter is 1, the model looks only at the single closest record and assigns that record's class. This doesn't generalize well and "overfits" the dataset.

We can search for the best-performing value of K using a validation subset: pick the K that gives the best value of our chosen performance metric on the validation data, then check performance on the test set as a final double check.