In [59]:
%matplotlib widget

from sklearn.datasets import fetch_covtype
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

In [60]:
# load the full dataset and display it
dataset = fetch_covtype(shuffle=True) # make sure to shuffle the data so no local patterns emerge
feature_names = dataset.feature_names
target_names = dataset.target_names
data = dataset.data
target = dataset.target

Now, we showcase k nearest neighbor algorithm. The idea is the same as simple classifier. Instead of choosing the category based on the closest points, we choose the most common category based on the k nearest points.

In [61]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.33, random_state=1)
print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)

Train: (389278, 54) (389278,)
Test: (191734, 54) (191734,)


In [62]:
def predict_point(x, y, point, k):
    point = point.reshape((1,-1))
    x = x - point # mxn
    x2 = x**2 # squared euclidean distances
    dist_squared = np.sum(x2,axis=1) # add up all the squared euclidean distances

    k_indices = np.argsort(dist_squared)[:k] # get only the indices of the k minimum distances
    categories = np.take(y, k_indices) # get the categories associated with the k indices
    category = np.argmax(np.bincount(categories)) # pick the most frequent category within the k categories

    return category # return the class associated with the k closest points

In [63]:
def predict_points(x, y, points, k):
    # same thing as predict_point but we handle multiple points now along with k points
    mp = points.shape[0]
    categories = np.zeros((mp))
    for i in range(mp):
        categories[i] = predict_point(x,y,points[i],k) # use predict_point we defined
    return categories

In [64]:
samples = 100
predictions = predict_points(X_train, y_train, X_test[:samples], 10) # try only 100 points from the testing set with k=10
accuracy = accuracy_score(predictions, y_test[:samples]) # compute the accuracy
print("Accuracy:",accuracy*100,"%") # show the accuracy

Accuracy: 95.0 %


Interesting, accuracy is also nearly the same as simple classifer. Let's use higher k values and see if accuracy improves.

In [65]:
samples = 100
predictions = predict_points(X_train, y_train, X_test[:samples], 100) # try only 100 points from the testing set with k=100 points now
accuracy = accuracy_score(predictions, y_test[:samples]) # compute the accuracy
print("Accuracy:",accuracy*100,"%") # show the accuracy

Accuracy: 81.0 %


Interesting, it became much worse with k=100. Let's try lowering it and see what happens.

In [68]:
kvalues = np.round(np.linspace(1,50,5)).astype(np.int32)
print("K values to test:",kvalues)

K values to test: [ 1 13 26 38 50]


In [69]:
for i in range(kvalues.shape[0]):
    samples = 100
    predictions = predict_points(X_train, y_train, X_test[:samples], kvalues[i]) # try only 100 points from the testing set with k=100 points now
    accuracy = accuracy_score(predictions, y_test[:samples]) # compute the accuracy
    print(f"Accuracy for k={kvalues[i]} - {accuracy*100}%") # show the accuracy

Accuracy for k=1 - 95.0%
Accuracy for k=13 - 94.0%
Accuracy for k=26 - 88.0%
Accuracy for k=38 - 82.0%
Accuracy for k=50 - 84.0%


This is really surprising. When k increases, the accuracy goes down. This means that this dataset is already very aligned and easily distinguishable to the fact we don't even need the k nearest neighbor. In many other datasets, it's better to go with k nearest neighbor in case there are many other outliers. 