# Introduction

A basic implementation of the k-nearest-neighbors algorithm.

# Setup

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [16]:
# Read the Iris CSV into a DataFrame and preview its first five rows

df = pd.read_csv(filepath_or_buffer="../../data/Iris.csv")
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [17]:
# Drop Id

# errors="ignore" keeps this cell re-runnable: on a second execution the "Id"
# column is already gone and drop() would otherwise raise KeyError.
df = df.drop(columns=["Id"], errors="ignore")

In [153]:
# Separate features and labels

features = df.drop(columns=["Species"]).values
# np.unique returns the sorted class names together with, for every row, the
# index of that row's class in the sorted array. Deriving both from one call
# guarantees labels[labels_encoded[i]] is the species name of row i, whatever
# order the classes appear in the file.
labels, labels_encoded = np.unique(df["Species"].values, return_inverse=True)

features[:5], labels_encoded[:5], labels

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2]]),
 array([0, 0, 0, 0, 0], dtype=int64),
 array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object))

In [154]:
# Train and test splits

# A fixed random_state makes the split (and every accuracy computed from it)
# reproducible across runs; stratify keeps the class proportions the same in
# the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_encoded,
    test_size=0.2,
    random_state=42,
    stratify=labels_encoded,
)

In [155]:
# Report how many samples ended up in each split
print(
    f'Training dataset size: {len(X_train)}',
    f'Test dataset size: {len(X_test)}',
    sep="\n",
)

Training dataset size: 120
Test dataset size: 30


# Model

In [72]:
def euclidean_distance(v1, v2):
    """Row-wise Euclidean distance between two sets of points.

    Parameters
    ----------
    v1, v2 : array-like
        Either a batch of points with shape (n_points, n_features) or a single
        point with shape (n_features,). The two inputs must be broadcastable
        against each other, so a single point can be compared with a batch.

    Returns
    -------
    np.ndarray
        One distance per row; shape (1,) when both inputs are single points.
    """
    # Promote single points to one-row batches so the reduction over axis 1
    # (the feature axis) is valid for them as well.
    v1 = np.atleast_2d(v1)
    v2 = np.atleast_2d(v2)
    return np.sqrt(np.sum((v1 - v2) ** 2, axis=1))

In [None]:
def knn(k, X_train, y_train, target):
    """Classify one sample by majority vote among its k nearest training points.

    Parameters
    ----------
    k : int
        Number of neighbours that vote; must be at least 1. A value larger
        than the number of training samples uses every training sample.
    X_train : np.ndarray, shape (n_samples, n_features)
        Training feature matrix.
    y_train : np.ndarray, shape (n_samples,)
        Non-negative integer class label of each training row.
    target : np.ndarray
        The sample to classify; must hold exactly n_features values.

    Returns
    -------
    The most frequent label among the k nearest neighbours. When several
    labels are tied, the smallest label wins (np.argmax returns the first
    maximum).

    Raises
    ------
    ValueError
        If k is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    n_features = X_train.shape[1]
    # A (1, n_features) row broadcasts against every training row, so there is
    # no need to materialise n_samples copies of the target.
    target = target.reshape((1, n_features))
    dist = euclidean_distance(X_train, target)

    # Stable sort: equidistant neighbours are taken in training-set order
    # instead of an order that depends on the sort implementation.
    nearest = np.argsort(dist, kind="stable")[:k]

    # Count votes per class. No minlength is needed: padding the counts with
    # trailing zeros can never change which label has the most votes.
    votes = np.bincount(y_train[nearest])
    return np.argmax(votes)

In [257]:
# Score the classifier on the test split for a grid of k values. The grid
# starts at 1 and advances by the number of feature columns.
for k in range(1, 100, features.shape[1]):
    # Predict a class for every test sample with the current k
    y_preds = [knn(k, X_train, y_train, sample) for sample in X_test]

    # Fraction of test samples whose prediction equals the true class
    accuracy = np.mean(np.asarray(y_preds) == y_test)

    print(f"k: {k} | Accuracy: {accuracy*100:.2f}%")

k: 1 | Accuracy: 36.67%
k: 5 | Accuracy: 36.67%
k: 9 | Accuracy: 36.67%
k: 13 | Accuracy: 36.67%
k: 17 | Accuracy: 36.67%
k: 21 | Accuracy: 36.67%
k: 25 | Accuracy: 36.67%
k: 29 | Accuracy: 36.67%
k: 33 | Accuracy: 36.67%
k: 37 | Accuracy: 36.67%
k: 41 | Accuracy: 36.67%
k: 45 | Accuracy: 36.67%
k: 49 | Accuracy: 36.67%
k: 53 | Accuracy: 36.67%
k: 57 | Accuracy: 36.67%
k: 61 | Accuracy: 36.67%
k: 65 | Accuracy: 36.67%
k: 69 | Accuracy: 36.67%
k: 73 | Accuracy: 36.67%
k: 77 | Accuracy: 36.67%
k: 81 | Accuracy: 36.67%
k: 85 | Accuracy: 36.67%
k: 89 | Accuracy: 36.67%
k: 93 | Accuracy: 36.67%
k: 97 | Accuracy: 36.67%


# Conclusion

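In this run the k-nearest-neighbors classifier reached only 36.67% accuracy on the Iris test set, and every tested value of k produced exactly the same score. One possible explanation is the structure of the dataset: Iris-setosa can be easily distinguished from the other two species, so kNN would guess correctly only about 1/3 of the time. However, 36.67% is 11 of the 30 test samples, roughly what a classifier that always predicts the same class would score with three classes, and an accuracy that does not change at all between k = 1 and k = 97 is not how kNN normally behaves. Note also that the `knn` cell above has no execution count (`In [None]`) while the evaluation cell is `In [257]`, so these numbers were probably produced by an older definition of `knn` still held in the kernel. Treat the result as suspect and re-run the notebook from a fresh kernel (Restart & Run All) before drawing conclusions about kNN on Iris.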

Metrics:
- Highest accuracy achieved on test dataset: 36.67%