# Introduction
In this work, I decided to write my own implementation of the K-Nearest Neighbors algorithm and compare its performance with the algorithm implemented in the Scikit-learn library.
## Importing the required libraries

In [1]:
import numpy as np
import pandas as pd
from typing import List

## Data loading and normalization
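Min-max normalization rescales every feature to the range [0, 1]; without it, features with large numeric ranges would dominate the Euclidean distance used by the classifier. The dataframe is also shuffled (with a fixed random seed) before it is split into training and test sets, so that both sets are random samples of the data. Note that this is a plain random shuffle, not true stratification: the class proportions in the two sets are only approximately equal.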

In [2]:
# Load the raw data. 'Unnamed: 0' is not a feature (presumably the row index that
# was saved into the CSV), so it is dropped before any scaling takes place.
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-diagnostic-dataset/brca.csv')
df = df.drop('Unnamed: 0', axis = 1)

# Min-max scale every feature column (all columns except the last one, which
# holds the target) to [0, 1]. Assigning by column label, after the non-feature
# column is gone, avoids writing floats into a (presumably integer) column
# through .iloc, which pandas >= 2.1 flags as an incompatible-dtype assignment.
feature_cols = df.columns[:-1]
features = df[feature_cols].astype(float)
feature_min = features.min()
# A constant column has zero range; dividing by 1 maps it to 0.0 instead of NaN.
feature_range = (features.max() - feature_min).replace(0, 1)
df[feature_cols] = (features - feature_min) / feature_range
# NOTE(review): min/max are computed over all rows, including the rows that end
# up in the test split below.

# Shuffle with a fixed seed so that the train/test split is reproducible.
df = df.sample(frac = 1, random_state = 1).reset_index(drop = True)

# The first TRAIN_SIZE shuffled rows are used for training, the rest for testing.
TRAIN_SIZE = 300
df_train = df.iloc[:TRAIN_SIZE, :].copy()
df_test = df.iloc[TRAIN_SIZE:, :].copy()
df.head()

Unnamed: 0,x.radius_mean,x.texture_mean,x.perimeter_mean,x.area_mean,x.smoothness_mean,x.compactness_mean,x.concavity_mean,x.concave_pts_mean,x.symmetry_mean,x.fractal_dim_mean,...,x.texture_worst,x.perimeter_worst,x.area_worst,x.smoothness_worst,x.compactness_worst,x.concavity_worst,x.concave_pts_worst,x.symmetry_worst,x.fractal_dim_worst,y
0,0.352075,0.34021,0.350287,0.211665,0.405254,0.290534,0.219963,0.290209,0.413636,0.293597,...,0.502132,0.294288,0.157589,0.475005,0.267107,0.255112,0.537801,0.227282,0.25246,M
1,0.234228,0.399729,0.226246,0.125175,0.406699,0.181308,0.077976,0.099801,0.317172,0.252317,...,0.376599,0.18492,0.097768,0.520571,0.206275,0.120048,0.249038,0.172088,0.1992,B
2,0.307113,0.147109,0.300809,0.170859,0.49174,0.294829,0.135567,0.26173,0.363131,0.34604,...,0.16791,0.221774,0.116742,0.449911,0.215977,0.147604,0.458763,0.327617,0.250689,B
3,0.223816,0.194116,0.21588,0.117413,0.563059,0.163886,0.093861,0.161531,0.479293,0.318029,...,0.163646,0.168086,0.0814,0.494156,0.081701,0.086821,0.270241,0.236546,0.15099,B
4,0.573572,0.560703,0.589524,0.4193,0.621739,0.489909,0.453843,0.730119,0.289899,0.46925,...,0.551706,0.452662,0.273496,0.44925,0.24683,0.194249,0.632646,0.147053,0.242621,M


# Algorithm implementation
The target variable is assumed to be the last column of the data frame.

In [3]:
def distance(l1: List[float], l2: List[float]) -> float:
    """Return the Euclidean distance between two points.

    Args:
        l1: Coordinates of the first point.
        l2: Coordinates of the second point; must be as long as ``l1``.

    Returns:
        The square root of the sum of the squared coordinate differences.

    Raises:
        AssertionError: If the two points have a different number of coordinates.
    """
    assert len(l1) == len(l2)
    squared_diffs = ((a - b) ** 2 for a, b in zip(l1, l2))
    return sum(squared_diffs) ** 0.5

def most_frequent(l: List[str]) -> str:
    """Return the value that occurs most often in ``l``.

    When several values share the highest count, the one that appears first
    in ``l`` is returned (the sort below is stable).

    Args:
        l: Values to count; must not be empty.

    Returns:
        The most frequent value.

    Raises:
        IndexError: If ``l`` is empty.
    """
    tally = {}
    for label in l:
        tally[label] = tally.get(label, 0) + 1
    # Keys are ranked by descending count; equal counts keep insertion order.
    ranked = sorted(tally, key = tally.get, reverse = True)
    return ranked[0]

def classification(data: List, df: pd.DataFrame, k: int) -> str:
    """Classify one point by majority vote among its k nearest training rows.

    Args:
        data: Feature values of the point to classify, in the same order as
            the feature columns of ``df``.
        df: Training data. Every column except the last is treated as a
            feature; the last column holds the target variable.
        k: Number of nearest neighbours that take part in the vote. Must be
            at least 1; if it exceeds the number of rows, all rows vote.

    Returns:
        The most frequent target value among the k nearest training rows.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``df`` has no rows.
    """
    # A negative k would silently slice off the *farthest* rows instead of
    # selecting the nearest ones, and k == 0 would leave nothing to vote on.
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}')
    if df.shape[0] == 0:
        raise ValueError('the training data frame has no rows')

    # Extract features and labels once instead of doing a row-wise .iloc
    # lookup for every training row and every neighbour.
    train_rows = df.iloc[:, :-1].to_numpy().tolist()
    train_labels = df.iloc[:, -1].tolist()

    #Calculation of distances to each point of the training sample
    dist = [(i, distance(data, row)) for i, row in enumerate(train_rows)]

    #Search for values of the target variable
    # list.sort is stable, so equally distant rows keep their original order.
    dist.sort(key = lambda item: item[1])
    values = [train_labels[i] for i, _ in dist[:k]]

    return most_frequent(values)

# Performance comparison
## My algorithm implementation

In [4]:
# Classify every test row with the hand-written KNN (k = 3), using df_train as
# the training sample.
my_pred = [
    classification(df_test.iloc[row_pos, :-1], df_train, 3)
    for row_pos in range(len(df_test))
]
# Pair each true label (last column of df_test) with its prediction.
l = list(zip(df_test.iloc[:, -1], my_pred))

## Scikit-learn's KNeighborsClassifier

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Fit scikit-learn's classifier with k = 3 on the training features (all columns
# but the last) and the training labels (last column).
neigh = KNeighborsClassifier(n_neighbors = 3)
neigh.fit(df_train.iloc[:, :-1], df_train.iloc[:, -1])

# Predict the test rows and pair each true label with its prediction.
sk_pred = neigh.predict(df_test.iloc[:, :-1])
l1 = list(zip(df_test.iloc[:, -1], sk_pred))

## Results

In [6]:
# Accuracy = share of (true label, prediction) pairs whose two entries match.
print(
    "My algorithm's accuracy:",
    sum(test == pred for test, pred in l) / len(l),
)
print(
    "Scikit-learn's accuracy:",
    sum(test == pred for test, pred in l1) / len(l1),
)

My algorithm's accuracy: 0.9516728624535316
Scikit-learn's accuracy: 0.9516728624535316


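For this K (K = 3) the two implementations reach exactly the same accuracy. For other values of K, my algorithm sometimes even outperforms KNeighborsClassifier! Since both implementations perform an exact nearest-neighbour search with the same distance, such differences presumably come from how ties (equal distances or equal vote counts) are broken, not from one implementation being inherently more accurate.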