In [1]:
#all the necessary imports
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
#implement Euclidean distance to calculate similarity
def euclid_m(x_train, x_test):
    '''
    Pairwise Euclidean distances between test and train samples.

    x_train - train data, array-like with one sample per row
    x_test - test data, array-like with one sample per row
    Returns: np.array of shape (n_test, n_train); entry [i, j] is the distance
             between x_test[i] and x_train[j]
    '''
    # Work on the arguments that were passed in, never on notebook-level
    # variables, so the result depends only on the caller's data.
    train = np.asarray(x_train)
    test = np.asarray(x_test)

    dists = np.zeros((test.shape[0], train.shape[0]))
    for i, test_row in enumerate(test):
        for j, train_row in enumerate(train):
            dists[i, j] = np.linalg.norm(train_row - test_row)

    return dists

In [3]:
#find labels of k nearest data points
def k_labels(dists, y_train, k):
    '''
    Look up the labels of the k nearest train samples for every test sample.

    dists - array of distances between each pair of train and test data,
            shape (n_test, n_train)
    y_train - labels of train data, array-like of length n_train
    k - number of nearest neighbours, must satisfy 1 <= k <= n_train
    Returns: np.array of shape (n_test, k) with the labels of the k nearest
             neighbours for each sample (not ordered by distance within a row)
    Raises: ValueError if k is outside the range 1..n_train
    '''
    dists = np.asarray(dists)
    # A plain Python list cannot be indexed with an array of positions.
    labels = np.asarray(y_train)

    n_test, n_train = dists.shape[0], dists.shape[1]
    if not 1 <= k <= n_train:
        raise ValueError(
            f'k must be between 1 and the number of train samples ({n_train}), got {k}'
        )

    # Pre-allocating keeps the (n_test, k) shape even when there are no test rows.
    n_nearest = np.empty((n_test, k), dtype=labels.dtype)
    for i, row in enumerate(dists):
        # argpartition puts the k smallest distances first without a full sort.
        n_nearest[i] = labels[np.argpartition(row, k - 1)[:k]]

    return n_nearest

In [4]:
#K-nearest Neighbours in SKLearn style
class KNN():
    '''
    Minimal k-nearest-neighbours classifier with an sklearn-like fit/predict API.

    k - number of nearest neighbours that vote on each prediction
    '''

    def __init__(self, k):
        self.k = k
        # Filled in by fit() / predict(); None until those are called.
        self.x_test = None
        self.x_train = None
        self.y_train = None

    def fit(self, x_train, y_train):
        '''
        Store the training data; all computation is deferred to predict().

        x_train - train data, array-like with one sample per row
        y_train - labels of train data
        Returns: self, so that fit and predict calls can be chained
        '''
        self.x_train = np.asarray(x_train)
        self.y_train = np.asarray(y_train)
        return self

    def predict(self, x_test):
        '''
        Predict a label for every test sample by majority vote of its k
        nearest train samples.

        x_test - test data, array-like with one sample per row
        Returns: np.array with one predicted label per test sample
        Raises: ValueError if fit() has not been called yet
        '''
        if self.x_train is None or self.y_train is None:
            raise ValueError('KNN instance is not fitted yet; call fit() before predict()')

        self.x_test = np.asarray(x_test)
        dsts = euclid_m(self.x_train, self.x_test)
        votes = k_labels(dsts, self.y_train, self.k)
        prediction = []

        for row in votes:
            # np.unique returns sorted labels and argmax takes the first
            # maximum, so a tied vote resolves to the smallest label.
            unique, counts = np.unique(row, return_counts=True)
            prediction.append(unique[np.argmax(counts)])

        return np.array(prediction)

In [5]:
# Load the iris data set and hold out 33% of it to test K-NN performance;
# the fixed random_state keeps the split reproducible.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.33,
    random_state=17,
)

In [6]:
# Report how many observations and features ended up in each split.
print(
    f'Train data set has {X_train.shape[0]} observations and {X_train.shape[1]} features',
    f'Test data set has {X_test.shape[0]} observations and {X_test.shape[1]} features',
    sep='\n',
)

Train data set has 100 observations and 4 features
Test data set has 50 observations and 4 features


In [7]:
# Initialize the custom model with k=4 neighbours and fit it. For this KNN
# class fit() just stores the training data on the instance; the distance
# computation and voting happen in predict().
knn1 = KNN(k=4)
knn1.fit(X_train, y_train)

In [8]:
# Predict labels for the held-out test set with the custom KNN and print
# the accuracy score against the true test labels.
y_pred = knn1.predict(X_test)
print(f'Accuracy score: {accuracy_score(y_test, y_pred)}')

Accuracy score: 0.96


In [9]:
# Compare the results with sklearn's implementation, using the same number of
# neighbours as knn1. Minkowski distance with p=2 is the Euclidean distance,
# i.e. the same metric euclid_m computes.
# NOTE: y_pred is reassigned here, so the custom KNN predictions from the
# previous cell are no longer available once this cell has run.
knn2 = KNeighborsClassifier(n_neighbors=4, metric='minkowski', p=2, n_jobs=4)
knn2.fit(X_train, y_train)
y_pred = knn2.predict(X_test)
print(f'Accuracy score: {accuracy_score(y_test, y_pred)}')

Accuracy score: 0.96
