KNN design and implementation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cdist

1

In [2]:
df = pd.read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
header=None,
)

2 and 3

In [3]:
X, y = df.iloc[:, 0:4].values, df.iloc[:, 4].values

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

4

In [10]:
class KNearestNeighbors:
    def __init__(self, k, distance_metric="euclidean"):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None
    
    def fit(self, X, y):
        """
        Store the 'prior knowledge' of you model that will be used
        to predict new labels.
        :param X : input data points, ndarray, shape = (R,C).
        :param y : input labels, ndarray, shape = (R,).
        """
        self.X_train = X
        self.y_train = y
        return self

    def predict(self, X):
        """Run the KNN classification on X.
        :param X: input data points, ndarray, shape = (N,C).
        :return: labels : ndarray, shape = (N,).
        """
        # 1. Compute the distances between X and X_train
        distances = self.__distance(X)

        # 2. Find the k nearest neighbors
        k_nearest_neighbors = self.__nearest_neighbors(distances)

        # 3. Find the majority class among the k nearest neighbors
        return self.__majority_class(k_nearest_neighbors)


    def __distance(self, X):
        return cdist(X, self.X_train, metric=self.distance_metric)
    
    def __nearest_neighbors(self, distances):
        return np.argsort(distances, axis=1)[:, :self.k]
    
    def __majority_class(self, k_nearest_neighbors):
        return np.apply_along_axis(self.__counting_votes, axis=1, arr=self.y_train[k_nearest_neighbors])
    
    def __counting_votes(self, votes):
        labels, counts = np.unique(votes, return_counts=True)
        return labels[np.argmax(counts)]
        

In [12]:
knn = KNearestNeighbors(k=3).fit(coords2, y)

AttributeError: 'KNearestNeighbors' object has no attribute '__counting_votes'

In [3]:
coords1 = [(35.0456, -85.2672),
          (35.1174, -89.9711),
          (35.9728, -83.9422),
          (36.1667, -86.7833),
          (36.1473, -86.7770),
          (36.0544, -86.6694),]

coords2 = [(35.0456, -85.2672),
          (35.1174, -89.9711),
          (35.9728, -83.9422),
          (29.9728, -83.9422),
          (35.0457, -84.9711),]
distances = cdist(coords1, coords2, 'euclidean')
distances

array([[0.        , 4.70444794, 1.6171966 , 5.24298816, 0.29610002],
       [4.70444794, 0.        , 6.0892811 , 7.92556272, 5.00051406],
       [1.6171966 , 6.0892811 , 0.        , 6.        , 1.38497279],
       [1.88558331, 3.35605413, 2.84770898, 6.81441461, 2.13089414],
       [1.86902085, 3.35603469, 2.84016572, 6.79415494, 2.11537169],
       [1.72738018, 3.43208273, 2.7284205 , 6.66509403, 1.97527177]])

In [4]:
y = np.array([0, 0, 1, 1, 1])

In [36]:
args = np.argsort(distances, axis=1)
args

array([[0, 4, 2, 1, 3],
       [1, 0, 4, 2, 3],
       [2, 4, 0, 3, 1],
       [0, 4, 2, 1, 3],
       [0, 4, 2, 1, 3],
       [0, 4, 2, 1, 3]], dtype=int64)

In [44]:
k_mas_cercanos = args[:, :3]
k_mas_cercanos

array([[0, 4, 2],
       [1, 0, 4],
       [2, 4, 0],
       [0, 4, 2],
       [0, 4, 2],
       [0, 4, 2]], dtype=int64)

In [75]:
y[k_mas_cercanos]

array([[0, 1, 1],
       [0, 0, 1],
       [1, 1, 0],
       [0, 1, 1],
       [0, 1, 1],
       [0, 1, 1]])

In [117]:
y = np.array([0, 0, 1, 1, 1])

In [126]:
def most_voted(votes):
    labels, counts = np.unique(votes, return_counts=True)
    return labels[np.argmax(counts)]
    
np.apply_along_axis(most_voted, axis=1, arr=y[k_mas_cercanos])

array([1, 0, 1, 1, 1, 1])

In [78]:
labels, counts = np.unique(y[k_mas_cercanos][0], return_counts=True)
labels, counts

(array([0, 1]), array([1, 2], dtype=int64))

In [79]:
most_voted = labels[counts.argmax()]

In [80]:
most_voted

1