# K-nearest neighbors

In [1]:
import numpy as np
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

## Create the model

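- **Euclidean distance** between two $n$-dimensional feature vectors $p$ and $q$, used to find the $k$ training samples closest to a query point:
$d(p, q) = \sqrt{\sum_{i=1}^{n}(p_i - q_i)^2}$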

In [2]:
class KNN:
    """K-nearest-neighbours classifier based on Euclidean distance.

    ``fit`` only stores the training data. ``predict`` labels each query
    sample with the most frequent label among its ``k`` closest training
    samples.

    Args:
        k: Number of neighbours that take part in the vote. Must be >= 1.
            If ``k`` exceeds the number of training samples, every training
            sample votes.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """

    def __init__(self, k: int = 3) -> None:
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def _euclidean_distance(self, x1, x2, axis=None) -> "float | np.ndarray":
        """Return the Euclidean distance between ``x1`` and ``x2``.

        Args:
            x1: First point (array-like).
            x2: Second point, or a stack of points that ``x1`` broadcasts
                against.
            axis: Axis or axes to sum the squared differences over. The
                default ``None`` sums over everything and yields a scalar.

        Returns:
            A scalar when ``axis`` is ``None``, otherwise an array of
            distances with the summed axes removed.
        """
        x1 = np.asarray(x1)
        x2 = np.asarray(x2)
        return np.sqrt(np.sum((x2 - x1) ** 2, axis=axis))

    def _get_most_common(self, x: np.ndarray) -> int:
        """Return the majority label among the ``k`` training samples nearest to ``x``.

        Ties in distance are resolved in favour of the training sample with
        the lower index (stable sort); ties in the vote are resolved in
        favour of the label that was encountered first, i.e. the label of
        the closest neighbour among the tied labels.
        """
        train = np.asarray(self.X_train)
        labels = np.asarray(self.y_train)
        # Reduce over every axis except the first, so one distance is
        # produced per training sample in a single vectorised call.
        per_sample_axes = tuple(range(1, train.ndim))
        distances = self._euclidean_distance(x, train, axis=per_sample_axes)
        # A stable sort keeps equidistant neighbours in training order, which
        # makes the selected neighbours deterministic across platforms.
        nearest = np.argsort(distances, kind="stable")[: self.k]
        return Counter(labels[nearest]).most_common(1)[0][0]

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Store the training samples and their labels.

        Args:
            X: Training samples, one per entry along the first axis
                (array-like).
            y: Labels, one per training sample (array-like).

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim == 0 or X.shape[0] == 0:
            raise ValueError("X must contain at least one training sample")
        if y.ndim == 0 or y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {X.shape[0]} and {y.shape[0] if y.ndim else 'a scalar'}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a label for every sample in ``X``.

        Args:
            X: Query samples, one per entry along the first axis (array-like).

        Returns:
            Array with one predicted label per query sample.
        """
        return np.array([self._get_most_common(x) for x in np.asarray(X)])

    def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Return the accuracy: the fraction of positions where the labels match.

        Args:
            y_true: Ground-truth labels (array-like).
            y_pred: Predicted labels (array-like) with the same shape.

        Raises:
            ValueError: If the two inputs do not have the same shape.
        """
        # Convert first: comparing two plain lists with ``==`` yields a single
        # bool instead of an element-wise result.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"y_true and y_pred must have the same shape, "
                f"got {y_true.shape} and {y_pred.shape}"
            )
        return float(np.mean(y_true == y_pred))

## Test the model

In [3]:
# Load the Iris dataset and hold out 20% of the samples for testing.
iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    shuffle=True,
    random_state=0,  # fixed seed so the split is the same on every run
)

In [4]:
# Train on the training split (k=3 is the class default, spelled out here),
# then label the held-out samples.
model = KNN(k=3)
model.fit(x_train, y_train)
pred = model.predict(x_test)

In [5]:
# Accuracy on the samples the model was fitted on versus the held-out samples.
train_score = model.score(y_train, model.predict(x_train))
test_score = model.score(y_test, pred)
print(f"Score on training data: {train_score}")
print(f"Score on testing data: {test_score}")

Score on training data: 0.95
Score on testing data: 0.9666666666666667
