# 🎯 K-Nearest Neighbors (KNN): Complete Implementation

## Learning Objectives
1. Understand distance metrics mathematically
2. Implement KNN for classification & regression
3. Optimize with k-d trees
4. Visualize decision boundaries


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import make_classification, make_moons
from sklearn.model_selection import train_test_split
# Reaching this line means every import above succeeded.
print('✅ Setup complete!')


---
# Chapter 1: Distance Metrics

## Euclidean Distance
$$d(\mathbf{x}, \mathbf{y}) = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^2}$$

## Manhattan Distance
$$d(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^{n} |x_i - y_i|$$

## Minkowski Distance (generalization)
$$d(\mathbf{x}, \mathbf{y}) = \left(\sum_{i=1}^{n} |x_i - y_i|^p\right)^{1/p}$$

Setting $p = 1$ recovers the Manhattan distance and $p = 2$ recovers the Euclidean distance.


In [None]:
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote on each prediction.
    metric : {'euclidean', 'manhattan', 'minkowski'}, default 'euclidean'
        Distance function used to rank training samples.
    weighted : bool, default False
        If True, each neighbour's vote is weighted by the inverse of its
        distance; otherwise every neighbour counts once.
    p : float, default 3
        Order of the Minkowski distance. Only used when
        ``metric='minkowski'``.
    """

    def __init__(self, k=3, metric='euclidean', weighted=False, p=3):
        self.k = k
        self.metric = metric
        self.weighted = weighted
        self.p = p

    def _distance(self, x1, x2):
        """Return the distance between ``x1`` and ``x2``.

        The reduction runs over the last axis, so two feature vectors give a
        scalar, while a vector against a ``(n_samples, n_features)`` matrix
        broadcasts to one distance per row.

        Raises
        ------
        ValueError
            If ``self.metric`` is not a supported name, or if the Minkowski
            order ``self.p`` is not positive.
        """
        delta = np.asarray(x1) - np.asarray(x2)
        if self.metric == 'euclidean':
            return np.sqrt(np.sum(delta ** 2, axis=-1))
        if self.metric == 'manhattan':
            return np.sum(np.abs(delta), axis=-1)
        if self.metric == 'minkowski':
            if self.p <= 0:
                raise ValueError(f"Minkowski order p must be positive, got {self.p!r}")
            return np.sum(np.abs(delta) ** self.p, axis=-1) ** (1 / self.p)
        raise ValueError(
            f"Unknown metric {self.metric!r}; expected 'euclidean', "
            "'manhattan' or 'minkowski'"
        )

    def fit(self, X, y):
        """Store the training data and return ``self``.

        ``X`` and ``y`` may be any array-like; they are converted to NumPy
        arrays so that the neighbour labels can be gathered by index.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1, the training set is empty, or ``X``
            and ``y`` have different lengths.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim == 1:
            # A flat sequence is treated as n samples with a single feature.
            X = X.reshape(-1, 1)
        if self.k < 1:
            raise ValueError(f"k must be at least 1, got {self.k!r}")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty training set")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y
        return self

    def predict(self, X):
        """Return an array with the predicted label for every sample in ``X``."""
        return np.array([self._predict_single(x) for x in np.asarray(X)])

    def _predict_single(self, x):
        """Predict the label of one sample by voting among its k neighbours."""
        # One broadcast call gives the distance to every training sample.
        distances = self._distance(np.atleast_1d(x), self.X_train)

        # A stable sort keeps equidistant samples in training order, so the
        # neighbour set (and any tie-break below) is deterministic.
        nearest = np.argsort(distances, kind='stable')[:self.k]
        nearest_labels = self.y_train[nearest]

        if not self.weighted:
            # On a tie, Counter returns the label encountered first, i.e. the
            # one whose closest member is nearest to ``x``.
            return Counter(nearest_labels).most_common(1)[0][0]

        # The small epsilon avoids division by zero for exact matches.
        weights = 1 / (distances[nearest] + 1e-10)
        vote_totals = {}
        for label, weight in zip(nearest_labels, weights):
            vote_totals[label] = vote_totals.get(label, 0) + weight
        return max(vote_totals, key=vote_totals.get)


In [None]:
# Test on moons dataset
X, y = make_moons(n_samples=200, noise=0.1, random_state=42)
# Seed the split as well as the data generation; otherwise the reported
# accuracy changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

knn = KNN(k=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc = np.mean(y_pred == y_test)
print(f'Test Accuracy: {acc:.4f}')

# Visualize decision boundary
# Classify every point of a dense grid that extends one unit beyond the data
# in each direction; h is the grid spacing.
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)  # back to grid shape for contourf

plt.figure(figsize=(10, 7))
plt.contourf(xx, yy, Z, alpha=0.3, cmap='viridis')
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='k')
plt.title(f'KNN Decision Boundary (k={knn.k})')
plt.show()
