## MISA (2024-2025)
- Alohan'ny mamerina dia avereno atao Run ny notebook iray manontolo. Ny fanaovana azy dia redémarrena mihitsy ny kernel aloha (jereo menubar, safidio **Kernel$\rightarrow$Restart Kernel and Run All Cells**).

- Izay misy hoe `YOUR CODE HERE` na `YOUR ANSWER HERE` ihany no fenoina. Afaka manampy cells vaovao raha ilaina. Aza adino ny mameno references eo ambany raha ilaina.

## References
Eto ilay references rehetra no apetraka

* https://datascientest.com/knn
* https://cendikiaishmatuka.medium.com/k-nearest-neighbors-knn-implementation-using-python-a1ea9d89f582

---

In [1]:
# AZA MANAMPY CODE ATO FA MANAOVA CELLULE VAOVAO

from random import randrange
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.datasets import load_diabetes, load_iris, load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

def rel_error(x, y):
    """Return the largest element-wise relative error between x and y."""
    gap = np.abs(x - y)
    # Floor the denominator at 1e-8 so two zero entries do not divide by zero.
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(gap / scale)

# K-Nearest Neighbor

## Computing distances

In [2]:
# Load the scikit-learn digits dataset and split it into train/test parts.
# 33% of the samples are held out for testing; random_state is fixed so the
# split is the same on every run.
data = load_digits()
X_train3, y_train3 = data.data, data.target
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_train3, y_train3, test_size=0.33, random_state=2)

def get_distances_two_loops_with_norm(X_train, X_test):
    """
    Reference Euclidean distance matrix built with np.linalg.norm.

    Inputs:
    - X_train: array of shape (num_train, D)
    - X_test: array of shape (num_test, D)

    Returns:
    - array of shape (num_test, num_train); entry [i, j] is the L2 norm of
      X_test[i] - X_train[j].
    """
    result = np.zeros((X_test.shape[0], X_train.shape[0]))
    for row, test_point in enumerate(X_test):
        for col, train_point in enumerate(X_train):
            result[row, col] = np.linalg.norm(test_point - train_point)
    return result

In [3]:
def get_distances_two_loops(X_train, X_test):
    """
    Euclidean distance between every test point and every training point,
    computed pair by pair with two nested Python loops.

    Inputs:
    - X_train: array of shape (num_train, D)
    - X_test: array of shape (num_test, D)

    Returns:
    - distances: array of shape (num_test, num_train), distances[i, j] is the
      Euclidean distance between the ith test point and the jth training point.
    """
    distances = np.zeros((X_test.shape[0], X_train.shape[0]))
    for row, test_point in enumerate(X_test):
        for col, train_point in enumerate(X_train):
            delta = test_point - train_point
            # sqrt of the sum of squared coordinate differences
            distances[row, col] = np.sqrt(np.sum(delta ** 2))
    return distances


In [4]:
# Sanity check: the hand-written two-loop distances must agree with the
# np.linalg.norm reference implementation on the digits split.
distances = get_distances_two_loops(X_train3, X_test3)
true_distances = get_distances_two_loops_with_norm(X_train3, X_test3)

# Frobenius norm of the difference: 0 means the two matrices are identical.
difference = np.linalg.norm(distances - true_distances, ord='fro')

print(difference)
assert difference < 1e-10

0.0


In [5]:
def compute_distances_one_loop(X_train, X_test):
    """
    Euclidean distance between every test point and every training point,
    looping over test points only and broadcasting against all training points.

    Inputs:
    - X_train: array of shape (num_train, D)
    - X_test: array of shape (num_test, D)

    Returns:
    - distances: array of shape (num_test, num_train), distances[i, j] is the
      Euclidean distance between the ith test point and the jth training point.
    """
    distances = np.zeros((X_test.shape[0], X_train.shape[0]))
    for row, test_point in enumerate(X_test):
        # Broadcasting subtracts the single test point from every training row.
        offsets = X_train - test_point
        distances[row] = np.sqrt((offsets ** 2).sum(axis=1))
    return distances


In [6]:
# Sanity check: the one-loop distances must agree with the np.linalg.norm
# reference implementation on the digits split.
distances = compute_distances_one_loop(X_train3, X_test3)
true_distances = get_distances_two_loops_with_norm(X_train3, X_test3)

# Frobenius norm of the difference: 0 means the two matrices are identical.
difference = np.linalg.norm(distances - true_distances, ord='fro')

print(difference)
assert difference < 1e-10

0.0


In [7]:
def get_distances_zero_loop(X_train, X_test):
    """
    Compute the distance between each test point in X_test and each training point
    in X_train without any explicit loops.

    Uses the expansion ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2 so the whole
    matrix comes from one matrix product plus broadcasting.

    Inputs:
    - X_train: array of shape (num_train, D)
    - X_test: array of shape (num_test, D)

    Returns:
    - distances: array of shape (num_test, num_train), where distances[i, j] is
      the Euclidean distance between the ith test point and the jth training point.
    """
    # Squared norms, shaped so that they broadcast to (num_test, num_train).
    test_squared = np.sum(X_test ** 2, axis=1).reshape(-1, 1)  # Shape (num_test, 1)
    train_squared = np.sum(X_train ** 2, axis=1).reshape(1, -1)  # Shape (1, num_train)

    # Dot product of every test point with every training point.
    cross_term = np.dot(X_test, X_train.T)  # Shape (num_test, num_train)

    squared_distances = test_squared - 2 * cross_term + train_squared

    # Floating-point cancellation can leave a tiny negative value where the
    # true squared distance is 0 (identical points); clamp it so that sqrt
    # returns 0 instead of NaN.
    return np.sqrt(np.maximum(squared_distances, 0))


In [8]:
# Sanity check: the fully vectorized distances must agree with the
# np.linalg.norm reference implementation on the digits split.
distances = get_distances_zero_loop(X_train3, X_test3)
true_distances = get_distances_two_loops_with_norm(X_train3, X_test3)

# Frobenius norm of the difference: 0 means the two matrices are identical.
difference = np.linalg.norm(distances - true_distances, ord='fro')

print(difference)
assert difference < 1e-10

0.0


## K-Nearest Neighbor (knn) classifier

In [43]:
class KNearestNeighborClassifier:
    """ kNN classifier using L2 distance """

    def __init__(self, k=1):
        """
        Inputs:
        - k: number of nearest neighbors that vote for the predicted labels.
        """
        self.k = k

    def fit(self, X, y):
        """
        Train the classifier. Just memorize the training data.

        Inputs are converted to NumPy arrays so that plain Python lists are
        accepted as well.

        Inputs:
        - X: array of shape (num_train, D)
        - y: array of shape (num_train,)
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X):
        """
        Predict labels for test data using this classifier.

        Inputs:
        - X: array of shape (num_test, D)

        Returns:
        - y: array of shape (num_test,)
        """
        distances = get_distances_zero_loop(self.X_train, np.asarray(X))
        return self.predict_labels(distances)

    def predict_labels(self, distances):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - distances: array of shape (num_test, num_train), distances[i, j] is
          the Euclidean distance between the ith test point and the jth
          training point.

        Returns:
        - y: array of shape (num_test,) with the same dtype as the training
          labels.
        """
        distances = np.asarray(distances)
        labels = np.asarray(self.y_train)
        num_test = distances.shape[0]
        # Use the label dtype so non-integer labels (e.g. strings) also work.
        y_pred = np.empty(num_test, dtype=labels.dtype)

        for i in range(num_test):
            # Indices of the k smallest distances; the slice simply returns
            # every training point when k exceeds num_train.
            nearest_indices = np.argsort(distances[i])[:self.k]

            # Labels of those neighbors.
            closest_y = labels[nearest_indices]

            # Majority vote. np.unique returns sorted labels and argmax takes
            # the first maximum, so a tie resolves to the smallest label.
            unique, counts = np.unique(closest_y, return_counts=True)
            y_pred[i] = unique[np.argmax(counts)]

        return y_pred


In [44]:
# Reference: scikit-learn's kNN classifier with 3 neighbors on the digits split.
sk_model = KNeighborsClassifier(n_neighbors=3)
sk_model.fit(X_train3, y_train3)
sk_pred = sk_model.predict(X_test3)
sk_accuracy = accuracy_score(y_test3, sk_pred)

# Same experiment with the hand-written KNearestNeighborClassifier.
model = KNearestNeighborClassifier(k=3)
model.fit(X_train3, y_train3)
pred = model.predict(X_test3)
model_accuracy = accuracy_score(y_test3, pred)

print("Accuracy scikit-learn:", sk_accuracy)
# NOTE(review): the label says "gradient descent model" but the value printed
# is the accuracy of the kNN classifier above — consider renaming the label.
print("Accuracy gradient descent model :", model_accuracy)
# One-sided check: fails only when the hand-written model is less accurate
# than scikit-learn's.
assert sk_accuracy - model_accuracy < 1e-10

Accuracy scikit-learn: 0.9831649831649831
Accuracy gradient descent model : 0.9831649831649831


## cross-validation

In [60]:

# Number of cross-validation folds and the candidate values of k to compare.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]


# Synthetic data: 100 samples with 5 uniform random features each.
# NOTE(review): no random seed is set, so the accuracies differ on every run.
X_train = np.random.rand(100, 5)  
y_train = np.random.randint(0, 2, 100)  # Labels (0 or 1)


# Split samples and labels into num_folds consecutive chunks.
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

# K-fold cross-validation
# Maps each k to the list of validation accuracies, one entry per fold.
k_to_accuracies = {}

for k in k_choices:
    accuracies = []  
    
    for fold in range(num_folds):
        # The current fold is the validation set ...
        X_valid = X_train_folds[fold]
        y_valid = y_train_folds[fold]
        
        # ... and all remaining folds are concatenated into the training set.
        X_train_cv = np.vstack([X_train_folds[i] for i in range(num_folds) if i != fold])
        y_train_cv = np.hstack([y_train_folds[i] for i in range(num_folds) if i != fold])
        
        # use kNN classifier
        knn = KNearestNeighborClassifier(k=k)
        knn.fit(X_train_cv, y_train_cv)
        
        y_pred = knn.predict(X_valid)
        
        # Fraction of validation points whose predicted label is correct.
        accuracy = np.mean(y_pred == y_valid)
        accuracies.append(accuracy)
    

    k_to_accuracies[k] = accuracies

# Summary: mean accuracy over the folds plus the per-fold values for each k.
for k in k_choices:
    print(f"k = {k}: Mean accuracy = {np.mean(k_to_accuracies[k]):.4f}, Accuracies = {k_to_accuracies[k]}")


[array([1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1]), array([1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1]), array([1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0]), array([0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0]), array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0])]
k = 1: Mean accuracy = 0.5300, Accuracies = [np.float64(0.6), np.float64(0.5), np.float64(0.5), np.float64(0.5), np.float64(0.55)]
k = 3: Mean accuracy = 0.5400, Accuracies = [np.float64(0.55), np.float64(0.5), np.float64(0.45), np.float64(0.7), np.float64(0.5)]
k = 5: Mean accuracy = 0.6000, Accuracies = [np.float64(0.55), np.float64(0.45), np.float64(0.65), np.float64(0.8), np.float64(0.55)]
k = 8: Mean accuracy = 0.5100, Accuracies = [np.float64(0.5), np.float64(0.4), np.float64(0.55), np.float64(0.75), np.float64(0.35)]
k = 10: Mean accuracy = 0.5200, Accuracies = [np.float64(0.45), np.float64(0.5), np.float64(0.5), np.float64(0.8), np.float6

In [14]:
# Print one line per (k, fold) pair, with k in ascending order.
for k, fold_accuracies in sorted(k_to_accuracies.items()):
    for accuracy in fold_accuracies:
        print('k = %d, accuracy = %f' % (k, accuracy))

k = 1, accuracy = 0.700000
k = 1, accuracy = 0.700000
k = 1, accuracy = 0.500000
k = 1, accuracy = 0.450000
k = 1, accuracy = 0.400000
k = 3, accuracy = 0.650000
k = 3, accuracy = 0.750000
k = 3, accuracy = 0.550000
k = 3, accuracy = 0.450000
k = 3, accuracy = 0.450000
k = 5, accuracy = 0.700000
k = 5, accuracy = 0.800000
k = 5, accuracy = 0.700000
k = 5, accuracy = 0.500000
k = 5, accuracy = 0.500000
k = 8, accuracy = 0.600000
k = 8, accuracy = 0.600000
k = 8, accuracy = 0.650000
k = 8, accuracy = 0.550000
k = 8, accuracy = 0.400000
k = 10, accuracy = 0.600000
k = 10, accuracy = 0.800000
k = 10, accuracy = 0.600000
k = 10, accuracy = 0.550000
k = 10, accuracy = 0.350000
k = 12, accuracy = 0.600000
k = 12, accuracy = 0.750000
k = 12, accuracy = 0.650000
k = 12, accuracy = 0.550000
k = 12, accuracy = 0.350000
k = 15, accuracy = 0.550000
k = 15, accuracy = 0.700000
k = 15, accuracy = 0.650000
k = 15, accuracy = 0.500000
k = 15, accuracy = 0.350000
k = 20, accuracy = 0.600000
k = 20, accu