In [1]:
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import multiprocessing

In [2]:
class KNN:
    """Brute-force k-nearest-neighbours classifier (squared Euclidean distance).

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each prediction. Must be >= 1.
    data_size : int or None, optional
        If given, ``fit`` keeps only the first ``data_size`` rows of the
        training data; ``None`` keeps everything.
    """

    def __init__(self, k, data_size=None):
        # Fail early: with k < 1 the neighbour slice is empty and the vote
        # would only blow up later, deep inside predict().
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k
        self.data_size = data_size

    def fit(self, X, y):
        """Store the training set, truncated to ``data_size`` rows if set.

        Parameters
        ----------
        X : array-like, indexed by row
            Training features.
        y : array-like
            Training labels, aligned with the rows of ``X``.
        """
        if self.data_size is not None:
            X = X[:self.data_size]
            y = y[:self.data_size]
        self.X_train = X
        self.y_train = y

    def _predict_one(self, x):
        """Return the majority label among the ``k`` training rows nearest to ``x``."""
        # Squared distances preserve the ordering of true Euclidean distances,
        # so the square root is skipped.
        distances = np.sum((self.X_train - x) ** 2, axis=1)
        nearest = np.argsort(distances)[:self.k]
        # np.unique (unlike np.bincount) accepts any sortable label type, not
        # just non-negative integers, and hands back the label itself. It
        # returns labels sorted, so argmax resolves ties to the smallest label.
        labels, counts = np.unique(self.y_train[nearest], return_counts=True)
        return labels[np.argmax(counts)]

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Rows are classified independently, one joblib task per row, using as
        many jobs as ``multiprocessing.cpu_count()`` reports.

        Parameters
        ----------
        X : numpy.ndarray
            Two-dimensional array of test features, one sample per row.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``X``.
        """
        num_test = X.shape[0]
        if num_test == 0:
            # Nothing to classify: skip the worker pool and keep the label dtype.
            return np.empty(0, dtype=self.y_train.dtype)

        num_cores = multiprocessing.cpu_count()
        y_pred = Parallel(n_jobs=num_cores)(
            delayed(self._predict_one)(X[i, :]) for i in range(num_test)
        )
        return np.array(y_pred)

In [3]:
def load_split(csv_path):
    """Read one dataset CSV and return ``(features, labels)``.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file containing a ``'label'`` column; every other
        column is treated as a feature.

    Returns
    -------
    features : numpy.ndarray
        float32 array of all non-label columns, divided by 255.0.
    labels : numpy.ndarray
        Values of the ``'label'`` column.
    """
    frame = pd.read_csv(csv_path)

    # Extract the labels from the data
    labels = frame['label'].values

    # Normalize the pixel values. Features are chosen by dropping 'label' by
    # name, so the label can never leak into the feature matrix regardless of
    # where that column sits in the file.
    features = (frame.drop(columns='label').values / 255.0).astype(np.float32)
    return features, labels


# Read the data; train_data / test_data are only ever bound to float32 arrays.
train_data, train_labels = load_split('train.csv')
test_data, test_labels = load_split('test.csv')


In [4]:
# Define a list of k values to try
k_values = [1, 3, 5, 7, 9]

# Perform k-fold cross validation to select the best k value
num_folds = 5
# Integer division: when the row count is not a multiple of num_folds, the
# trailing rows are never used for validation (they stay in every training fold).
fold_size = train_data.shape[0] // num_folds
# Each fold is fitted on only the first 1000 rows of its training split.
cv_train_size = 1000

# fold_accuracies[j] collects one accuracy per fold for k_values[j]
fold_accuracies = [[] for _ in k_values]

# Folds form the outer loop so each split (two full-array copies from
# np.concatenate) is built once per fold instead of once per (k, fold) pair.
for i in range(num_folds):
    # Split the data into training and validation sets
    start = i * fold_size
    end = (i + 1) * fold_size
    val_data = train_data[start:end]
    val_labels = train_labels[start:end]
    train_data_fold = np.concatenate([train_data[:start], train_data[end:]])
    train_labels_fold = np.concatenate([train_labels[:start], train_labels[end:]])

    for k_index, k in enumerate(k_values):
        # Train the model on the training data
        knn = KNN(k=k, data_size=cv_train_size)
        knn.fit(train_data_fold, train_labels_fold)

        # Evaluate the model on the validation data
        y_val_pred = knn.predict(val_data)
        fold_accuracy = np.mean(y_val_pred == val_labels)
        fold_accuracies[k_index].append(fold_accuracy)

# Compute the average accuracy across all folds for each k value
accuracies = [np.mean(per_fold) for per_fold in fold_accuracies]

# Select the best k value based on the highest validation accuracy
# (np.argmax returns the first maximum, so ties go to the smallest k listed).
best_k = k_values[np.argmax(accuracies)]
print("Best k value:", best_k)

Best k value: 1


In [5]:
# Refit on the complete training set with the k chosen by cross-validation
knn = KNN(k=best_k)
knn.fit(train_data, train_labels)

# Predict a label for every test sample
y_pred = knn.predict(test_data)

# Accuracy is the fraction of test predictions that match the true labels
is_correct = y_pred == test_labels
accuracy = np.mean(is_correct)
print("Accuracy:", accuracy)

Accuracy: 0.96175
