 1. Try to build a classifier for the MNIST dataset that achieves over 97%
 accuracy on the test set. Hint: the `KNeighborsClassifier` works quite
 well for this task; you just need to find good hyperparameter values (try
 a grid search on the `weights` and `n_neighbors` hyperparameters).

In [5]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [6]:
# Download MNIST from OpenML. as_frame=False asks for plain NumPy arrays
# rather than a pandas DataFrame; each sample is a flattened 28x28 image.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

# X holds the pixel data, y the digit each picture shows.
# The labels are cast to uint8 so they are numeric from here on.
X = mnist["data"]
y = mnist["target"].astype(np.uint8)

# Seeded random 80/20 train/test split (random_state makes it repeatable).
# NOTE: no stratify= is passed, so per-class proportions in the two parts
# are not forced to match.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Baseline: an untuned k-nearest-neighbours classifier with scikit-learn's
# defaults (n_neighbors=5, weights='uniform').
# fit() returns the estimator itself, so construction and training are chained.
knn_clf = KNeighborsClassifier().fit(X_train, y_train)

# score() gives the mean accuracy on the held-out test split; this is the
# number the exercise wants above 97%.
baseline_accuracy = knn_clf.score(X_test, y_test)
print(f"Baseline Accuracy: {baseline_accuracy * 100:.2f}%")

Baseline Accuracy: 97.01%


In [9]:
# Candidate hyperparameters: 2 weighting schemes x 4 neighbour counts
# = 8 combinations to compare.
param_grid = [
    {
        'weights': ["uniform", "distance"],
        'n_neighbors': [3, 4, 5, 6],
    }
]

# A fresh, unfitted estimator for the search.
# NOTE: this rebinds knn_clf, so the name no longer refers to a fitted model.
knn_clf = KNeighborsClassifier()

# cv=5 -> 5-fold cross-validation: the data is split into 5 parts and every
# combination is trained 5 times, each time validated on a different part.
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=5)

# Only the first 10,000 training samples are searched, to keep the run time
# manageable.
grid_search.fit(
    X_train[:10_000],
    y_train[:10_000],
)

# Report the winning combination and its mean cross-validated accuracy.
# This score is likely lower than what the full training set would give,
# because each fold trains on only ~8,000 samples.
print("Best parameters found: ", grid_search.best_params_)
print(f"Best cross-validation score: {grid_search.best_score_ * 100:.2f}%")

Best parameters found:  {'n_neighbors': 4, 'weights': 'distance'}
Best cross-validation score: 94.83%
