### MNIST Digits - Classification Using KNN


In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

import torchvision

# Both MNIST splits are stored under the same root directory.
DATA_ROOT = '../../data/'

# Training split; download=True asks torchvision to fetch the files into
# DATA_ROOT when needed.
train_dataset = torchvision.datasets.MNIST(
    root=DATA_ROOT,
    train=True,
    download=True,
)

# Held-out test split. No download flag is passed here -- presumably this
# relies on the call above having already populated DATA_ROOT.
test_dataset = torchvision.datasets.MNIST(root=DATA_ROOT, train=False)

In [3]:

# Number of leading samples taken from each split. Only a subset is used,
# presumably to keep KNN fitting and the grid search fast.
N_SAMPLES = 5000

# `.data` / `.targets` are used instead of the deprecated `train_data`,
# `train_labels`, `test_data` and `test_labels` accessors, which emit
# deprecation warnings in current torchvision releases.

# Flatten every image into one feature row:
# (N_SAMPLES, 28, 28) -> (N_SAMPLES, 784)
training_data = train_dataset.data[:N_SAMPLES].numpy().reshape(N_SAMPLES, -1)
training_label = train_dataset.targets[:N_SAMPLES].numpy()

# Same preparation for the first N_SAMPLES images of the test split.
test_data = test_dataset.data[:N_SAMPLES].numpy().reshape(N_SAMPLES, -1)
test_label = test_dataset.targets[:N_SAMPLES].numpy()



In [4]:
# Report the shapes of the training and test arrays.
# The last two lines previously carried "Training" labels even though they
# print the test arrays; the labels now match the data they describe.
print('Training data size: ', training_data.shape)
print('Training data label size:', training_label.shape)
print('Test data size: ', test_data.shape)
print('Test data label size:', test_label.shape)

Training data size:  (5000, 784)
Training data label size: (5000,)
Training data size:  (5000, 784)
Training data label size: (5000,)


In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier: no arguments are passed, so the
# library's default hyperparameters are used.
knn = KNeighborsClassifier()

# Fit on the flattened training images. The call is left as the cell's last
# expression so the notebook displays the fitted estimator.
knn.fit(X=training_data, y=training_label)


In [6]:
from sklearn import metrics

# Baseline evaluation: predict labels for the test samples with the untuned
# classifier, then score those predictions against the true test labels.
y_pred = knn.predict(test_data)
baseline_accuracy = metrics.accuracy_score(test_label, y_pred)

print("Accuracy without best param:", baseline_accuracy, "\n")


Accuracy without best param: 0.906 



Find the best hyperparameters of the model using a cross-validated grid search

In [7]:
# 5-fold cross-validation splitter; shuffling is enabled with a fixed
# random_state so the same folds are produced on every run.
folds = KFold(n_splits=5, shuffle=True, random_state=10)

# Search space: each weighting scheme combined with each neighbourhood size
# (2 x 4 = 8 candidates).
hyper_params = [
    {
        "weights": ["uniform", "distance"],
        "n_neighbors": [2, 3, 5, 7],
    }
]

# Grid search over the KNN estimator, scored by accuracy on the folds above.
# return_train_score=True additionally records the scores on the training
# folds in the search results.
model_cv = GridSearchCV(
    estimator=knn,
    param_grid=hyper_params,
    scoring='accuracy',
    cv=folds,
    verbose=1,
    return_train_score=True,
)


In [8]:
# Run the grid search on the training subset only.
model_cv.fit(X=training_data, y=training_label)

# Pull out the winning parameter combination and its score.
# NOTE: the search is fitted on training_data alone, so the "test score"
# reported below comes from the cross-validation folds, not from test_data.
best_hyperparams = model_cv.best_params_
best_score = model_cv.best_score_

summary = "The best test score is {0} corresponding to hyperparameters {1}".format(best_score, best_hyperparams)
print(summary)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
The best test score is 0.942 corresponding to hyperparameters {'n_neighbors': 3, 'weights': 'distance'}
