In [1]:
import os

MNIST_PATH = os.path.join("datasets", "mnist")

In [2]:
import pandas as pd

def load_mnist_data(filename, mnist_path=MNIST_PATH):
    csv_path = os.path.join(mnist_path, filename)
    return pd.read_csv(csv_path)

In [3]:
train_data = load_mnist_data('train.csv')
test_data = load_mnist_data('test.csv')

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB


In [6]:
X_test = test_data[:]
X_test.shape

(28000, 784)

In [7]:
X_train = train_data[:]
X_train.shape

(42000, 785)

In [10]:
y_train = train_data["label"]

In [13]:
y_train.shape

(42000,)

In [14]:
X_train = train_data.drop("label", axis = 1)
X_train.shape

(42000, 784)

In [15]:
y_train.shape

(42000,)

In [16]:
X_test.shape

(28000, 784)

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]

knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_neighbors=3, weights=uniform ..................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... n_neighbors=3, weights=uniform, score=0.970, total=14.9min
[CV] n_neighbors=3, weights=uniform ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 14.9min remaining:    0.0s


[CV] ...... n_neighbors=3, weights=uniform, score=0.967, total=15.0min
[CV] n_neighbors=3, weights=uniform ..................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 29.8min remaining:    0.0s


[CV] ...... n_neighbors=3, weights=uniform, score=0.963, total=15.3min
[CV] n_neighbors=3, weights=uniform ..................................
[CV] ...... n_neighbors=3, weights=uniform, score=0.964, total=15.1min
[CV] n_neighbors=3, weights=uniform ..................................
[CV] ...... n_neighbors=3, weights=uniform, score=0.969, total=15.3min
[CV] n_neighbors=3, weights=distance .................................
[CV] ..... n_neighbors=3, weights=distance, score=0.970, total=15.4min
[CV] n_neighbors=3, weights=distance .................................
[CV] ..... n_neighbors=3, weights=distance, score=0.969, total=13.7min
[CV] n_neighbors=3, weights=distance .................................
[CV] ..... n_neighbors=3, weights=distance, score=0.964, total= 6.0min
[CV] n_neighbors=3, weights=distance .................................
[CV] ..... n_neighbors=3, weights=distance, score=0.966, total= 6.0min
[CV] n_neighbors=3, weights=distance .................................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 271.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid=[{'n_neighbors': [3, 4, 5],
                          'weights': ['uniform', 'distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [22]:
grid_search.best_params_

{'n_neighbors': 4, 'weights': 'distance'}

In [23]:
grid_search.best_score_

0.9691190476190477

In [24]:
final_model_knn = grid_search.best_estimator_

In [26]:
y_pred_knn = final_model_knn.predict(X_test)

### row index start from 1 instead of 0

In [38]:
import numpy as np
X_test_new = np.arange(1,len(X_test) + 1)

In [40]:
submission = pd.DataFrame({'ImageId':X_test_new[:],'Label':y_pred_knn})

#Visualize the first 5 rows
submission.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3


In [41]:
filename1 = 'mnist_predictions_1.csv'

submission.to_csv(filename1,index=False)

print('Saved file: ' + filename1)

Saved file: mnist_predictions_1.csv
