In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import gc
import cv2

In [22]:
# read the dataset
digits = pd.read_csv("mnist_train.csv")
digits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 785 entries, label to 28x28
dtypes: int64(785)
memory usage: 359.3 MB


In [24]:
# Summarise the counts of 'label' to see how many labels of each digit are present
digits.label.value_counts()

1    6742
7    6265
3    6131
2    5958
9    5949
0    5923
6    5918
8    5851
4    5842
5    5421
Name: label, dtype: int64

In [27]:
# average values/distributions of features
description = digits.describe()
description

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,...,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,4.453933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.200433,0.088867,0.045633,0.019283,0.015117,0.002,0.0,0.0,0.0,0.0
std,2.88927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.042472,3.956189,2.839845,1.68677,1.678283,0.3466,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,254.0,253.0,253.0,254.0,62.0,0.0,0.0,0.0,0.0


In [28]:
# Creating training and test sets
# Splitting the data into train and test
X = digits.iloc[:, 1:]
Y = digits.iloc[:, 0]

# Rescaling the features
from sklearn.preprocessing import scale
X = scale(X)

# train test split with train_size=10% and test size=90%
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.10, random_state=101)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(6000, 784)
(54000, 784)
(6000,)
(54000,)


In [29]:
from sklearn import svm
from sklearn import metrics

# an initial SVM model with linear kernel   
svm_linear = svm.SVC(kernel='linear')

# fit
svm_linear.fit(x_train, y_train)

SVC(kernel='linear')

In [30]:
# predict
predictions = svm_linear.predict(x_test)
predictions[:10]

array([7, 4, 9, 6, 1, 1, 8, 0, 3, 2], dtype=int64)

In [31]:
# evaluation: accuracy
# C(i, j) represents the number of points known to be in class i 
# but predicted to be in class j
confusion = metrics.confusion_matrix(y_true = y_test, y_pred = predictions)
confusion

array([[5165,    4,   38,   12,   15,   29,   50,    5,   17,    0],
       [   1, 5946,   22,   12,    7,    8,    4,   11,   24,   12],
       [  71,   67, 4880,   84,   61,   15,   54,   47,   62,   11],
       [  43,   57,  184, 4770,   13,  230,    8,   43,  120,   46],
       [  21,   26,   64,   10, 4824,    8,   23,   19,   14,  257],
       [  82,   56,   54,  244,   63, 4141,   82,   10,  105,   38],
       [  54,   19,   65,    5,   38,   69, 5062,    3,   15,    1],
       [  13,   54,   90,   40,  120,    6,    1, 5130,   11,  194],
       [  44,  176,  104,  192,   34,  190,   42,   29, 4417,   36],
       [  27,   22,   37,   67,  200,   26,    2,  185,   41, 4750]],
      dtype=int64)

In [32]:
# measure accuracy












metrics.accuracy_score(y_true=y_test, y_pred=predictions)

0.9089814814814815

In [33]:
# class-wise accuracy
class_wise = metrics.classification_report(y_true=y_test, y_pred=predictions)
print(class_wise)

              precision    recall  f1-score   support

           0       0.94      0.97      0.95      5335
           1       0.93      0.98      0.95      6047
           2       0.88      0.91      0.90      5352
           3       0.88      0.87      0.87      5514
           4       0.90      0.92      0.91      5266
           5       0.88      0.85      0.86      4875
           6       0.95      0.95      0.95      5331
           7       0.94      0.91      0.92      5659
           8       0.92      0.84      0.88      5264
           9       0.89      0.89      0.89      5357

    accuracy                           0.91     54000
   macro avg       0.91      0.91      0.91     54000
weighted avg       0.91      0.91      0.91     54000



In [34]:
# run gc.collect() (garbage collect) to free up memory
# else, since the dataset is large and SVM is computationally heavy,
# it'll throw a memory error while training
gc.collect()

113

In [35]:
# rbf kernel with other hyperparameters kept to default 
svm_rbf = svm.SVC(kernel='rbf')
svm_rbf.fit(x_train, y_train)

SVC()

In [36]:
# predict
predictions = svm_rbf.predict(x_test)

# accuracy 
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.927962962962963
