# Training an SVM classifier on the MNIST dataset

In [1]:
## importing imp libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml


In [2]:
## loading mnist dataset
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [3]:
X = mnist.data
y = mnist.target

In [10]:
## splitting training and testing data
X_train = X[:6000]
y_train = y[:6000]
X_test =X[6000:7000]
y_test =y[6000:7000]

Here, I am taking only 7000 datasets because its LinearSVC is taking too long time to train

In [11]:
print("X_train, X_test:", X_train.shape, X_test.shape)
print("y_train, y_test:", y_train.shape, y_test.shape)

X_train, X_test: (6000, 784) (1000, 784)
y_train, y_test: (6000,) (1000,)


As, we know that most of the training algorithms are highly sensitive to the order of training instances. Therefore we are required to suffle training data

In [7]:
## shuffling the training data
np.random.seed(42)
rnd_idx = np.random.permutation(600)

X_train = X_train[rnd_idx]
y_train = y_train[rnd_idx]

## Linear SVM classifier

It will automatically use the One-vs-All (also called One-vs-the-Rest, OvR) strategy, so there's nothing special we need to do.

In [12]:
from sklearn.svm import LinearSVC
lin_clf = LinearSVC(random_state=42)
lin_clf.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
          verbose=0)

### Predictions on training set and measuring the accuracy


Let's make predictions on the training set and measure the accuracy (we don't want to measure it on the test set yet, since we have not selected and trained the final model yet):

In [13]:
from sklearn.metrics import accuracy_score

y_pred = lin_clf.predict(X_train)
accuracy_score(y_train, y_pred)

0.9826666666666667

## Standardizing the training and testing data


In [15]:
## Standardizing the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
X_test_scaled = scaler.transform(X_test.astype(np.float32))

### Again, training and fitting the data in LinearSVC

In [16]:
lin_clf = LinearSVC(random_state=42)
lin_clf.fit(X_train_scaled, y_train)






LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
          verbose=0)

In [17]:
y_pred_scaled = lin_clf.predict(X_train_scaled)
accuracy_score(y_train, y_pred_scaled)

0.9971666666666666

That's much better (we cut the error rate in two), but still not great at all for MNIST. If we want to use an SVM, we will have to use a kernel. Let's try an SVC with an RBF kernel (the default).

### Training with SVC with an RBF kernel

In [22]:
from sklearn.svm import SVC

svm_clf = SVC(decision_function_shape = "ovr", gamma ='auto')
svm_clf.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [23]:
y_pred_SVC = svm_clf.predict(X_train_scaled)
accuracy_score(y_train, y_pred_SVC)


0.9806666666666667

### Now, let's tune the hyperparameters by doing a randomized search with cross validation.

In [24]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform


In [25]:
param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(svm_clf, param_distributions, n_iter=10, verbose=2, cv=3)
rnd_search_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=9.369638742373738, gamma=0.05416732770780322 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ... C=9.369638742373738, gamma=0.05416732770780322, total=  40.2s
[CV] C=9.369638742373738, gamma=0.05416732770780322 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.1s remaining:    0.0s


[CV] ... C=9.369638742373738, gamma=0.05416732770780322, total=  39.5s
[CV] C=9.369638742373738, gamma=0.05416732770780322 ..................
[CV] ... C=9.369638742373738, gamma=0.05416732770780322, total=  38.5s
[CV] C=9.38480763763985, gamma=0.007115131812629816 ..................
[CV] ... C=9.38480763763985, gamma=0.007115131812629816, total=  32.7s
[CV] C=9.38480763763985, gamma=0.007115131812629816 ..................
[CV] ... C=9.38480763763985, gamma=0.007115131812629816, total=  34.1s
[CV] C=9.38480763763985, gamma=0.007115131812629816 ..................
[CV] ... C=9.38480763763985, gamma=0.007115131812629816, total=  32.6s
[CV] C=3.2257641757103053, gamma=0.006213026470310396 ................
[CV] . C=3.2257641757103053, gamma=0.006213026470310396, total=  30.6s
[CV] C=3.2257641757103053, gamma=0.006213026470310396 ................
[CV] . C=3.2257641757103053, gamma=0.006213026470310396, total=  31.3s
[CV] C=3.2257641757103053, gamma=0.006213026470310396 ................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 14.5min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                 coef0=0.0, decision_function_shape='ovr',
                                 degree=3, gamma='auto', kernel='rbf',
                                 max_iter=-1, probability=False,
                                 random_state=None, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001E927690E88>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001E91B596A48>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=2)

In [26]:
rnd_search_cv.best_estimator_


SVC(C=9.918969087767767, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001964308262295563,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [27]:
rnd_search_cv.best_score_


0.9251666666666667

In [28]:
rnd_search_cv.best_estimator_.fit(X_train_scaled, y_train)


SVC(C=9.918969087767767, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001964308262295563,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [29]:
y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

1.0

Not too bad, but apparently the model is overfitting slightly. It's tempting to tweak the hyperparameters a bit more (e.g. decreasing C and/or gamma), but we would run the risk of overfitting the test set.

