In [1]:
import numpy as np
import pickle
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [2]:
def load_data(file):
    """Read a comma-separated text file of integer rows into feature/label arrays.

    Each non-blank line must hold comma-separated integers: the first value is
    the label, the remaining values are the features.

    Parameters
    ----------
    file : str or path-like
        Path of the text file to read.

    Returns
    -------
    X : numpy.ndarray
        Feature rows rescaled with ``x * 2 / 255 - 1`` (so 0 maps to -1 and
        255 maps to 1).
    Y : numpy.ndarray
        The integer labels, one per row.

    Raises
    ------
    ValueError
        If a non-blank line contains a field that is not an integer.
    """
    data, labels = [], []
    # Context manager guarantees the handle is closed even if parsing fails.
    with open(file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            content = [int(field) for field in line.split(',')]
            labels.append(content[0])
            data.append(content[1:])
    X = np.array(data)
    X = X * 2 / 255 - 1
    Y = np.array(labels)
    return X, Y
def load_dataset(training_set='mnist_train.txt', test_set='mnist_test.txt'):
    """Load the training and test files with ``load_data``.

    Returns a 4-tuple ``(training_X, training_Y, test_X, test_Y)``; the
    training file is read first, then the test file.
    """
    return (*load_data(training_set), *load_data(test_set))

In [3]:
# Load both splits from the default file names ('mnist_train.txt' and
# 'mnist_test.txt'); these four arrays are reused by every later cell.
training_X, training_Y, test_X, test_Y = load_dataset()

In [4]:
def calculate_accuracy(predictions, labels):
    """Return the fraction of predictions that equal the corresponding label.

    Parameters
    ----------
    predictions, labels : array-like
        Predicted and true labels; both must have the same shape. Plain
        Python sequences are accepted as well as NumPy arrays.

    Returns
    -------
    float
        Number of matching entries divided by the total number of entries.

    Raises
    ------
    ValueError
        If the two inputs differ in shape or are empty.
    """
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    if predictions.shape != labels.shape:
        raise ValueError(
            'predictions and labels must have the same shape, got %s and %s'
            % (predictions.shape, labels.shape)
        )
    if predictions.size == 0:
        raise ValueError('cannot compute the accuracy of an empty prediction set')
    # Count matches directly instead of computing 1 - error_rate: the
    # subtraction adds float round-off (e.g. 0.9390000000000001 for 939/1000).
    return np.count_nonzero(predictions == labels) / float(predictions.size)

In [5]:
# Baseline: an SVC with library-default hyper-parameters, fitted on the whole
# training set.
classifier = SVC()
classifier.fit(training_X, training_Y)
# Last expression of the cell, so the notebook displays the test accuracy.
calculate_accuracy(classifier.predict(test_X), test_Y)

0.922

In [6]:
def cross_validate(X, Y, folds, estimator, verbose=False):
    """Fit one copy of ``estimator`` per fold and return the best-scoring one.

    The samples are split, in their given order (no shuffling), into ``folds``
    contiguous validation blocks. When ``len(X)`` is not a multiple of
    ``folds`` the first ``len(X) % folds`` blocks receive one extra sample, so
    every sample is used for validation exactly once. For each block, a deep
    copy of ``estimator`` is fitted on the remaining samples and scored on the
    block with ``calculate_accuracy``.

    Parameters
    ----------
    X : array-like
        Feature rows, one sample per row.
    Y : array-like
        Labels; must have the same length as ``X``.
    folds : int
        Number of folds; must satisfy ``2 <= folds <= len(X)``.
    estimator : object
        Object exposing ``fit(X, Y)`` and ``predict(X)``. It is never fitted
        itself; only deep copies of it are.
    verbose : bool, optional
        When true, print the validation accuracy of every fold.

    Returns
    -------
    (float, estimator)
        The highest single-fold validation accuracy and the copy fitted on
        that fold's training portion (the first one on ties).
        NOTE: this is the best fold, not the mean over folds, so it is an
        optimistic estimate of generalisation accuracy.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` differ in length or ``folds`` is out of range.
    """
    from copy import deepcopy

    X = np.asarray(X)
    Y = np.asarray(Y)
    n_samples = len(X)
    if len(Y) != n_samples:
        raise ValueError(
            'X and Y must have the same length, got %d and %d' % (n_samples, len(Y))
        )
    if not 2 <= folds <= n_samples:
        raise ValueError(
            'folds must be between 2 and the number of samples (%d), got %r'
            % (n_samples, folds)
        )

    # Spread the remainder over the leading folds so that no trailing samples
    # are silently excluded from validation.
    base_length, remainder = divmod(n_samples, folds)

    def generate():
        start = 0
        for index in range(folds):
            end = start + base_length + (1 if index < remainder else 0)
            yield (
                np.concatenate((X[:start], X[end:])),
                np.concatenate((Y[:start], Y[end:])),
                X[start:end],
                Y[start:end],
            )
            start = end

    # Independent copies keep the caller's estimator unfitted and let the
    # winning fitted copy be returned afterwards.
    estimators = tuple(deepcopy(estimator) for _ in range(folds))
    accuracy = []
    for index, (fold, fold_estimator) in enumerate(zip(generate(), estimators)):
        X_fold, Y_fold, validation_X_fold, validation_Y_fold = fold
        fold_estimator.fit(X_fold, Y_fold)
        accuracy.append(
            calculate_accuracy(fold_estimator.predict(validation_X_fold), validation_Y_fold)
        )
        if verbose:
            print('validation fold %d accuracy %f' % (index, accuracy[-1]))
    optimal_accuracy = max(accuracy)
    return optimal_accuracy, estimators[accuracy.index(optimal_accuracy)]

In [7]:
# 5-fold cross-validation of a default SVC. `classifier` is rebound here to the
# copy returned by cross_validate (the one with the highest fold accuracy), and
# `accuracy` is that best fold's score rather than an average over folds.
accuracy, classifier = cross_validate(training_X, training_Y, 5, SVC(), verbose=True)
print('cross validation accuracy', accuracy)
# Score the selected fold model on the held-out test set.
test_accuracy = calculate_accuracy(classifier.predict(test_X), test_Y)
print('test accuracy', test_accuracy)

validation fold 0 accuracy 0.912500
validation fold 1 accuracy 0.915000
validation fold 2 accuracy 0.915000
validation fold 3 accuracy 0.920000
validation fold 4 accuracy 0.922500
cross validation accuracy 0.9225
test accuracy 0.923


In [8]:
# Grid search over C and gamma. Every gamma candidate is a small integer
# multiple of 1 / N_FEATURES.
N_FEATURES = 28 * 28  # presumably one feature per pixel of a 28x28 image -- TODO confirm
C_table = (1, 2, 3, 4)
gamma_multiplier_table = (1, 2, 3, 4)
gamma_table = [multiplier / N_FEATURES for multiplier in gamma_multiplier_table]
for C in C_table:
    for gamma in gamma_table:
        print('C = %f; gamma = %f;' % (C, gamma))
        # 5-fold cross-validation of this (C, gamma) pair; the returned
        # classifier is the best-scoring fold model.
        accuracy, classifier = cross_validate(
            training_X, training_Y, 5, SVC(C=C, gamma=gamma), verbose=False
        )
        print('cross validation accuracy', accuracy)
        test_accuracy = calculate_accuracy(classifier.predict(test_X), test_Y)
        print('test accuracy', test_accuracy)

C = 1.000000; gamma = 0.001276;
cross validation accuracy 0.9225
test accuracy 0.923
C = 1.000000; gamma = 0.002551;
cross validation accuracy 0.935
test accuracy 0.931
C = 1.000000; gamma = 0.003827;
cross validation accuracy 0.9425
test accuracy 0.937
C = 1.000000; gamma = 0.005102;
cross validation accuracy 0.95
test accuracy 0.942
C = 2.000000; gamma = 0.001276;
cross validation accuracy 0.9275
test accuracy 0.917
C = 2.000000; gamma = 0.002551;
cross validation accuracy 0.945
test accuracy 0.931
C = 2.000000; gamma = 0.003827;
cross validation accuracy 0.95
test accuracy 0.9390000000000001
C = 2.000000; gamma = 0.005102;
cross validation accuracy 0.955
test accuracy 0.946
C = 3.000000; gamma = 0.001276;
cross validation accuracy 0.935
test accuracy 0.921
C = 3.000000; gamma = 0.002551;
cross validation accuracy 0.95
test accuracy 0.9390000000000001
C = 3.000000; gamma = 0.003827;
cross validation accuracy 0.9525
test accuracy 0.938
C = 3.000000; gamma = 0.005102;
cross validation 

### Optimal Result
    C = 2.000000; gamma = 0.005102.
    cross validation accuracy 0.955
    test accuracy 0.946
    (The official website of the MNIST dataset reports a test error of 1.4% using a Gaussian-kernel SVM.)