Note: This notebook contains many print statements and verbosity settings to give the user (me) better feedback while it runs. The prints can obviously be removed, and you can adjust the verbosity as you please.

In [None]:
import numpy as np

In [None]:
# Download the MNIST dataset from OpenML (dataset name 'mnist_784', version 1)
from sklearn.datasets import fetch_openml
print("Importing dataset")
mnist = fetch_openml(name='mnist_784', version=1)
print("Finished importing dataset")

Importing dataset


In [None]:
# Pull the "data" and "target" entries out of the fetched dataset
X = mnist["data"]
y = mnist["target"]

In [None]:
# The data is already shuffled and split: the first 60000 rows are used
# for training and everything after that for testing
print("Splitting into training and test set")
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]
print("Finished splitting into training and test set")

In [None]:
# "global" variables (don't know best practice for this in python)

# Verbosity switches, left on for run-time feedback and debugging.
# Set verbose_num to 0 and verbose_bool to False for code validation.
verbose_num = 1
verbose_bool = True

# WARNING: set n_jobs to the number of processing cores you want to
# allocate to the task
n_jobs = 10

SVMs are binary classifiers, but scikit-learn's `LinearSVC` handles multi-class classification automatically using a one-vs-rest strategy.


`LinearSVC` seems to run into convergence issues with the default tolerance, so `tol` was changed to 0.1 and `dual` was set to `False` (because there are more instances than features).

In [None]:
# Build the classification pipeline: standardize the features, then fit a
# LinearSVC (chosen for speed on a large dataset)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

print("Creating Pipeline")
svm_clf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "LinearSVC",
            LinearSVC(
                dual=False,
                tol=.1,
                multi_class="ovr",
                verbose=verbose_num,
                random_state=42,
            ),
        ),
    ],
    verbose=verbose_bool,
)

print("Finished creating Pipeline")

In [None]:
# Fit svm_clf on the training split (X_train features, y_train labels);
# the prints bracket the call so its duration is visible when run
print("Training")
svm_clf.fit(X_train, y_train)
print("Finished training")

In [None]:
# 5-fold cross-validated accuracy on the full training set
# (it runs fast enough locally that the data was not subsampled)
from sklearn.model_selection import cross_val_score
print("Cross-validating")
scores = cross_val_score(
    svm_clf,
    X_train,
    y_train,
    cv=5,
    n_jobs=n_jobs,
    scoring="accuracy",
    verbose=verbose_num,
)
print("Average score: %.2f" % (scores.mean(),))
print("Stdev: %.2f" % (scores.std(),))
print("Finished cross-validating")

In [None]:
# Predict labels for the held-out test set and report the accuracy
from sklearn.metrics import accuracy_score
print("Predicting test set labels")
svm_clf_predictions = svm_clf.predict(X_test)
print("Finished predicting test set labels")
accuracy = accuracy_score(y_true=y_test, y_pred=svm_clf_predictions)
print("Accuracy: %.4f" % (accuracy,))

Turning verbosity off for RandomizedSearch because it isn't helpful.

In [None]:
# Find best hyperparameters and cross-validate using RandomizedSearchCV
# May or may not take forever
from sklearn.model_selection import RandomizedSearchCV
print("Searching for best hyperparameters")

# Candidate values to sample from. This is a list of two separate dicts, so
# each candidate varies either C or tol (never both at once); the other
# parameter keeps the value set on the estimator below.
param_dist = [
    {'C': [.01, .1, 1, 10, 100]},
    {'tol': [0.01, 0.25, 0.50, 0.75, 0.1]},
]

# NOTE(review): this rebinds svm_clf from the scaler+LinearSVC Pipeline to a
# bare LinearSVC, so the search below is fitted on unscaled features.
svm_clf = LinearSVC(dual=False, multi_class="ovr", random_state=42)

# Pass param_dist: the previous code referenced an undefined name (param_rnd)
# and raised NameError. random_state makes the candidate sampling repeatable.
rnd_search = RandomizedSearchCV(
    svm_clf,
    param_dist,
    cv=5,
    n_jobs=n_jobs,
    random_state=42,
)
rnd_search.fit(X_train, y_train)
print("Finished search for best hyperparameters")

In [None]:
# Report the outcome of the randomized search: the parameter setting it
# selected (best_params_) and the score recorded for it (best_score_)
print("Best hyperparameters:")
print(rnd_search.best_params_)
print("Best score:")
print(rnd_search.best_score_)

In [None]:
# Score the tuned model from the randomized search on the held-out test set
from sklearn.metrics import accuracy_score
print("Starting prediction of test set with best hyperparameters")
rnd_search_predictions = rnd_search.predict(X_test)
print("Finished prediction of test set")
accuracy = accuracy_score(y_true=y_test, y_pred=rnd_search_predictions)
print("Accuracy: %.4f" % (accuracy,))

I ran it locally and got accuracy = 0.9182. This seems low and is almost certainly underfitting; it could be that my hyperparameter ranges are bad (not unlikely, as I spaced the values an order of magnitude apart) or that a linear classifier just isn't complex enough for this task (almost certain). As I recall, k-nearest neighbors got around 97%, but took much longer.