In [1]:
from sklearn.datasets import fetch_openml
import numpy as np
mnist = fetch_openml('mnist_784',version = 1, as_frame= False) #don't need a df - this also speeds the fetch process
mnist.keys()
X,y = mnist["data"], mnist["target"]
print(f"X: {X.shape} Y: {y.shape}")
y=y.astype(np.uint8)

#mnist784 is already in random order, so just take the final 10k out of 70 k as test set
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

X: (70000, 784) Y: (70000,)


In [2]:
#small scale training set for model

# SVM classification on MNIST dataset
* One vs rest approach



In [3]:
# #binary classifier targets for digits from 0 to 9
# y_trains = [] 
# y_tests=[]
# for _ in range(0,10):
#     y_trains.append(y_train == _)
#     y_tests.append(y_test == _)
# print("debug")

In [4]:
#do NOT use SVC - O(m^2n) - just rely on LinearSVC - i.e. not using non-linear kernel

# from sklearn.svm import SVC
# from sklearn.preprocessing import StandardScaler
# from sklearn.multiclass import OneVsRestClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import cross_val_score

# svc_pipeline = Pipeline([
#     ("scaler",StandardScaler()),
#     ("svc",SVC(kernel="linear"))
# ])

# clf = OneVsRestClassifier(svc_pipeline,n_jobs=4).fit(X_train,y_train)
# clf.predict(X_test[0])
# # svc_s = []
# # for _ in range(0,10):
# #     svc_s.append(svc_pipeline.fit(X_train,y_trains[_]))
# # print("done")


In [5]:
from sklearn.svm import LinearSVC #uses OVR by default
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

lsvc_clf = Pipeline([
    ("scaler",StandardScaler()),
    ("lsvc",LinearSVC(random_state=42))
])

#testing on the training set to see how well the model fits.
lsvc_clf.fit(X_train,y_train)
y_pred = lsvc_clf.predict(X_train)
accuracy_score(y_train,y_pred)




0.9214

* Trying with non-linear kernel
* SVC uses OVR and RBF kernel by default
* Train on 10000 samples to speed model selection since up to  Om^3 where m = training instances - LinearSVC is O(m)
* BUT test on full test set

In [6]:
from sklearn.svm import SVC #uses OVR by default
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


pl_svc = Pipeline ([
    ("scaler",StandardScaler()),
    ("svc", SVC())
]) #ovr and RBF by default

pl_svc.fit(X_train[:10000],y_train[:10000])

y_pred = pl_svc.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.9378


In [7]:
X_train_scaled = StandardScaler().fit_transform(X_train.astype(np.float32))
X_test_scaled = StandardScaler().fit_transform(X_test.astype(np.float32))
trest = SVC().fit(X_train_scaled[:10000],y_train[:10000])
y_pred = trest.predict(X_test_scaled)
print(accuracy_score(y_test,y_pred))

0.9386


In [8]:
trest2 = SVC().fit(X_train[:10000],y_train[:10000])
y_pred = trest2.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.9594


# Use RandomizedCV top find good hyperparameters
* use on very small test dataset

* RBF kernel on 1/6 of training set outperforms Linear on the whole training set


In [11]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(SVC(), param_distributions, n_iter=30, verbose=2, cv=3)
rnd_search_cv.fit(X_train_scaled[:1000], y_train[:1000])

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] END ...C=10.015313578091579, gamma=0.006877228971726138; total time=   0.2s
[CV] END ...C=10.015313578091579, gamma=0.006877228971726138; total time=   0.1s
[CV] END ...C=10.015313578091579, gamma=0.006877228971726138; total time=   0.2s
[CV] END .....C=2.652221285239065, gamma=0.04084075367801679; total time=   0.2s
[CV] END .....C=2.652221285239065, gamma=0.04084075367801679; total time=   0.2s
[CV] END .....C=2.652221285239065, gamma=0.04084075367801679; total time=   0.2s
[CV] END .....C=9.605222143937755, gamma=0.05440234752698698; total time=   0.2s
[CV] END .....C=9.605222143937755, gamma=0.05440234752698698; total time=   0.2s
[CV] END .....C=9.605222143937755, gamma=0.05440234752698698; total time=   0.2s
[CV] END ....C=1.406384150661232, gamma=0.005518825731541187; total time=   0.1s
[CV] END ....C=1.406384150661232, gamma=0.005518825731541187; total time=   0.1s
[CV] END ....C=1.406384150661232, gamma=0.005518

RandomizedSearchCV(cv=3, estimator=SVC(), n_iter=30,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019F992E85E0>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019F992C1B50>},
                   verbose=2)

In [14]:
print(rnd_search_cv.best_params_)
y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

{'C': 8.286625146463873, 'gamma': 0.0013982502855165206}


0.8793

Look at test performance on different fractions of the training set
* Increasing level from 1/6 to whole of training set increases test set performance

In [15]:
rnd_search_cv.best_estimator_.fit(X_train_scaled[:10000], y_train[:10000])
y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

0.9464

In [16]:
#And on the 20k set
rnd_search_cv.best_estimator_.fit(X_train_scaled[:20000], y_train[:20000])
y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

0.9607

In [17]:
rnd_search_cv.best_estimator_.fit(X_train_scaled[:30000], y_train[:30000])
y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

0.9648

In [18]:
rnd_search_cv.best_estimator_.fit(X_train_scaled[:40000], y_train[:40000])
y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

0.9677

In [19]:
rnd_search_cv.best_estimator_.fit(X_train_scaled, y_train)
y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

0.972

In [20]:
y_pred_train = rnd_search_cv.best_estimator_.predict(X_train_scaled)
accuracy_score(y_train, y_pred_train)

0.9991

* 97.2% on test set, 99.9% on training set so still overfitting
* cannot run SVC on 2d array

Try fitting on 28x28 rather than 784 linear:

In [4]:
X_2d = X.reshape(70000,28,28)
X_2d_train, X_2d_test = X_2d[:60000], X_2d[60000:]


In [6]:
from sklearn.svm import SVC #uses OVR by default
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


# pl_svc = Pipeline ([
#     ("scaler",StandardScaler()),
#     ("svc", SVC())
# ]) #ovr and RBF by default

pl_svc = Pipeline ([
    ("svc", SVC())
]) #ovr and RBF by default

pl_svc.fit(X_2d_train[:10000],y_train[:10000])

y_pred_2d = pl_svc.predict(X_2d_test)
print(accuracy_score(y_test,y_pred_2d))


ValueError: Found array with dim 3. Estimator expected <= 2.