In [None]:
import matplotlib.pyplot as matplot
import seaborn as sb
import pandas as pd
import time
from sklearn.svm import SVC
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Fetch the OpenML dataset 'mnist_784' (version 1) as a (features, target) pair.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

# Cast the targets to integers and scale every feature by 1/255
# (presumably mapping 0-255 pixel intensities into [0, 1] -- confirm).
y = y.astype(int)
X = X / 255

In [None]:
# Two-stage split: hold out 10% of the data as the test set, then split the
# remaining 90% into train (75%) and validation (25%). Both splits use a fixed
# random_state so they are reproducible.
X_train_90, X_test, y_train_90, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_90, y_train_90, test_size=0.25, random_state=0
)

# Report the shape of the full dataset and of every split.
for split_name, split_X, split_y in (
    ('DATASET', X, y),
    ('TRAIN', X_train, y_train),
    ('VALIDATION', X_valid, y_valid),
    ('TEST', X_test, y_test),
):
    print(split_name)
    print(split_X.shape, split_y.shape)

## LINEAR SVC

First, we train a linear SVC with the default hyperparameters.

In [None]:
# Baseline: a linear-kernel SVC with every other hyperparameter left at its
# default, fitted on the training split and scored on the test split.
linear_SVC = SVC(kernel='linear').fit(X_train, y_train)  # fit() returns the estimator itself

y_pred = linear_SVC.predict(X_test)
score = accuracy_score(y_test, y_pred)
print('Accuracy = ', score)

In [None]:
# Confusion matrix of the most recent test-set predictions, drawn as an
# annotated heatmap (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred)

fig, ax = matplot.subplots(figsize=(10, 6))
sb.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
matplot.show()

Now we tune the hyperparameter C of the estimator on a held-out validation set, without cross-validation.

In [None]:
# Sweep the cost parameter C of a linear SVC. For each value the model is
# fitted on the training split and its accuracy is recorded on both the
# training split and the held-out validation split.
acc = []          # validation accuracy, one entry per C
acc_tr = []       # training accuracy, one entry per C
coefficient = []  # fitted coef_ attribute, one entry per C
for c in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]:
    print(c)  # progress marker: each fit can take a while
    svm = SVC(kernel='linear', C=c)
    svm.fit(X_train, y_train)
    coef = svm.coef_

    # accuracy_score is documented as accuracy_score(y_true, y_pred); pass the
    # ground truth first, as the other evaluation cells in this notebook do.
    predict_train = svm.predict(X_train)
    accuracy_train = accuracy_score(y_train, predict_train)

    predict_valid = svm.predict(X_valid)
    accuracy_valid = accuracy_score(y_valid, predict_valid)

    coefficient.append(coef)
    acc_tr.append(accuracy_train)
    acc.append(accuracy_valid)

In [None]:
# Plot training and validation accuracy against C on a logarithmic x-axis.
# `c` must list the same values, in the same order, as the sweep that filled
# `acc` and `acc_tr`.
c = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

fig, ax = matplot.subplots(figsize=(10, 5))
# '-D' = solid line with diamond markers. The colour is given exactly once per
# curve: combining a colour code in the format string ('g') with a conflicting
# color= keyword makes matplotlib warn that the colour is redundantly defined.
ax.semilogx(c, acc, '-D', color='red', label="Validation Accuracy")
ax.semilogx(c, acc_tr, '-gD', label="Training Accuracy")
ax.grid(True)
ax.set_xlabel("Cost Parameter C")
ax.set_ylabel("Accuracy")
ax.legend()
ax.set_title('Accuracy versus the Cost Parameter C (log-scale)')
matplot.show()

Now, with cross-validation.

In [None]:
# Select C for the linear SVC by 10-fold cross-validation.
# NOTE(review): the folds are drawn from the validation split (X_valid/y_valid),
# not from the training split -- confirm this is intended.
accuracies = []  # [mean CV accuracy, C] pairs
for c in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]:
    svm = SVC(kernel='linear', C=c)
    scores = cross_val_score(svm, X_valid, y_valid, cv=10, n_jobs=10, verbose=0)
    validation_accuracy = scores.mean()
    # '{:8g}' keeps small values readable; a fixed '.3f' would print C=0.0001 as 0.000.
    print('C: {:8g} - Validation accuracy: {:.3f}'.format(c, validation_accuracy))
    accuracies.append([validation_accuracy, c])

# Lists compare element-wise, so max() picks the highest accuracy and, on an
# exact tie, the larger C.
best_accuracy, best_c = max(accuracies)
print('Best C = ', best_c)

Finally, we take the best value of C found by cross-validation, fit the model on the training set, and evaluate it on the test set.

In [None]:
# Refit the linear SVC with the cross-validated C on the training split, then
# time and score its predictions on the test split.
cv_linear_svc = SVC(kernel='linear', C=best_c)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_train = time.perf_counter()
cv_linear_svc.fit(X_train, y_train)
time_train = time.perf_counter() - start_train

start_test = time.perf_counter()
y_pred = cv_linear_svc.predict(X_test)
time_test = time.perf_counter() - start_test

score = accuracy_score(y_test, y_pred)

# Durations are in seconds.
print('Training time: ', time_train)
print('Test time: ', time_test)
print('Test accuracy = ', score)

In [None]:
# Confusion matrix of the most recent test-set predictions, drawn as an
# annotated heatmap (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred)

fig, ax = matplot.subplots(figsize=(10, 6))
sb.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
matplot.show()

## POLYNOMIAL SVC (degree = 2)

In [None]:
# Grid search over C and gamma for a degree-2 polynomial SVC, scored by
# 10-fold cross-validation.
# NOTE(review): the folds are drawn from the validation split (X_valid/y_valid),
# not from the training split -- confirm this is intended.
records = []  # one dict per (C, gamma) combination
for c in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]:
    for gam in ['scale', 'auto']:
        svm = SVC(C=c, kernel='poly', degree=2, gamma=gam)
        # cv=10 produces only ten fit/score tasks, so more than ten workers
        # cannot speed this up.
        scores = cross_val_score(svm, X_valid, y_valid, cv=10, n_jobs=10, verbose=0)
        validation_accuracy = scores.mean()
        print(f'C: {c} - Gamma : {gam} - Validation accuracy: {validation_accuracy}')
        records.append(
            {'c': c, 'gamma': gam, 'validation_accuracy': validation_accuracy}
        )

# Build the results table once instead of growing it row by row.
df = pd.DataFrame(records, columns=['c', 'gamma', 'validation_accuracy'])
print(df)

In [None]:
# Fit a degree-2 polynomial SVC on the training split, then time and score its
# predictions on the test split.
# NOTE(review): C=10 and gamma='scale' are hard-coded, presumably read off the
# grid-search output -- confirm they still match after any re-run.
poly_svc = SVC(C=10, kernel='poly', degree=2, gamma='scale')

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_train = time.perf_counter()
poly_svc.fit(X_train, y_train)
time_train = time.perf_counter() - start_train

start_test = time.perf_counter()
y_pred = poly_svc.predict(X_test)
time_test = time.perf_counter() - start_test

score = accuracy_score(y_test, y_pred)
# Durations are in seconds.
print('Train time: ', time_train)
print('Test time: ', time_test)
print('Test accuracy = ', score)

In [None]:
# Confusion matrix of the most recent test-set predictions, drawn as an
# annotated heatmap (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred)

fig, ax = matplot.subplots(figsize=(10, 6))
sb.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
matplot.show()

## RBF SVC

In [None]:
# Grid search over C and gamma for an RBF-kernel SVC, scored by 10-fold
# cross-validation.
# NOTE(review): the folds are drawn from the validation split (X_valid/y_valid),
# not from the training split -- confirm this is intended.
for c in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]:
    for gam in ['scale', 'auto']:
        # No `degree` argument: it only affects the 'poly' kernel and is
        # ignored by 'rbf'.
        svm = SVC(C=c, kernel='rbf', gamma=gam)
        # cv=10 produces only ten fit/score tasks, so more than ten workers
        # cannot speed this up.
        scores = cross_val_score(svm, X_valid, y_valid, cv=10, n_jobs=10, verbose=0)
        validation_accuracy = scores.mean()
        print(f'C: {c} - Gamma : {gam} - Validation accuracy: {validation_accuracy}')