In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the "mnist_784" dataset from OpenML.
# as_frame=True is requested explicitly because the train/test split further
# down relies on `.iloc`, which only exists on pandas objects; older
# scikit-learn releases return NumPy arrays by default.
mnist = fetch_openml("mnist_784", version=1, as_frame=True)

In [3]:
# Pull the feature table and the label column out of the fetched dataset.
X, y = mnist["data"], mnist["target"]

In [4]:
# Cast the labels to unsigned 8-bit integers so they can be used as numeric classes.
y = y.astype(np.uint8) # the labels arrive as strings; convert them to integers

In [5]:
# Positional split: the first 60,000 rows are the training set, the rest the test set.
X_train, X_test = X.iloc[:60000], X.iloc[60000:]
y_train, y_test = y.iloc[:60000], y.iloc[60000:]

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling statistics on the training split only,
# then apply them to that same split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [10]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Baseline: linear-kernel SVM fitted on the full standardized training set.
svm_clf = SVC(kernel="linear", C=1.0)  # LinearSVC is an alternative for a linear kernel
svm_clf.fit(X_train_scaled, y_train)

# Training-set accuracy (fraction of training labels predicted correctly).
# accuracy_score replaces the hand-rolled Python sum() over a pandas boolean
# Series; it returns the same fraction.
y_pred = svm_clf.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

0.9821

In [13]:
# 3-fold cross-validated accuracy of the linear SVM on the standardized training set;
# compare with the training accuracy above to gauge overfitting.
cross_val_score(svm_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

array([0.9154, 0.9172, 0.9185])

##### Observations
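    - Cross-validation accuracy drops to roughly 91.5–91.9%, well below the 98.21% training accuracy, so the model is clearly overfitting.
    - To reduce overfitting we should lower the value of "C" (in SVC a smaller C means stronger regularization). A grid search is one way to find a suitable value.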

In [14]:
from sklearn.model_selection import GridSearchCV

# Try three randomly drawn values of C in [0.5, 1) for the linear-kernel SVM,
# scoring each candidate by 3-fold cross-validated accuracy on the full
# standardized training set.
# NOTE(review): no seed is set before this draw (unlike the later grid-search
# cells, which call np.random.seed(42)), so the candidates differ on every run.
svm_clf = SVC(kernel="linear")
param_grid = [{"C": np.random.uniform(low=0.5, high=1, size=3)}]
grid_cv = GridSearchCV(estimator=svm_clf, param_grid=param_grid, cv=3, scoring="accuracy")
grid_cv.fit(X_train_scaled, y_train)

GridSearchCV(cv=3, estimator=SVC(kernel='linear'),
             param_grid=[{'C': array([0.55252955, 0.51938365, 0.71316527])}],
             scoring='accuracy')

In [15]:
# Show the value of C that achieved the best cross-validated accuracy.
grid_cv.best_params_

{'C': 0.5193836540260357}

In [16]:
# Show the estimator configured with the best parameters found by the search.
grid_cv.best_estimator_

SVC(C=0.5193836540260357, kernel='linear')

##### Note:
    - Because of the high time complexity, we will use only a few instances for the grid searches and cross-validations below.

In [70]:
# Fixed seed so the same three candidates are drawn on every run.
np.random.seed(42)

# Candidate C values in [0, 0.5), evaluated by 3-fold cross-validated accuracy
# on the first 3,000 training instances only (to keep the search fast).
param_grid = [{"C": np.random.uniform(low=0, high=0.5, size=3)}]
grid_cv = GridSearchCV(estimator=svm_clf, param_grid=param_grid, cv=3, scoring="accuracy")
grid_cv.fit(X_train_scaled[:3000], y_train.iloc[:3000])

GridSearchCV(cv=3, estimator=SVC(kernel='linear'),
             param_grid=[{'C': array([0.18727006, 0.47535715, 0.36599697])}],
             scoring='accuracy')

In [71]:
# Show the value of C that scored best on the 3,000-instance search.
grid_cv.best_params_

{'C': 0.18727005942368125}

In [72]:
# Show the estimator configured with the best C from the 3,000-instance search.
grid_cv.best_estimator_

SVC(C=0.18727005942368125, kernel='linear')

#### Observations
    - With the linear kernel the grid search picks the lowest value of C in the grid, i.e. the strongest regularization on offer (in SVC a smaller C means a stronger penalty). As the candidate range approaches zero, GridSearch keeps preferring the smaller value.

In [76]:
# Fixed seed so the same three candidates are drawn on every run.
np.random.seed(42)

# Narrow the search to C values in [0, 0.1), evaluated by 3-fold
# cross-validated accuracy on the first 5,000 training instances.
param_grid = [{"C": np.random.uniform(low=0, high=0.1, size=3)}]
grid_cv = GridSearchCV(estimator=svm_clf, param_grid=param_grid, cv=3, scoring="accuracy")
grid_cv.fit(X_train_scaled[:5000], y_train.iloc[:5000])

GridSearchCV(cv=3, estimator=SVC(kernel='linear'),
             param_grid=[{'C': array([0.03745401, 0.09507143, 0.07319939])}],
             scoring='accuracy')

In [77]:
# Show the value of C that scored best on the 5,000-instance search.
grid_cv.best_params_

{'C': 0.03745401188473625}

In [78]:
# Show the estimator configured with the best C from the 5,000-instance search.
grid_cv.best_estimator_

SVC(C=0.03745401188473625, kernel='linear')

In [79]:
# Take the best linear SVM found by the last grid search and measure its
# 3-fold cross-validated accuracy on the first 10,000 training instances.
best_svm_model = grid_cv.best_estimator_
cross_val_score(
    best_svm_model,
    X_train_scaled[:10000],
    y_train.iloc[:10000],
    cv=3,
    scoring="accuracy",
)

array([0.92111578, 0.92439244, 0.90669067])

##### Observations
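    - Reducing the regularization parameter C raises the cross-validation accuracy by up to roughly 1% (two of the three folds reach about 92%). Note that these scores come from a 10,000-instance subset, so they are not directly comparable with the full-data run.
    - A further grid search over "C" could improve the accuracy a little more.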

In [82]:
# RBF-kernel SVM (rbf is SVC's default kernel), fitted on the first 10,000
# training instances only.
svm_clf = SVC(kernel="rbf", gamma="auto")
svm_clf.fit(X_train_scaled[:10000], y_train.iloc[:10000])

# Accuracy over the whole training set, which includes the 10,000 fitted instances.
y_pred = svm_clf.predict(X_train_scaled)
accuracy_score(y_true=y_train, y_pred=y_pred)

0.9449166666666666

##### Observations:
    Overall, the rbf kernel gives quite good accuracy even though only about one sixth of the data (10,000 of 60,000 instances) was used to train the model. Training on more instances should improve the accuracy further.
    
    Tuning the hyperparameters should give better accuracy than this.

In [107]:
# Fixed seed so the same candidates are drawn on every run.
np.random.seed(42)
svm_clf = SVC()  # rbf is the default kernel in SVC

# Five random C values in [1, 10) and five random gamma values in [0.001, 0.1);
# the grid search evaluates every (C, gamma) pair. C is drawn before gamma.
param_grid = [
    {
        "C": np.random.uniform(low=1, high=10, size=5),
        "gamma": np.random.uniform(low=0.001, high=0.1, size=5),
    }
]
grid_cv = GridSearchCV(estimator=svm_clf, param_grid=param_grid, cv=3, scoring="accuracy")
# Only the first 2,500 training instances are used to keep the search fast.
grid_cv.fit(X_train_scaled[:2500], y_train.iloc[:2500])

GridSearchCV(cv=3, estimator=SVC(),
             param_grid=[{'C': array([4.37086107, 9.55642876, 7.58794548, 6.38792636, 2.40416776]),
                          'gamma': array([0.01644346, 0.00675028, 0.08675144, 0.06051039, 0.07109919])}],
             scoring='accuracy')

In [108]:
# Show the RBF SVM configured with the best (C, gamma) pair from the coarse search.
grid_cv.best_estimator_

SVC(C=4.370861069626263, gamma=0.006750277604651747)

In [109]:
from sklearn.base import clone

# Refit the winning hyper-parameters on the first 10,000 training instances.
# clone() gives an unfitted copy with the same parameters, so the estimator
# stored inside grid_cv is not silently retrained on different data.
best_model = clone(grid_cv.best_estimator_)
best_model.fit(X_train_scaled[:10000], y_train.iloc[:10000])

# Accuracy over the whole training set, which includes the 10,000 fitted instances.
y_pred = best_model.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

0.8819

A finer grid search

In [111]:
# Fixed seed so the same candidates are drawn on every run.
np.random.seed(42)
svm_clf = SVC()  # rbf is the default kernel in SVC

# Finer search: five random C values in [1, 5) and five random gamma values
# in [0.001, 0.01); every (C, gamma) pair is evaluated. C is drawn before gamma.
param_grid = [
    {
        "C": np.random.uniform(low=1, high=5, size=5),
        "gamma": np.random.uniform(low=0.001, high=0.01, size=5),
    }
]
grid_cv = GridSearchCV(estimator=svm_clf, param_grid=param_grid, cv=3, scoring="accuracy")
# Only the first 2,500 training instances are used to keep the search fast.
grid_cv.fit(X_train_scaled[:2500], y_train.iloc[:2500])

GridSearchCV(cv=3, estimator=SVC(),
             param_grid=[{'C': array([2.49816048, 4.80285723, 3.92797577, 3.39463394, 1.62407456]),
                          'gamma': array([0.00240395, 0.00152275, 0.00879559, 0.00641004, 0.00737265])}],
             scoring='accuracy')

In [112]:
# Show the RBF SVM configured with the best (C, gamma) pair from the finer search.
grid_cv.best_estimator_

SVC(C=2.49816047538945, gamma=0.0015227525095137954)

In [113]:
from sklearn.base import clone

# Refit the winning hyper-parameters on the first 10,000 training instances.
# clone() gives an unfitted copy with the same parameters, so the estimator
# stored inside grid_cv is not silently retrained on different data.
best_model = clone(grid_cv.best_estimator_)
best_model.fit(X_train_scaled[:10000], y_train.iloc[:10000])

# Accuracy over the whole training set, which includes the 10,000 fitted instances.
y_pred = best_model.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

0.9528833333333333

##### Observations
    The model is trained on only about one sixth of the data (10,000 of 60,000 instances); training it on more instances should yield higher accuracy.