In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml("mnist_784" ,version=1)

In [3]:
X = mnist["data"]
y = mnist["target"]

In [4]:
y = y.astype(np.uint8) # converting string labels to integers

In [8]:
X_train, X_valid, X_test, y_train, y_valid, y_test = X.iloc[:50000], X.iloc[50000:60000], X.iloc[60000:], y.iloc[:50000], y.iloc[50000:60000], y.iloc[60000:]

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score

param_grid = [{"n_estimators": [100, 200, 300], 
              "max_depth": [10, 20, 30]}]

rf_clf = RandomForestClassifier(n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid, cv=3, scoring="accuracy", verbose=2)
grid_cv.fit(X_train[:5000], y_train[:5000])

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] END .....................max_depth=10, n_estimators=100; total time=   2.3s
[CV] END .....................max_depth=10, n_estimators=100; total time=   1.2s
[CV] END .....................max_depth=10, n_estimators=100; total time=   1.0s
[CV] END .....................max_depth=10, n_estimators=200; total time=   1.6s
[CV] END .....................max_depth=10, n_estimators=200; total time=   1.6s
[CV] END .....................max_depth=10, n_estimators=200; total time=   1.6s
[CV] END .....................max_depth=10, n_estimators=300; total time=   2.3s
[CV] END .....................max_depth=10, n_estimators=300; total time=   2.2s
[CV] END .....................max_depth=10, n_estimators=300; total time=   2.3s
[CV] END .....................max_depth=20, n_estimators=100; total time=   0.9s
[CV] END .....................max_depth=20, n_estimators=100; total time=   1.0s
[CV] END .....................max_depth=20, n_est

GridSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1),
             param_grid=[{'max_depth': [10, 20, 30],
                          'n_estimators': [100, 200, 300]}],
             scoring='accuracy', verbose=2)

In [16]:
grid_cv.best_estimator_

RandomForestClassifier(max_depth=30, n_estimators=300, n_jobs=-1)

In [18]:
final_model = grid_cv.best_estimator_
cross_val_score(final_model, X_train, y_train, cv=3, scoring="accuracy", verbose=2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  34.7s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   34.8s remaining:    0.0s


[CV] END .................................................... total time=  34.4s
[CV] END .................................................... total time=  34.5s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.7min finished


array([0.9649607 , 0.96298074, 0.9648986 ])

In [19]:
from sklearn.ensemble import ExtraTreesClassifier

et_clf = ExtraTreesClassifier(n_estimators=300, max_depth=30)
cross_val_score(et_clf, X_train, y_train, cv=3, scoring="accuracy", verbose=2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time= 1.1min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV] END .................................................... total time= 1.1min
[CV] END .................................................... total time= 1.1min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.4min finished


array([0.9698806 , 0.96640067, 0.96639866])

**Note** We use svm grid search model from previously train on mnist

In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

In [25]:
from sklearn.svm import SVC

svc_clf = SVC(C=2.49816047538945, gamma=0.0015227525095137954, probability=True)
svc_clf.fit(X_train_scaled, y_train)
y_pred = svc_clf.predict(X_valid_scaled)
accuracy_score(y_valid, y_pred)

0.9742

In [24]:
from sklearn.ensemble import VotingClassifier

estimators = [("rf", rf_clf), ("et", et_clf), ("svc", svc_clf)]
vtg_clf = VotingClassifier(estimators=estimators, voting="hard")
vtg_clf.fit(X_train_scaled, y_train)
y_pred = vtg_clf.predict(X_valid_scaled)
accuracy_score(y_valid, y_pred)

0.9776

In [None]:
from sklearn.ensemble import VotingClassifier

estimators = [("rf", rf_clf), ("et", et_clf), ("svc", svc_clf)]
vtg_clf_soft = VotingClassifier(estimators=estimators, voting="soft")
vtg_clf_soft.fit(X_train_scaled, y_train)
y_pred = vtg_clf_soft.predict(X_valid_scaled)
accuracy_score(y_valid, y_pred)