In [33]:
from sklearn.datasets import fetch_mldata 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score

In [2]:
# Load the MNIST digits dataset.
#
# fetch_mldata depends on the mldata.org service, which is no longer online,
# and the function itself was removed in scikit-learn 0.22. Prefer
# fetch_openml (available from scikit-learn 0.20) and only fall back to the
# legacy loader on older installs, where a locally cached copy may still exist.
# NOTE(review): on scikit-learn >= 0.22 the top-of-notebook
# `from sklearn.datasets import fetch_mldata` import must be dropped as well.
try:
    from sklearn.datasets import fetch_openml
except ImportError:
    mnist = fetch_mldata('MNIST original')
else:
    mnist = fetch_openml('mnist_784', version=1)
    # Normalise to what the rest of the notebook expects from the old loader:
    # plain NumPy arrays with numeric labels. OpenML serves the labels as
    # strings, and newer scikit-learn releases may hand back pandas objects.
    mnist["data"] = np.asarray(mnist["data"])
    mnist["target"] = np.asarray(mnist["target"]).astype(np.float64)

In [3]:
# Pull the feature matrix and the label vector out of the loaded dataset.
X = mnist["data"]
y = mnist["target"]

In [4]:
# Hold out 10,000 samples as the final test set, then carve another 10,000 out
# of the remainder for validation; whatever is left is the training set.
# Both splits are seeded so the partition is repeatable.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=10000, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42
)

In [5]:
# Three base learners for the ensemble, all seeded with the same random_state
# so runs are repeatable. The two tree ensembles are built in parallel
# (n_jobs=-1).
svm_clf = LinearSVC(random_state=42)
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
etc_clf = ExtraTreesClassifier(n_jobs=-1, random_state=42)

In [6]:
# Base learners in a fixed order: column i of the prediction matrices built
# further down corresponds to estimators[i].
estimators = [
    svm_clf,
    rf_clf,
    etc_clf,
]

In [7]:
# Fit every base learner on the training split, announcing each one first so
# progress is visible while the fits run.
for clf in estimators:
    print("Training the", clf)
    clf.fit(X_train, y_train)

Training the LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)
Training the RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
Training the ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           o

In [15]:
# One row per validation sample, one column per base learner. np.empty leaves
# the contents uninitialised; the values are written by a later cell.
X_val_pred = np.empty(shape=(len(X_val), len(estimators)), dtype=np.float32)

In [16]:
# Sanity check: (number of validation samples, number of base learners).
X_val_pred.shape

(10000, 3)

In [17]:
# Column i receives the class predictions of estimators[i] on the validation
# set. Writing through [:] fills the preallocated float32 array in place.
X_val_pred[:] = np.column_stack([clf.predict(X_val) for clf in estimators])

In [18]:
# Peek at the stacked predictions: one row per sample, one column per base learner.
X_val_pred

array([[2., 2., 2.],
       [7., 7., 7.],
       [4., 4., 4.],
       ...,
       [4., 4., 4.],
       [9., 9., 9.],
       [4., 4., 4.]], dtype=float32)

In [30]:
# Second-level model (the "blender"): a random forest that is trained on the
# base learners' predictions. oob_score=True requests an out-of-bag accuracy
# estimate once the forest has been fitted.
random_forest_blender = RandomForestClassifier(
    n_estimators=200,
    max_leaf_nodes=20,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

In [31]:
# Train the blender on the base learners' validation-set predictions, using the
# true validation labels as targets.
random_forest_blender.fit(X_val_pred, y_val)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=20,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [32]:
# Out-of-bag accuracy estimate of the blender (available because it was
# constructed with oob_score=True).
random_forest_blender.oob_score_

0.9558

In [34]:
# One row per test sample, one column per base learner. np.empty leaves the
# contents uninitialised; the values are written by the next cell.
X_test_pred = np.empty(shape=(len(X_test), len(estimators)), dtype=np.float32)

In [35]:
# Column i receives the class predictions of estimators[i] on the test set.
# Writing through [:] fills the preallocated float32 array in place.
X_test_pred[:] = np.column_stack([clf.predict(X_test) for clf in estimators])

In [38]:
# Blend the base learners' test-set predictions into final class predictions.
y_pred = random_forest_blender.predict(X_test_pred)

In [39]:
# Fraction of test samples whose predicted label in y_pred matches the true label.
accuracy_score(y_test, y_pred)

0.9492

In [40]:
from sklearn.model_selection import GridSearchCV

In [45]:
# Exhaustive grid for the blender: 50 x 30 x 28 = 42,000 parameter
# combinations, each evaluated with 5-fold cross-validation (210,000
# cross-validation fits), so expect a very long run.
param_grid = {
    'n_estimators': list(range(200, 250)),
    'max_leaf_nodes': list(range(20, 50)),
    'min_samples_split': list(range(2, 30)),
}

grid_search = GridSearchCV(
    estimator=random_forest_blender,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

In [46]:
# Run the search on the same blender inputs used above: the base learners'
# validation-set predictions and the true validation labels.
grid_search.fit(X_val_pred, y_val)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=20,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=True, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249], 'max_leaf_nodes': [2..., 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       s

In [47]:
# Parameter combination selected by the grid search.
grid_search.best_params_

{'max_leaf_nodes': 26, 'min_samples_split': 28, 'n_estimators': 216}

In [48]:
# The blender re-fitted with the selected parameters (the search ran with refit=True).
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=26,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=28,
            min_weight_fraction_leaf=0.0, n_estimators=216, n_jobs=-1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [50]:
# Predict on the test set with the tuned blender. With refit=True the search
# object forwards predict() to its best_estimator_.
y_pred = grid_search.predict(X_test_pred)

In [51]:
# Test-set accuracy of the tuned blender's predictions held in y_pred.
accuracy_score(y_test, y_pred)

0.9499