# Ensemble learning on the MNIST dataset

In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Fetch the "mnist_784" dataset (version 1) from OpenML as an (X, y) pair.
X, y = fetch_openml(
    "mnist_784",
    version=1,
    return_X_y=True,
    as_frame=False,
)

# Hold out 10,000 samples as the final test set; the seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=10000,
    random_state=10,
)
print(x_train.shape, x_test.shape, sep="\n")

(60000, 784)
(10000, 784)


In [2]:
# Carve a 10,000-sample validation set out of the current training data.
# NOTE: x_train / y_train are both the input and the output of this split, so
# re-running this cell on its own shrinks the training set again.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=10000,
    random_state=10,
)
print(x_train.shape, x_val.shape, sep="\n")

(50000, 784)
(10000, 784)


## Train classifiers

In [3]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC

# Base classifiers, all seeded with the same random_state for reproducibility.
forest = RandomForestClassifier(random_state=10, n_estimators=100)
extra_trees = ExtraTreesClassifier(random_state=10, n_estimators=100)
# NOTE(review): LinearSVC is fit on the features exactly as loaded, with
# max_iter=100 -- check whether it emits a convergence warning (TODO confirm).
svc = LinearSVC(random_state=10, max_iter=100)

# The list order is reused later as the column order of the stacking features.
estimators = [forest, svc, extra_trees]
for clf in estimators:
    print('Training ', clf)
    clf.fit(x_train, y_train)

Training  RandomForestClassifier(random_state=10)
Training  LinearSVC(max_iter=100, random_state=10)




Training  ExtraTreesClassifier(random_state=10)


In [4]:
# Report each fitted base estimator's score on the held-out validation set.
for model in estimators:
    val_score = model.score(x_val, y_val)
    print('Score for ', model, ": ", val_score)

Score for  RandomForestClassifier(random_state=10) :  0.9692
Score for  LinearSVC(max_iter=100, random_state=10) :  0.8746
Score for  ExtraTreesClassifier(random_state=10) :  0.9738


## Create an ensemble

In [5]:
from sklearn.ensemble import VotingClassifier

# Pair every base classifier with the label VotingClassifier will know it by.
name_estimators = list(zip(
    ('random_forest', 'linear_svc', 'extra_trees'),
    (forest, svc, extra_trees),
))

# voting='hard' combines predicted class labels (see the scikit-learn
# VotingClassifier docs); n_jobs=-1 requests all available cores.
ensemble = VotingClassifier(
    voting='hard',
    n_jobs=-1,
    estimators=name_estimators,
)
# Keep fit() as the last expression so the fitted ensemble is displayed.
ensemble.fit(x_train, y_train)

VotingClassifier(estimators=[('random_forest',
                              RandomForestClassifier(random_state=10)),
                             ('linear_svc',
                              LinearSVC(max_iter=100, random_state=10)),
                             ('extra_trees',
                              ExtraTreesClassifier(random_state=10))],
                 n_jobs=-1)

In [6]:
# Score of the voting ensemble on the validation set, for comparison with the
# individual estimators' validation scores.
ensemble.score(X=x_val, y=y_val)

0.9695

## Perform stacking

In [8]:
import numpy as np

# Stacking training set: one row per validation sample, one column per base
# estimator, each cell holding that estimator's predicted label.
x_stacking = np.empty((len(x_val), len(estimators)))
for col, model in enumerate(estimators):
    # Assigning into the float array stores the predicted labels as floats.
    x_stacking[:, col] = model.predict(x_val)

# The blender's targets are the true validation labels (copied so that y_val
# itself is never modified through this name).
y_stacking = y_val.copy()

x_stacking

array([[1., 1., 1.],
       [4., 4., 4.],
       [6., 6., 6.],
       ...,
       [6., 6., 6.],
       [2., 2., 2.],
       [8., 8., 8.]])

In [10]:
from sklearn.svm import SVC

# Fit the blender (an SVC with default settings) on the base estimators'
# validation-set predictions.
blender = SVC().fit(x_stacking, y_stacking)

# NOTE: this score is computed on the same data the blender was just fit on,
# so it is a training score, not a held-out estimate.
blender.score(x_stacking, y_stacking)

0.9697



## Evaluate the blender on the test set

In [11]:
# Build the blender's input for the test set: one column of predicted labels
# per base estimator, in the same order as `estimators`.
x_test_blender = np.empty((len(x_test), len(estimators)))
for col, model in enumerate(estimators):
    x_test_blender[:, col] = model.predict(x_test)

# Score of the stacked model (base estimators + blender) on the test set.
blender.score(x_test_blender, y_test)

0.964