# Voting Classifier

In [52]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

import numpy as np

RANDOM_STATE = 42

Exercise: _Load the MNIST data and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing)._

In [2]:
X_mnist, y_mnist = fetch_openml(
    "mnist_784", return_X_y=True, as_frame=False, parser="auto"
)

In [3]:
len(X_mnist)

70000

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_mnist, y_mnist, test_size=10000, random_state=RANDOM_STATE)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=10000, random_state=RANDOM_STATE)
print(len(X_train))
print(len(X_val))
print(len(X_test))

50000
10000
10000


Exercise: _Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM._

In [5]:
def fit_evaluate_val(clf):
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_val)
    scores = accuracy_score(y_val, predictions)
    print(f"{clf} score is: {scores}")

def fit_evaluate_test(clf):
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    scores = accuracy_score(y_test, predictions)
    print(f"{clf} score is: {scores}")

In [6]:
rf_clf = RandomForestClassifier(random_state=RANDOM_STATE)
fit_evaluate_val(rf_clf)

RandomForestClassifier(random_state=42) score is: 0.9692


In [7]:
et_clf = ExtraTreesClassifier(random_state=RANDOM_STATE)
fit_evaluate_val(et_clf)

ExtraTreesClassifier(random_state=42) score is: 0.9715


In [12]:
# SVC is very slow on that much of data and LinearSVC doesn't provide probability for soft voting
svm_clf = SVC(
    kernel="linear", max_iter=10, probability=True, random_state=RANDOM_STATE
)
fit_evaluate_val(svm_clf)



SVC(kernel='linear', max_iter=10, probability=True, random_state=42) score is: 0.5618


In [15]:
knn_clf = KNeighborsClassifier()
fit_evaluate_test(knn_clf)

KNeighborsClassifier() score is: 0.9672


In [55]:
estimators = [("Random Forest", rf_clf), ("Extra-Tree", et_clf), ("KNN", knn_clf)]

Exercise: _Next, try to combine \[the classifiers\] into an ensemble that outperforms them all on the validation set, using a soft or hard voting classifier._

In [16]:
hard_voting_clf = VotingClassifier(estimators = estimators, voting="hard")
fit_evaluate_val(hard_voting_clf)

VotingClassifier(estimators=[('Random Forest',
                              RandomForestClassifier(random_state=42)),
                             ('Extra-Tree',
                              ExtraTreesClassifier(random_state=42)),
                             ('KNN', KNeighborsClassifier())]) score is: 0.9738


In [None]:
soft_voting_clf = VotingClassifier(estimators = estimators, voting="soft")
fit_evaluate_val(soft_voting_clf)

VotingClassifier(estimators=[('Random Forest',
                              RandomForestClassifier(random_state=42)),
                             ('Extra-Tree',
                              ExtraTreesClassifier(random_state=42)),
                             ('KNN', KNeighborsClassifier())],
                 voting='soft') score is: 0.9757


_Once you have found \[an ensemble that performs better than the individual predictors\], try it on the test set. How much better does it perform compared to the individual classifiers?_

In [20]:
fit_evaluate_test(rf_clf)
fit_evaluate_test(et_clf)
fit_evaluate_test(knn_clf)

fit_evaluate_test(soft_voting_clf)

RandomForestClassifier(random_state=42) score is: 0.9645
ExtraTreesClassifier(random_state=42) score is: 0.9691
KNeighborsClassifier() score is: 0.9672
VotingClassifier(estimators=[('Random Forest',
                              RandomForestClassifier(random_state=42)),
                             ('Extra-Tree',
                              ExtraTreesClassifier(random_state=42)),
                             ('KNN', KNeighborsClassifier())],
                 voting='soft') score is: 0.9726


Exercise: _Run the individual classifiers from the previous exercise to make predictions on the validation set, and create a new training set with the resulting predictions: each training instance is a vector containing the set of predictions from all your classifiers for an image, and the target is the image's class. Train a classifier on this new training set._

In [46]:
def create_blender_training_set(X):
    rf_clf_val_predictions = rf_clf.predict(X)
    et_clf_val_predictions = et_clf.predict(X)
    knn_clf_val_predictions = knn_clf.predict(X)

    predictions_training_set = []

    predictions_training_set.append(rf_clf_val_predictions)
    predictions_training_set.append(et_clf_val_predictions)
    predictions_training_set.append(knn_clf_val_predictions)

    return np.array(predictions_training_set).T

In [47]:
X_stacking_val = create_blender_training_set(X_val)
print(X_stacking_val.shape)

(10000, 3)


In [37]:
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True,
                                            random_state=42)
rnd_forest_blender.fit(X_stacking_val, y_val)

In [38]:
rnd_forest_blender.oob_score_

0.9734

In [40]:
rnd_forest_blender_score = cross_val_score(rnd_forest_blender, X_stacking_val, y_val, cv=3, n_jobs=-1)
rnd_forest_blender_score.mean()

0.9729998794720505

In [44]:
mlp_blender = MLPClassifier(max_iter=1000, random_state=42)
mlp_blender.fit(X_stacking_val, y_val)

In [45]:
mlp_blender_score = cross_val_score(mlp_blender, X_stacking_val, y_val, cv=3, n_jobs=-1)
mlp_blender_score.mean()

0.9706000294090597

Exercise: _Congratulations, you have just trained a blender, and together with the classifiers they form a stacking ensemble! Now let's evaluate the ensemble on the test set. For each image in the test set, make predictions with all your classifiers, then feed the predictions to the blender to get the ensemble's predictions. How does it compare to the voting classifier you trained earlier?_

In [48]:
X_stacking_test = create_blender_training_set(X_test)

In [49]:
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True,
                                            random_state=42)
rnd_forest_blender.fit(X_stacking_test, y_test)

In [50]:
rnd_forest_blender.oob_score_

0.9687

In [51]:
rnd_forest_blender_score = cross_val_score(rnd_forest_blender, X_stacking_test, y_test, cv=3, n_jobs=-1)
rnd_forest_blender_score.mean()

0.9693002593600691

Exercise: _Now try again using a `StackingClassifier` instead: do you get better performance? If so, why?_

In [56]:
stacking_classifier = StackingClassifier(
    estimators,
    final_estimator=RandomForestClassifier(
        n_estimators=200, oob_score=True, random_state=42
    ),
)

In [57]:
stacking_classifier.fit(X_train, y_train).score(X_test, y_test)

0.977