In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_openml
# Request pandas containers explicitly: the split further down indexes the data
# and target with .iloc, so they must be a DataFrame / Series and not NumPy arrays.
mnist = fetch_openml("mnist_784", version=1, as_frame=True)

In [3]:
# Pull the feature table and the label column out of the fetched dataset bundle.
X, y = mnist["data"], mnist["target"]

In [4]:
# The labels arrive as strings; cast them to unsigned 8-bit integers.
y = y.astype(dtype=np.uint8)

In [5]:
# Positional split: rows [0, 50000) for training, [50000, 60000) for validation,
# and everything from row 60000 onward for testing.
X_train, y_train = X.iloc[:50000], y.iloc[:50000]
X_valid, y_valid = X.iloc[50000:60000], y.iloc[50000:60000]
X_test, y_test = X.iloc[60000:], y.iloc[60000:]

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to the training, validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_valid_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_valid, X_test)
)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score

In [8]:
# Random forest baseline. The seed is fixed so that a Restart & Run All rebuilds
# the same forest and reports the same validation accuracy.
rf_clf = RandomForestClassifier(max_depth=30, n_estimators=300, n_jobs=-1, random_state=42)
rf_clf.fit(X_train_scaled, y_train)
y_pred = rf_clf.predict(X_valid_scaled)
accuracy_score(y_valid, y_pred)  # accuracy on the validation split

0.9744

In [9]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees baseline with the same size settings as the random forest. The seed
# is fixed so that the fitted model and its validation accuracy are repeatable.
et_clf = ExtraTreesClassifier(n_estimators=300, max_depth=30, n_jobs=-1, random_state=42)
et_clf.fit(X_train_scaled, y_train)
y_pred = et_clf.predict(X_valid_scaled)
accuracy_score(y_valid, y_pred)  # accuracy on the validation split

0.9756

**Note:** We reuse the SVM hyperparameters found by a grid search previously run on MNIST.

In [10]:
from sklearn.svm import SVC

# RBF-kernel SVC (the default kernel). probability=True is required later for
# soft voting; the probability estimates involve internal data shuffling, so the
# seed is fixed to make them repeatable.
svc_clf = SVC(
    C=2.49816047538945,
    gamma=0.0015227525095137954,
    probability=True,
    random_state=42,
)
svc_clf.fit(X_train_scaled, y_train)
y_pred = svc_clf.predict(X_valid_scaled)
accuracy_score(y_valid, y_pred)  # accuracy on the validation split

0.9742

**Voting Classifier**

In [11]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over the three base classifiers, scored on validation.
estimators = [
    ("rf", rf_clf),
    ("et", et_clf),
    ("svc", svc_clf),
]
vtg_clf = VotingClassifier(estimators=estimators, voting="hard").fit(X_train_scaled, y_train)
y_pred = vtg_clf.predict(X_valid_scaled)
accuracy_score(y_true=y_valid, y_pred=y_pred)

0.9779

In [12]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the same three base classifiers, scored on validation.
estimators = [
    ("rf", rf_clf),
    ("et", et_clf),
    ("svc", svc_clf),
]
vtg_clf_soft = VotingClassifier(estimators=estimators, voting="soft").fit(X_train_scaled, y_train)
y_pred = vtg_clf_soft.predict(X_valid_scaled)
accuracy_score(y_true=y_valid, y_pred=y_pred)

0.982

In [13]:
# Score the soft-voting ensemble on the held-out test split.
y_pred = vtg_clf_soft.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9769

**Stacking and Blender**

In [14]:
# Fit the two tree ensembles on the training split before they are used as
# first-layer predictors for the blender.
# NOTE(review): both models were already fit on this same data in the cells that
# created them, so this refit looks redundant; confirm before removing.
rf_clf.fit(X=X_train_scaled, y=y_train)
et_clf.fit(X=X_train_scaled, y=y_train)

ExtraTreesClassifier(max_depth=30, n_estimators=300, n_jobs=-1)

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

class MakeBlender(BaseEstimator, TransformerMixin):
    """Turn samples into blender features: one column of predictions per base model.

    Parameters
    ----------
    estimators : sequence of fitted estimators, optional
        Models whose ``predict`` output becomes the columns of the transformed
        array, in the given order. When ``None`` (the default) the notebook-level
        ``rf_clf``, ``et_clf`` and ``svc_clf`` are used, which is the original
        behaviour of this class.

    Notes
    -----
    The estimators must already be fitted; ``fit`` on this transformer does not
    train them.
    NOTE(review): cloning this transformer with ``sklearn.base.clone`` presumably
    clones the estimators passed in and drops their fitted state; confirm before
    using it inside a pipeline or a search object.
    """

    def __init__(self, estimators=None):
        # Stored unmodified under the same name, as BaseEstimator.get_params expects.
        self.estimators = estimators

    def fit(self, X, y=None):
        """No-op: nothing is learned here. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an array with one column of predictions per base estimator."""
        # Resolve the defaults at call time so that MakeBlender() keeps reading
        # whatever the notebook globals currently point to.
        if self.estimators is None:
            base_models = (rf_clf, et_clf, svc_clf)
        else:
            base_models = self.estimators

        return np.column_stack([model.predict(X) for model in base_models])

In [16]:
# Build the blender's training features from the validation split.
blender = MakeBlender()
X_blender = blender.fit(X_valid_scaled).transform(X_valid_scaled)

Training a blender

In [17]:
from sklearn.svm import LinearSVC

# Train a linear SVM blender on the first 5000 validation rows.
# X_blender is a NumPy array while y_valid is a pandas Series that keeps the
# index of the full dataset, so the labels are selected by position with .iloc
# to stay aligned with the feature rows.
lsvc_clf = LinearSVC(loss="hinge", C=1)
lsvc_clf.fit(X_blender[:5000], y_valid.iloc[:5000])

LinearSVC(C=1, loss='hinge')

In [18]:
# Score the linear SVM blender on the validation rows it was not trained on.
# The labels are selected by position with .iloc so they line up with the rows
# of the NumPy feature array.
y_pred = lsvc_clf.predict(X_blender[5000:])
accuracy_score(y_valid.iloc[5000:], y_pred)

0.4046

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Train a shallow decision tree blender on the first 5000 validation rows.
# The labels are selected by position with .iloc so they line up with the rows
# of the NumPy feature array.
dt_clf = DecisionTreeClassifier(max_depth=5)
dt_clf.fit(X_blender[:5000], y_valid.iloc[:5000])

DecisionTreeClassifier(max_depth=5)

In [20]:
# Score the decision tree blender on the validation rows it was not trained on.
# The labels are selected by position with .iloc so they line up with the rows
# of the NumPy feature array.
y_pred = dt_clf.predict(X_blender[5000:])
accuracy_score(y_valid.iloc[5000:], y_pred)

0.9766

**Observations**: The decision tree outperforms LinearSVC because the blender's input features are predicted class labels, which are categorical rather than ordinal.

Performance Measure of Stacking

In [21]:
# Score the stacked model (base predictions -> decision tree blender) on the
# test split. The test features get their own name so that X_blender, which
# holds the validation features the blenders were trained on, is not overwritten;
# re-running a blender-training cell afterwards then still uses validation data.
X_blender_test = blender.transform(X_test_scaled)
y_pred = dt_clf.predict(X_blender_test)
accuracy_score(y_test, y_pred)

0.9692

This stacking ensemble does not perform as well as the voting classifier we trained earlier (0.9692 vs. 0.9769 test accuracy), nor even as well as the individual classifiers.