Load the MNIST dataset using fetch_openml(mnist_784) and split it into:
  - training set (50000)
  - validation set (10000)
  - test set (10000)

Create a stacking classifier that uses a k-nearest-neighbors (KNN) classifier, an SVM, and a Random Forest as layer 0,
and a logistic regression as layer 1.

Check the scores of the individual classifiers and the Stacking classifier.
Which one is best in this case?

In [1]:
from typing import Tuple, Union

from sklearn import clone
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.utils import Bunch

In [2]:
# Fetch MNIST from OpenML: `data` holds the feature table (784 columns per
# row) and `target` the corresponding labels.
b: Bunch = fetch_openml('mnist_784', parser="auto")
df = b.get("data")
y = b.get('target')

# Two-stage split: take 50,000 rows for training first, then divide the
# remaining hold-out rows into validation (10,000) and test (the rest).
# The intermediate hold-out gets its own names so that X_test / y_test only
# ever refer to the final test set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df, y, train_size=50000, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, train_size=10000, random_state=42
)

# Fail fast if the split sizes ever stop matching the task statement.
assert len(X_train) == len(y_train) == 50000
assert len(X_val) == len(y_val) == 10000
assert len(X_test) == len(y_test) == 10000

print(
    "X_train.shape:", X_train.shape,  # expected: (50000, 784)
    "\nX_test.shape:", X_test.shape,  # expected: (10000, 784)
    "\nX_val.shape:", X_val.shape,  # expected: (10000, 784)
)

X_train.shape: (50000, 784) 
X_test.shape: (10000, 784) 
X_val.shape: (10000, 784)


In [4]:
# Layer-0 candidates as (label, estimator) pairs. Later cells look entries up
# in this list, so keep both the labels and the order stable.
# verbose=1 makes the Random Forest and the SVM print progress while fitting.
BaseClassifier = Union[RandomForestClassifier, KNeighborsClassifier, SVC]

clfs: list[Tuple[str, BaseClassifier]] = [
    (
        "Random Forest",
        RandomForestClassifier(n_estimators=100, random_state=42, verbose=1),
    ),
    (
        "KNN",
        KNeighborsClassifier(n_neighbors=5),
    ),
    (
        "SVM",
        SVC(random_state=42, verbose=1),
    ),
]

# Training

In [10]:
%%time
# Train the Random Forest entry of `clfs` on the training split.
# Looked up by label rather than by list position.
dict(clfs)["Random Forest"].fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CPU times: total: 37.4 s
Wall time: 37.8 s


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   37.6s finished


In [11]:
%%time
# Train the KNN entry of `clfs` on the training split.
# Looked up by label rather than by list position.
dict(clfs)["KNN"].fit(X_train, y_train)

CPU times: total: 172 ms
Wall time: 170 ms


In [12]:
%%time
# Train the SVM entry of `clfs` on the training split.
# Looked up by label rather than by list position.
dict(clfs)["SVM"].fit(X_train, y_train)

[LibSVM]CPU times: total: 2min 24s
Wall time: 2min 35s


# Validation scores

In [13]:
%%time
# Mean accuracy of the fitted Random Forest on the validation split.
dict(clfs)["Random Forest"].score(X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CPU times: total: 391 ms
Wall time: 446 ms


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


0.9677

In [14]:
%%time
# Mean accuracy of the fitted KNN classifier on the validation split.
dict(clfs)["KNN"].score(X_val, y_val)

CPU times: total: 44.6 s
Wall time: 18.9 s


0.9691

In [15]:
%%time
# Mean accuracy of the fitted SVM on the validation split.
dict(clfs)["SVM"].score(X_val, y_val)

CPU times: total: 1min 15s
Wall time: 1min 20s


0.9764

# Stacking Classifier

## Creation

In [16]:
# Layer 0: one clone per entry of `clfs`, so the stack works on copies with the
# same hyper-parameters instead of the estimator objects fitted above.
layer0_estimators = [(label, clone(estimator)) for label, estimator in clfs]

# Layer 1: a logistic regression combines the layer-0 outputs.
stacking_clf = StackingClassifier(
    estimators=layer0_estimators,
    final_estimator=LogisticRegression(max_iter=50000),
    n_jobs=-1,
    verbose=2,
)

## Training

In [17]:
# Fit the whole stack (base classifiers plus final estimator) on the training split.
stacking_clf.fit(X=X_train, y=y_train)

## Test/score

In [18]:
# Predict labels for the validation split; kept in a variable so the accuracy
# can be computed in a separate cell.
stacking_clf_pred = stacking_clf.predict(X=X_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


In [27]:
# Validation accuracy of the stacking classifier.
# accuracy_score expects (y_true, y_pred): ground-truth labels first,
# predictions second. The value is the same either way for plain accuracy,
# but the documented order keeps the call correct if weights or another
# metric are swapped in later.
accuracy_score(y_val, stacking_clf_pred)

0.9793

In [28]:
# Mean accuracy of the stacking classifier on the held-out test split.
stacking_clf.score(X=X_test, y=y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished


0.9802

## Dump

In [29]:
import joblib

In [31]:
import os

# Persist the fitted stacking classifier. The target directory is created
# first so the dump does not fail when the folder does not exist yet.
dump_path = "../../dump/05/03/stacking-classifier.joblib"
os.makedirs(os.path.dirname(dump_path), exist_ok=True)
joblib.dump(stacking_clf, dump_path)

['./dump/05/03/stacking-classifier.joblib']