In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [2]:
# Get the mnist data
mnist = datasets.fetch_openml('mnist_784', version=1, cache=True)
X, y = mnist["data"], mnist["target"]

In [3]:
# Split data into three: 10k test, 10k validation, 50k training
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, random_state=42, test_size=10000
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, random_state=42, test_size=10000
)

In [4]:
X_train.shape

(50000, 784)

# Train various classifiers:
- random forest
- extra trees
- svm

In [5]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [6]:
et_clf = ExtraTreesClassifier()
et_clf.fit(X_train, y_train)



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [7]:
sv_clf = SVC()
sv_clf.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

# Save models

In [8]:
from joblib import dump, load

In [10]:
dump(rf_clf, "rf_clf.joblib")
dump(et_clf, "et_clf.joblib")
dump(sv_clf, "sv_clf.joblib")

['sv_clf.joblib']

# Load models

In [16]:
rf_clf = load("rf_clf.joblib")
et_clf = load("et_clf.joblib")
sv_clf = load("sv_clf.joblib")

# Compare scores of individual classifiers

In [None]:
y_pred = rf

# Combine into ensemble
- try both soft and hard voting

In [None]:
# Try best on the test set