In [None]:
'''
solve MNIST with tree ensemble
'''

In [76]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [2]:
mnist = fetch_mldata("MNIST original")

In [3]:
print(mnist.data.shape)
print(mnist.target.shape)

(70000, 784)
(70000,)


In [4]:
# shuffle
train_size = 60000
X_train, X_test, y_train, y_test = train_test_split(mnist.data, mnist.target,train_size=train_size, shuffle=True)



In [5]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(60000, 784) (10000, 784) (60000,) (10000,)


In [17]:
# just 1 tree
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [18]:
print("Accuracy test: ", accuracy_score(y_test, tree.predict(X_test)))
print("Accuracy train:", accuracy_score(y_train, tree.predict(X_train)))
# it massively overfitted

Accuracy test:  0.877
Accuracy train: 1.0


In [19]:
tree.tree_.node_count

7967

In [21]:
tree_1 = DecisionTreeClassifier(max_leaf_nodes=500)
tree_1.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=500,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [22]:
print("Accuracy test: ", accuracy_score(y_test, tree_1.predict(X_test)))
print("Accuracy train:", accuracy_score(y_train, tree_1.predict(X_train)))
# the test accuracy hasn't go down yet, let's try more regularization

Accuracy test:  0.8691
Accuracy train: 0.895566666667


In [23]:
tree_2 = DecisionTreeClassifier(max_leaf_nodes=100)
tree_2.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=100,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [24]:
print("Accuracy test: ", accuracy_score(y_test, tree_2.predict(X_test)))
print("Accuracy train:", accuracy_score(y_train, tree_2.predict(X_train)))
# accuracy decreased, let's try something in between

Accuracy test:  0.7843
Accuracy train: 0.787933333333


In [25]:
tree_3 = DecisionTreeClassifier(max_leaf_nodes=300)
tree_3.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=300,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [26]:
print("Accuracy test: ", accuracy_score(y_test, tree_3.predict(X_test)))
print("Accuracy train:", accuracy_score(y_train, tree_3.predict(X_train)))
# ok, we'll stick with max_leaf_nodes = 500
# the best accuracy with one tree is 84.3%

Accuracy test:  0.8544
Accuracy train: 0.867816666667


In [25]:
best_tree = DecisionTreeClassifier(max_leaf_nodes=500)

In [27]:
# grow forest from best tree
forest = RandomForestClassifier(n_estimators=20, max_leaf_nodes=500, n_jobs=-1)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=500,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
print("Accuracy test: ", accuracy_score(y_test, forest.predict(X_test)))
print("Accuracy train:", accuracy_score(y_train, forest.predict(X_train)))
# that's so much better but it overfits a little bit. Let's add more trees

Accuracy test:  0.9387
Accuracy train: 0.953433333333


In [29]:
forest_1 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=500, n_jobs=-1)
forest_1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=500,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
print("Accuracy test: ", accuracy_score(y_test, forest_1.predict(X_test)))
print("Accuracy train:", accuracy_score(y_train, forest_1.predict(X_train)))
# both accuracies increased, how about even more trees

Accuracy test:  0.9483
Accuracy train: 0.960733333333


In [31]:
forest_2 = RandomForestClassifier(n_estimators=500, max_leaf_nodes=500, n_jobs=-1)
forest_2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=500,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [32]:
print("Accuracy test: ", accuracy_score(y_test, forest_2.predict(X_test)))
print("Accuracy train:", accuracy_score(y_train, forest_2.predict(X_train)))
# it stopped increasing, so the best accuracy with random forest is 95.0%

Accuracy test:  0.9497
Accuracy train: 0.961966666667


In [6]:
# stacking trees
n_trees = 50
trees = [DecisionTreeClassifier(max_leaf_nodes=500) for i in range(n_trees)]
for i in range(n_trees):
    print(i)
    trees[i].fit(X_train, y_train)

0
1


KeyboardInterrupt: 

In [None]:
print("Accuracy test: ", accuracy_score(y_test, trees[5].predict(X_test)))
print("Accuracy train:", accuracy_score(y_train, trees[5].predict(X_train)))

In [None]:
# train the blender
X_stack_tree = np.concatenate([trees[i].predict(X_train).reshape([-1,1]) for i in range(n_trees)], axis=1)

In [7]:
# stacking forests
n_forests = 10
forests = [RandomForestClassifier(n_estimators=100, max_leaf_nodes=500, n_jobs=-1) for i in range(n_forests)]
for i in range(n_forests):
    print(i)
    forests[i].fit(X_train, y_train)

0
1
2
3
4
5
6
7
8
9


In [14]:
print("Accuracy test: ", accuracy_score(y_test, forests[7].predict(X_test)))
print("Accuracy train:", accuracy_score(y_train, forests[7].predict(X_train)))


Accuracy test:  0.9472
Accuracy train: 0.960533333333


In [9]:
X_stack_forest = np.concatenate([forests[i].predict_proba(X_train) for i in range(n_forests)], axis=1)

In [60]:
forest_blender = DecisionTreeClassifier(max_leaf_nodes=150)
forest_blender.fit(X_stack_forest, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=100,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [10]:
def stack_predict(estimators, blender, X):
    X_stack = np.concatenate([estimator.predict_proba(X) for estimator in estimators], axis=1)
    return blender.predict(X_stack)

In [62]:
print("Accuracy test: ", accuracy_score(y_test, stack_predict(forests, forest_blender, X_test)))
print("Accuracy train:", accuracy_score(y_train, stack_predict(forests, forest_blender, X_train)))
# after regularization, the best that a tree blender can achieve is 95.1%

Accuracy test:  0.9509
Accuracy train: 0.9722


In [44]:
# logistic blender
forest_blender_1 = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=0.1, n_jobs=-1)
forest_blender_1.fit(X_stack_forest, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [45]:
print("Accuracy test: ", accuracy_score(y_test, stack_predict(forests, forest_blender_1, X_test)))
print("Accuracy train:", accuracy_score(y_train, stack_predict(forests, forest_blender_1, X_train)))
# after regularization, the best that a logistic blender can achieve is 95.3%

Accuracy test:  0.9531
Accuracy train: 0.965066666667


In [47]:
# SGD classifier
forest_blender_2 = SGDClassifier()
forest_blender_2.fit(X_stack_forest, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [48]:
print("Accuracy test: ", accuracy_score(y_test, stack_predict(forests, forest_blender_2, X_test)))
print("Accuracy train:", accuracy_score(y_train, stack_predict(forests, forest_blender_2, X_train)))
# SGD won't get to 96 either

Accuracy test:  0.9523
Accuracy train: 0.96555


In [50]:
# let's do an ensemble of 50 forests
n_forests_1 = 50
forests_1 = [RandomForestClassifier(n_estimators=100, max_leaf_nodes=500, n_jobs=-1) for i in range(n_forests_1)]
for i in range(n_forests_1):
    print(i)
    forests_1[i].fit(X_train, y_train)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [51]:
X_stack_forest_1 = np.concatenate([forests_1[i].predict_proba(X_train) for i in range(n_forests_1)], axis=1)

In [52]:
forest_blender_3 = SGDClassifier()
forest_blender_3.fit(X_stack_forest_1, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [53]:
print("Accuracy test: ", accuracy_score(y_test, stack_predict(forests_1, forest_blender_3, X_test)))
print("Accuracy train:", accuracy_score(y_train, stack_predict(forests_1, forest_blender_3, X_train)))

Accuracy test:  0.9532
Accuracy train: 0.965766666667


In [54]:
forest_blender_4 = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=0.1, n_jobs=-1)
forest_blender_4.fit(X_stack_forest_1, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [55]:
print("Accuracy test: ", accuracy_score(y_test, stack_predict(forests_1, forest_blender_4, X_test)))
print("Accuracy train:", accuracy_score(y_train, stack_predict(forests_1, forest_blender_4, X_train)))

Accuracy test:  0.9541
Accuracy train: 0.966633333333


In [61]:
forest_blender_5 = DecisionTreeClassifier(max_leaf_nodes=300)
forest_blender_5.fit(X_stack_forest_1, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=300,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [62]:
print("Accuracy test: ", accuracy_score(y_test, stack_predict(forests_1, forest_blender_5, X_test)))
print("Accuracy train:", accuracy_score(y_train, stack_predict(forests_1, forest_blender_5, X_train)))

Accuracy test:  0.9466
Accuracy train: 0.984366666667


In [63]:
X_stack_forest_2 = np.concatenate([forests_1[i].predict(X_train).reshape([-1,1]) for i in range(n_forests_1)], axis=1)

In [71]:
forest_blender_6 = DecisionTreeClassifier(max_leaf_nodes=100)
forest_blender_6.fit(X_stack_forest_2, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=100,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [66]:
def stack_predict_2(estimators, blender, X):
    X_stack = np.concatenate([estimator.predict(X).reshape([-1,1]) for estimator in estimators], axis=1)
    return blender.predict(X_stack)

In [72]:
print("Accuracy test: ", accuracy_score(y_test, stack_predict_2(forests_1, forest_blender_6, X_test)))
print("Accuracy train:", accuracy_score(y_train, stack_predict_2(forests_1, forest_blender_6, X_train)))

Accuracy test:  0.9494
Accuracy train: 0.965183333333


In [74]:
forest_blender_7 = SGDClassifier()
forest_blender_7.fit(X_stack_forest_2, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [75]:
print("Accuracy test: ", accuracy_score(y_test, stack_predict_2(forests_1, forest_blender_7, X_test)))
print("Accuracy train:", accuracy_score(y_train, stack_predict_2(forests_1, forest_blender_7, X_train)))

Accuracy test:  0.2749
Accuracy train: 0.278583333333


In [79]:
# stacking forests only increases the accuracy of one forest for a tiny bit.
# we'll try adaboost
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),n_estimators=100)
ada.fit(X_train, y_train)
# ada boost is sequential so it's super slow

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=100, random_state=None)

In [80]:
print("Accuracy test: ", accuracy_score(y_test,ada.predict(X_test)))
print("Accuracy train: ", accuracy_score(y_train,ada.predict(X_train)))

Accuracy test:  0.8692
Accuracy train:  0.889883333333


In [85]:
ada_1 = AdaBoostClassifier(DecisionTreeClassifier(max_leaf_nodes=150),n_estimators=50,learning_rate=0.05)
ada_1.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=150,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.05, n_estimators=50, random_state=None)

In [86]:
print("Accuracy test: ", accuracy_score(y_test,ada_1.predict(X_test)))
print("Accuracy train: ", accuracy_score(y_train,ada_1.predict(X_train)))

Accuracy test:  0.9394
Accuracy train:  0.972066666667
