In [155]:
import numpy as np
import scipy as sp
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score

digits = load_digits()


In [156]:
# Normal Decision Tree
tree = DecisionTreeClassifier()
cross_val_score(tree, digits.data, digits.target, cv=10, scoring="accuracy").mean()


0.8274798261949099

In [157]:
# Scikit-learn implementation of Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
cross_val_score(rf, digits.data, digits.target, cv=10, scoring="accuracy").mean()


0.9499099937926753

In [158]:
# Your implementation of Random Forest
class MyRandomForest(BaseEstimator):

    def __init__(self, n_estimators=100):
        self.n_estimators = n_estimators  # number of trees to fit
        self.trees = []                  # list to store the fitted trees

    def fit(self, X, y):
        # fit n_estimators DecisionTreeClassifiers (with max_features="sqrt")
        # on a randomized bootstrap of the data
        for _ in range(self.n_estimators):
            tree = DecisionTreeClassifier(max_features="sqrt")
            idx = np.random.choice(2, len(X))  # does not work... :(
            tree.fit(X[idx], y[idx])
            self.trees.append(tree)
        # hint: use function numpy.random.choice() to generate boostrap sample
        # hint: for loop
        # hint: self.trees.append()

        return self

    def predict(self, X):
        # Use the DecisionTreeClassifiers to predict values
        predictions = np.zeros((len(X), self.n_estimators))
        for i in range(self.n_estimators):
            predictions[:, i] = self.trees[i].predict(X)
        # hint: loop over fitted trees
        # hint: store all predictions in a matrix,
        #       then determine majority vote with function scipy.stats.mode() (see cell below for an example)

        return sp.stats.mode(predictions, axis=1)[0].ravel()


# Test MyRandomForest


In [160]:
mrf = MyRandomForest()

mrf.fit(digits.data, digits.target)


MyRandomForest()

In [161]:
(mrf.predict(digits.data) == digits.target).mean()


0.19532554257095158

In [162]:
cross_val_score(mrf, digits.data, digits.target, cv=10, scoring="accuracy").mean()


0.18252327746741154

In [163]:
# example of computing majority vote
predictions = np.random.choice(2, (20, 5))  # 2 classes, 20 test data points, 5 trees
print(predictions)
sp.stats.mode(predictions, axis=1)[0].ravel()


[[0 0 1 1 0]
 [1 0 0 1 1]
 [1 0 0 1 0]
 [1 1 1 1 0]
 [1 0 1 0 0]
 [0 0 1 1 1]
 [1 0 0 1 1]
 [0 1 1 1 1]
 [0 1 0 1 0]
 [1 1 0 1 1]
 [0 1 0 0 1]
 [0 0 1 1 0]
 [0 0 1 1 0]
 [0 1 1 1 0]
 [0 1 0 0 1]
 [0 1 0 0 1]
 [0 0 0 0 1]
 [0 0 0 1 0]
 [1 0 0 1 0]
 [0 0 0 0 1]]


array([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])