In [6]:
import numpy as np
import scipy as sp
from sklearn.base import BaseEstimator
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Load the scikit-learn digits dataset once; the cells below read
# `digits.data` (features) and `digits.target` (labels) from it.
digits = load_digits()


In [7]:
# Baseline: a single, default-configured decision tree.
# The cell's value is the mean accuracy over 10 cross-validation folds.
tree = DecisionTreeClassifier()
cross_val_score(
    tree,
    digits.data,
    digits.target,
    cv=10,
    scoring="accuracy",
).mean()


0.8224643078833023

In [8]:
# Reference score: scikit-learn's own random forest with 100 trees,
# evaluated as mean accuracy over 10 cross-validation folds.
rf = RandomForestClassifier(n_estimators=100)
cross_val_score(rf, digits.data, digits.target, cv=10, scoring="accuracy").mean()


0.9482371198013656

In [9]:
# Your implementation of Random Forest
class MyRandomForest(BaseEstimator):
    """Minimal random-forest classifier built from scikit-learn decision trees.

    Each tree is a ``DecisionTreeClassifier(max_features="sqrt")`` trained on a
    bootstrap sample (rows drawn with replacement) of the training data.
    Predictions are the majority vote over all fitted trees.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to fit.
    random_state : None, int or numpy.random.RandomState, default=None
        Source of randomness for the bootstrap samples and the trees.
        ``None`` keeps using numpy's global random state; an int or a
        ``RandomState`` instance makes ``fit`` reproducible.

    Attributes
    ----------
    trees : list of DecisionTreeClassifier
        Trees produced by the most recent call to ``fit`` (empty before).
    classes_ : numpy.ndarray
        Sorted unique labels seen by the most recent call to ``fit``.
    """

    def __init__(self, n_estimators=100, random_state=None):
        self.n_estimators = n_estimators  # number of trees to fit
        self.random_state = random_state  # stored untouched so get_params/clone work
        self.trees = []                   # list to store the fitted trees

    def fit(self, X, y):
        """Fit ``n_estimators`` trees, each on its own bootstrap sample.

        Calling ``fit`` again discards the previously fitted trees instead of
        appending to them.

        Parameters
        ----------
        X : array-like, indexable as a 2-D array of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)
        if n_samples != len(y):
            raise ValueError(
                "X and y have inconsistent numbers of samples: "
                f"{n_samples} != {len(y)}"
            )

        if self.random_state is None:
            rng = np.random  # legacy global state, same as the unseeded behaviour
        elif isinstance(self.random_state, np.random.RandomState):
            rng = self.random_state
        else:
            rng = np.random.RandomState(self.random_state)
        seeded = self.random_state is not None

        self.classes_ = np.unique(y)

        # Build into a fresh list so a refit never mixes old and new trees.
        fitted = []
        for _ in range(self.n_estimators):
            # Only derive a per-tree seed when the caller asked for determinism.
            tree_seed = rng.randint(np.iinfo(np.int32).max) if seeded else None
            tree = DecisionTreeClassifier(max_features="sqrt", random_state=tree_seed)
            idx = rng.choice(n_samples, n_samples)  # bootstrap: sample rows with replacement
            tree.fit(X[idx], y[idx])
            fitted.append(tree)
        self.trees = fitted

        return self

    def predict(self, X):
        """Predict labels by majority vote across the fitted trees.

        Ties are resolved in favour of the label that comes first in the
        sorted ``classes_`` array. The returned labels have the dtype of
        ``classes_``, i.e. of the training labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)

        Raises
        ------
        NotFittedError
            If ``fit`` has not produced any trees yet.
        """
        if not self.trees:
            raise NotFittedError(
                "This MyRandomForest instance is not fitted yet; call 'fit' first."
            )

        X = np.asarray(X)
        n_samples = len(X)
        rows = np.arange(n_samples)
        # counts[i, k] = number of trees voting for classes_[k] on sample i
        counts = np.zeros((n_samples, len(self.classes_)), dtype=np.intp)
        for tree in self.trees:
            # Trees return original labels; map them to positions in classes_.
            codes = np.searchsorted(self.classes_, tree.predict(X))
            counts[rows, codes] += 1
        return self.classes_[counts.argmax(axis=1)]


# Test MyRandomForest


In [10]:
# Train the custom forest on the complete digits dataset.
# fit() hands back the estimator itself, so construction and fitting can be chained;
# the bare name on the last line displays the fitted estimator.
mrf = MyRandomForest().fit(digits.data, digits.target)
mrf


MyRandomForest()

In [11]:
# Accuracy on the very data the forest was fitted on (not a held-out estimate).
np.mean(mrf.predict(digits.data) == digits.target)


1.0

In [12]:
# Mean accuracy of the custom forest over 10 cross-validation folds,
# for comparison with the single tree and scikit-learn's forest above.
cross_val_score(
    mrf,
    digits.data,
    digits.target,
    cv=10,
    scoring="accuracy",
).mean()


0.9515859714463065

In [13]:
# Majority-vote demo: each row holds the votes of 5 trees for one test point.
predictions = np.random.choice(2, (20, 5))  # 2 classes, 20 test data points, 5 trees
print(predictions)
# Row-wise mode = the most frequent vote per test point, flattened to 1-D.
sp.stats.mode(predictions, axis=1).mode.ravel()


[[1 1 1 0 0]
 [0 1 1 0 0]
 [0 0 1 1 0]
 [1 1 1 1 1]
 [0 1 1 0 1]
 [1 0 0 1 1]
 [0 0 0 0 0]
 [1 0 0 1 0]
 [1 1 0 1 1]
 [0 1 0 0 1]
 [0 1 1 1 0]
 [1 1 0 1 0]
 [0 0 0 0 1]
 [0 0 1 1 1]
 [0 0 0 1 0]
 [0 0 1 0 1]
 [0 0 1 1 0]
 [1 0 1 1 0]
 [0 1 1 1 1]
 [0 0 1 0 1]]


array([1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0])