In [19]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import random, math
from sklearn.tree import DecisionTreeClassifier
from scipy import stats
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Load the Iris dataset and pull out the feature matrix and the label vector.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# shuffled split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [20]:
class RandomForest:
    """Bagged ensemble of shallow decision trees with majority-vote prediction.

    Every tree is trained on its own resampled copy of the training set. The
    rows a tree never saw (its out-of-bag rows) are used during ``fit`` to
    print a per-tree validation accuracy and the average over all trees.

    Parameters
    ----------
    B : int
        Number of trees in the ensemble.
    bootstrap_ratio : float
        Size of each tree's sample as a fraction of the training set.
        When sampling without replacement it must be less than 1, otherwise
        every tree sees the same rows; it may never exceed 1 in that mode.
    with_no_replacement : bool, default False
        If True a row is drawn at most once per tree; otherwise rows are drawn
        with replacement.

    Attributes
    ----------
    models : list
        The ``B`` underlying ``DecisionTreeClassifier`` instances.
    oob_scores_ : list of float
        Set by ``fit``: out-of-bag accuracy of every tree that had at least
        one out-of-bag row.
    oob_score_ : float
        Set by ``fit``: mean of ``oob_scores_`` (``nan`` if it is empty).
    """

    def __init__(self, B, bootstrap_ratio, with_no_replacement=False):
        self.B = B
        self.bootstrap_ratio = bootstrap_ratio
        self.with_no_replacement = with_no_replacement
        self.tree_params = {'max_depth': 2, 'max_features': 'sqrt'}
        self.models = [DecisionTreeClassifier(**self.tree_params) for _ in range(self.B)]

    def _draw_indices(self, m, sample_size):
        """Return ``sample_size`` row indices from ``range(m)`` for one tree."""
        if self.with_no_replacement:
            # random.sample never repeats an index and cannot spin forever the
            # way re-rolling until an unused index turns up can.
            return random.sample(range(m), sample_size)
        return [random.randrange(m) for _ in range(sample_size)]

    def fit(self, X_train, y_train):
        """Fit every tree on its own sample and print out-of-bag accuracies.

        Parameters
        ----------
        X_train : array-like of shape (m, n)
            Training features.
        y_train : array-like of shape (m,)
            Training labels; their dtype is kept as-is for training.

        Raises
        ------
        ValueError
            If the per-tree sample would be empty, or if sampling without
            replacement is asked to draw more rows than exist.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        m = X_train.shape[0]

        # sample size for each tree (equals m when bootstrap_ratio is 1)
        sample_size = int(self.bootstrap_ratio * m)
        if sample_size < 1:
            raise ValueError(
                f"bootstrap_ratio={self.bootstrap_ratio!r} gives an empty sample "
                f"for {m} training rows"
            )
        if self.with_no_replacement and sample_size > m:
            raise ValueError(
                f"cannot draw {sample_size} rows without replacement from {m} rows; "
                "use bootstrap_ratio <= 1"
            )

        oob_scores = []
        print('OUT OF BAG VALIDATION SCORES')
        for i, model in enumerate(self.models):
            idx = self._draw_indices(m, sample_size)
            model.fit(X_train[idx], y_train[idx])

            # True where a row was drawn for this tree; the complement is the
            # tree's out-of-bag validation set.
            in_bag = np.zeros(m, dtype=bool)
            in_bag[idx] = True
            if in_bag.all():
                # Nothing left to validate on (e.g. ratio 1 without replacement).
                print(f"Tree {i}", "n/a (no out-of-bag samples)")
                continue

            score = accuracy_score(y_train[~in_bag], model.predict(X_train[~in_bag]))
            oob_scores.append(score)
            print(f"Tree {i}", score)

        # Average only over the trees that could actually be scored.
        avg_oob_score = sum(oob_scores) / len(oob_scores) if oob_scores else float('nan')
        self.oob_scores_ = oob_scores
        self.oob_score_ = avg_oob_score
        print("AVERAGE OUT OF BAG VALIDATION SCORE")
        print(avg_oob_score)

    def predict(self, X):
        """Predict one class label per row of ``X`` by majority vote.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Rows to classify.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            The label most trees voted for; ties go to the smallest label.
        """
        # one row of votes per tree -> shape (n_trees, n_samples)
        predictions = np.stack([np.asarray(model.predict(X)) for model in self.models])
        classes = np.unique(predictions)  # sorted, so argmax below breaks ties low
        # votes[c, s] = number of trees that predicted classes[c] for sample s
        votes = (predictions[np.newaxis, :, :] == classes[:, np.newaxis, np.newaxis]).sum(axis=1)
        return classes[votes.argmax(axis=0)]

In [21]:
# Seed the RNGs so Restart & Run All reproduces the scores below: the bootstrap
# indices are drawn with the stdlib `random` module, and the trees are created
# without a random_state (presumably falling back to NumPy's global RNG for
# their feature sub-sampling — confirm against the scikit-learn docs).
random.seed(42)
np.random.seed(42)

# Train a 5-tree forest on full-size bootstrap samples (drawn with replacement);
# fit() prints the per-tree and average out-of-bag accuracy.
model = RandomForest(B=5, bootstrap_ratio=1, with_no_replacement=False)
model.fit(X_train, y_train)

# Evaluate the majority-vote predictions on the held-out test split.
yhat = model.predict(X_test)
print('\nCLASSIFICATION REPORT')
print(classification_report(y_test, yhat))

OUT OF BAG VALIDATION SCORES
Tree 0 0.875
Tree 1 0.85
Tree 2 0.9090909090909091
Tree 3 0.9714285714285714
Tree 4 0.9444444444444444
AVERAGE OUT OF BAG VALIDATION SCORE
0.909992784992785

CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

