In [159]:
import random
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed

In [160]:
class Tree:
    """Stub tree used only to experiment with joblib.

    It mimics the ``fit`` signature of the full ``Tree`` class defined later
    in the notebook, but "fitting" merely flips a marker value so that fitted
    and unfitted instances can be told apart.
    """

    def __init__(self):
        # 0 marks an instance whose fit() has not been called yet.
        self.outcome_probs = 0

    def fit(self, X, y, n_features=None, max_depth=None):
        """Mark this instance as fitted (value 7) and return it.

        All arguments are accepted for signature compatibility and ignored.
        """
        self.outcome_probs = 7
        # Hand the instance back so the caller can collect the fitted object.
        return self

In [161]:
# Toy inputs: the stub Tree.fit ignores them.
X = np.array([1, 2])
y = np.array([3, 4])
trees_in = [Tree() for _ in range(10)]

# One delayed Tree.fit call per tree, executed by two joblib workers.
trees_out = Parallel(n_jobs=2)([delayed(Tree.fit)(t, X, y) for t in trees_in])

# The original objects stay unfitted (all 0 in the output below); only the
# objects returned by Parallel carry the fitted state (all 7).
print([t.outcome_probs for t in trees_in])
print([t.outcome_probs for t in trees_out])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[7, 7, 7, 7, 7, 7, 7, 7, 7, 7]


In [162]:
X, y = np.array([1,2]), np.array([3,4])
trees_in = [Tree() for _ in range(10)]

# Collect the delayed calls in an explicit loop (instead of a generator
# expression) so that per-tree work can be done before each call is queued.
# The list name must match the one handed to Parallel below; previously the
# list was called `fit` while Parallel received the undefined name
# `delayed_tree_fit`, which raises NameError on a fresh kernel.
delayed_tree_fit = []
for tree in trees_in:
    # placeholder: per-tree bootstrap sampling would happen here
    delayed_tree_fit.append(delayed(Tree.fit)(tree, X, y))

trees_out = Parallel(n_jobs=2)(delayed_tree_fit)
# As in the previous cell: inputs stay unfitted, returned trees are fitted.
print([tree.outcome_probs for tree in trees_in])
print([tree.outcome_probs for tree in trees_out])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[7, 7, 7, 7, 7, 7, 7, 7, 7, 7]


In [163]:
PATH = 'data/'

# The CSV has no header row, so pandas labels the columns 0..60.
df = pd.read_csv(PATH+'sonar-all-data.csv', header=None)

# Column 60 is the label; every other column becomes feat_<n>.
df.columns = ['target' if col == 60 else f'feat_{col}' for col in df.columns]

# Encode the label as an integer: 'M' -> 1, 'R' -> 0.
df['target'] = df['target'].map({'M': 1, 'R': 0})

# Hold out 20% of the rows for testing; fixed random_state for a stable split.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.2,
    random_state=2020,
)

In [164]:
def gini_index(x):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``x``.

    ``x`` is passed to ``np.bincount``, so it must hold non-negative integers.
    An empty ``x`` yields 0.0.
    """
    n_labels = len(x)
    if n_labels == 0:
        return 0.0
    class_freq = np.bincount(x) / n_labels
    return 1 - (class_freq * class_freq).sum()

def gini_gain(parent_node, splits):
    """Return the drop in Gini impurity obtained by partitioning a node.

    The impurity of each element of ``splits`` is weighted by its share of
    ``len(parent_node)``; the weighted sum is subtracted from the parent's
    impurity.
    """
    n_parent = len(parent_node)
    weighted_children = [
        gini_index(child) * (len(child) / n_parent) for child in splits
    ]
    return gini_index(parent_node) - np.sum(weighted_children)

def entropy(x):
    """Return the Shannon entropy (natural log) of the labels in ``x``.

    ``x`` is passed to ``np.bincount``, so it must hold non-negative integers.
    An empty ``x`` yields 0.0.
    """
    n_labels = len(x)
    if n_labels == 0:
        return 0.0
    # Clip away exact zeros so that log() never sees 0.
    class_freq = np.clip(np.bincount(x) / n_labels, 1e-15, 1.)
    return -(class_freq * np.log(class_freq)).sum()

def information_gain(parent_node, splits):
    """Return the drop in entropy obtained by partitioning a node.

    The entropy of each element of ``splits`` is weighted by its share of
    ``len(parent_node)``; the weighted sum is subtracted from the parent's
    entropy.
    """
    n_parent = len(parent_node)
    weighted_children = [
        entropy(child) * (len(child) / n_parent) for child in splits
    ]
    return entropy(parent_node) - np.sum(weighted_children)

def split(X, y, value):
    """Partition the labels ``y`` by thresholding the feature values ``X``.

    Returns ``(y_left, y_right)`` where ``y_left`` holds the labels whose
    feature value is ``< value`` and ``y_right`` those with ``>= value``.
    """
    below = X < value
    at_or_above = X >= value
    return y[below], y[at_or_above]

def split_dataset(X, y, column, value):
    """Partition rows of ``X`` and labels ``y`` on one feature column.

    Rows whose ``X[:, column]`` is ``< value`` go left, rows with ``>= value``
    go right. Returns ``(left_X, right_X, left_y, right_y)``.
    """
    feature = X[:, column]
    below = feature < value
    at_or_above = feature >= value
    return X[below], X[at_or_above], y[below], y[at_or_above]

In [165]:
class Tree(object):
    """One node of a binary decision tree.

    A node is either internal (it stores a split and owns two child ``Tree``
    nodes) or terminal (it stores its prediction in ``outcome_probs``).

    ``criterion`` is called as ``criterion(y, (left_y, right_y))`` and must
    return a gain value; the split with the largest gain is chosen.

    The leaf prediction is ``round(mean(y))``, which is only a class label
    when the targets are 0/1 -- assumed here, confirm before using other
    label encodings.
    """

    def __init__(self, criterion=None):
        self.impurity = None       # gain of the split chosen at this node
        self.threshold = None      # split value; rows with feature < threshold go left
        self.column_index = None   # index of the feature column used by the split
        self.outcome_probs = None  # prediction stored on terminal nodes
        self.criterion = criterion
        self.left_child = None
        self.right_child = None

    @property
    def is_terminal(self):
        """True when the node lacks at least one child, i.e. it is a leaf."""
        return self.left_child is None or self.right_child is None

    def _find_splits(self, X):
        """Return candidate thresholds: midpoints between consecutive unique values of ``X``."""
        x_unique = list(np.unique(X))
        # A set drops midpoints that coincide after floating-point rounding.
        midpoints = {(low + high) / 2.0 for low, high in zip(x_unique, x_unique[1:])}
        return list(midpoints)

    def _find_best_split(self, X, y, n_features):
        """Search a random subset of ``n_features`` columns for the best split.

        Returns ``(column, value, gain)``; all three are ``None`` when no
        column in the subset offers a candidate threshold.
        """
        subset = random.sample(list(range(0, X.shape[1])), n_features)
        max_gain, max_col, max_val = None, None, None
        for column in subset:
            feature = X[:, column]
            for value in self._find_splits(feature):
                gain = self.criterion(y, split(feature, y, value))
                if (max_gain is None) or (gain > max_gain):
                    max_col, max_val, max_gain = column, value, gain
        return max_col, max_val, max_gain

    def _set_leaf(self, y):
        """Turn this node into a leaf predicting the rounded mean of ``y``."""
        self.outcome_probs = np.around(np.sum(y) / y.shape[0])
        return self

    def fit(self, X, y, n_features=None, max_depth=None):
        """Recursively grow the tree below this node and return ``self``.

        Parameters
        ----------
        X : 2-D numpy array of features (rows are samples).
        y : 1-D numpy array of labels aligned with the rows of ``X``.
        n_features : number of randomly chosen columns examined at each node;
            defaults to all columns.
        max_depth : maximum number of split levels below this node; ``None``
            means unlimited, ``0`` forces a leaf.

        A node becomes a leaf when the depth budget is exhausted, when all
        labels are identical, or when no candidate split exists. These
        conditions are tested explicitly rather than with ``assert`` so the
        method keeps working under ``python -O`` and never swallows an
        unrelated ``AssertionError``.
        """
        # Start from a clean slate so re-fitting an already fitted node
        # cannot leave stale children or a stale prediction behind.
        self.impurity = self.threshold = self.column_index = None
        self.outcome_probs = None
        self.left_child = self.right_child = None

        if max_depth is not None and max_depth <= 0:
            return self._set_leaf(y)

        # Splitting a pure node cannot change any prediction (every
        # descendant leaf would predict the same label), so stop here.
        if np.unique(y).size <= 1:
            return self._set_leaf(y)

        if n_features is None:
            n_features = X.shape[1]

        column, value, gain = self._find_best_split(X, y, n_features)
        if gain is None:
            return self._set_leaf(y)

        self.column_index = column
        self.threshold = value
        self.impurity = gain

        child_depth = None if max_depth is None else max_depth - 1
        left_X, right_X, left_target, right_target = split_dataset(X, y, column, value)

        self.left_child = Tree(self.criterion)
        self.left_child.fit(left_X, left_target, n_features, child_depth)

        self.right_child = Tree(self.criterion)
        self.right_child.fit(right_X, right_target, n_features, child_depth)

        # Returning self lets a caller that ran fit() on a copy of the tree
        # (e.g. through joblib) collect the fitted object.
        return self

    def predict_row(self, row):
        """Route one sample down the tree and return the leaf's prediction."""
        if self.is_terminal:
            return self.outcome_probs
        if row[self.column_index] < self.threshold:
            return self.left_child.predict_row(row)
        return self.right_child.predict_row(row)

    def predict(self, X):
        """Return a float array with one prediction per row of the 2-D array ``X``."""
        result = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            result[i] = self.predict_row(X[i, :])
        return result

In [166]:
class RandomForestClassifier(object):
    """Bagged ensemble of ``Tree`` classifiers with majority voting.

    Parameters
    ----------
    n_jobs : number of joblib workers used to fit the trees. ``1`` (also
        ``0`` or ``None``) fits sequentially in this process; any other value
        is handed to ``joblib.Parallel``, so negative values follow joblib's
        convention (``-1`` = all CPUs).
    n_estimators : number of trees.
    max_depth : depth limit forwarded to every tree (``None`` = unlimited).
    n_features : number of columns each tree node samples; defaults to
        ``int(sqrt(n_columns))`` at fit time.
    criterion : ``"entropy"`` or ``"gini"``.
    bootstrap : when true every tree is fitted on a sample of the rows drawn
        with replacement; otherwise on the full data.

    Predictions average the per-tree outputs and round them, so the targets
    are assumed to be 0/1 labels -- confirm before using other encodings.
    """

    def __init__(self, n_jobs=1, n_estimators=10, max_depth=None, n_features=None, criterion="entropy", bootstrap=True):
        self.n_jobs = n_jobs
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.n_features = n_features
        self.bootstrap = bootstrap

        if criterion == "entropy":
            self.criterion = information_gain
        elif criterion == "gini":
            self.criterion = gini_gain
        else:
            raise ValueError(f"Unknown criterion '{criterion}'")

        self.trees = [Tree(criterion=self.criterion) for _ in range(n_estimators)]

        # Training data is attached by fit(). Starting at None lets predict()
        # detect an unfitted model and raise its ValueError instead of
        # failing with AttributeError.
        self.X = None
        self.y = None

    def _init_data(self, X, y):
        """Store the training data as numpy arrays and remember the row count."""
        self.size = len(X)
        self.X = X if isinstance(X, np.ndarray) else np.array(X)
        self.y = y if isinstance(y, np.ndarray) else np.array(y)

    def bootstrap_data(self, size):
        """Return ``size`` row indices drawn uniformly with replacement from ``range(size)``."""
        return np.random.randint(size, size=size)

    def fit(self, X, y):
        """Fit all trees on ``X`` (2-D, rows are samples) and ``y`` (labels).

        Raises ``ValueError`` when ``n_features`` exceeds the number of
        columns of ``X``.
        """
        # Convert before touching .shape so plain nested lists are accepted.
        X_arr = X if isinstance(X, np.ndarray) else np.array(X)
        n_columns = X_arr.shape[1]

        if self.n_features is None:
            self.n_features = int(np.sqrt(n_columns))
        elif n_columns < self.n_features:
            raise ValueError(
                f"n_features ({self.n_features}) should be <= "
                f"the number of columns in X ({n_columns})"
            )

        self._init_data(X_arr, y)

        # Fresh trees on every call: a second fit() must not build on nodes
        # left over from an earlier one.
        trees = [Tree(criterion=self.criterion) for _ in range(self.n_estimators)]

        run_parallel = self.n_jobs is not None and self.n_jobs not in (0, 1)
        jobs = []
        for tree in trees:
            if self.bootstrap:
                idxs = self.bootstrap_data(self.size)
                X_sample, y_sample = self.X[idxs], self.y[idxs]
            else:
                X_sample, y_sample = self.X, self.y

            if run_parallel:
                jobs.append(
                    delayed(Tree.fit)(tree, X_sample, y_sample, self.n_features, self.max_depth)
                )
            else:
                tree.fit(X_sample, y_sample, self.n_features, self.max_depth)

        if jobs:
            # Keep the trees returned by Parallel: the local `trees` objects
            # may not have been fitted in this process, which is why
            # Tree.fit returns self.
            trees = Parallel(n_jobs=self.n_jobs)(jobs)

        self.trees = trees

    def predict(self, X):
        """Return a float array with the majority-vote label for each row of ``X``.

        Raises ``ValueError`` when called before ``fit``.
        """
        if self.X is None:
            raise ValueError("You should fit a model before `predict`")

        if not isinstance(X, np.ndarray):
            X = np.array(X)

        predictions = np.zeros(len(X))
        for i in range(len(X)):
            votes = sum((tree.predict_row(X[i, :]) for tree in self.trees), 0.)
            # Mean vote rounded to the nearest label.
            predictions[i] = round(votes / self.n_estimators)
        return predictions

In [170]:
%%time
model = RandomForestClassifier(n_jobs=1, n_estimators=10, max_depth=None, n_features=None, criterion="entropy")
model.fit(X_train, y_train)
print(f"Accuracy score is: {accuracy_score(y_test, model.predict(X_test))}")

Accuracy score is: 0.8095238095238095
CPU times: user 6.78 s, sys: 3.74 ms, total: 6.78 s
Wall time: 6.78 s


In [171]:
%%time
model = RandomForestClassifier(n_jobs=2, n_estimators=10, max_depth=None, n_features=None, criterion="entropy")
model.fit(X_train, y_train)
print(f"Accuracy score is: {accuracy_score(y_test, model.predict(X_test))}")

Accuracy score is: 0.8095238095238095
CPU times: user 73.7 ms, sys: 15.9 ms, total: 89.6 ms
Wall time: 3.72 s


In [172]:
%%time
model = RandomForestClassifier(n_jobs=4, n_estimators=10, max_depth=None, n_features=None, criterion="entropy")
model.fit(X_train, y_train)
print(f"Accuracy score is: {accuracy_score(y_test, model.predict(X_test))}")

Accuracy score is: 0.8571428571428571
CPU times: user 158 ms, sys: 36.1 ms, total: 194 ms
Wall time: 3.11 s
