In [52]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import numpy as np

In [53]:
# Load the Iris dataset that ships with scikit-learn.
iris = load_iris()

In [54]:
# Class labels, one per sample.
target = iris.target
# Feature matrix, one row per sample.
data = iris.data

In [55]:
class DecisionNode:
    """One node of a binary decision tree.

    The node keeps the samples (`data`) and labels (`y`) that reached it.
    An internal node also records the feature index (`f_index`) and
    threshold (`f_value`) of its split together with its two children;
    a leaf is flagged with `is_leaf=True` and normally has those four
    fields set to None.
    """

    def __init__(self, data, y, f_index, f_value, right_node, left_node, is_leaf=False):
        # Samples and labels that reached this node.
        self.data, self.y = data, y
        # Split description: feature column and threshold value.
        self.f_index, self.f_value = f_index, f_value
        # Child subtrees (None for a leaf).
        self.left_node = left_node
        self.right_node = right_node
        # True when the node has no split and acts as a prediction leaf.
        self.is_leaf = is_leaf

In [56]:
class DecisionTree:
    """Classification tree grown by greedy, axis-aligned threshold splits.

    Every internal node sends samples with ``feature > threshold`` to its
    right child and all other samples to its left child. Thresholds are
    taken from the feature values present in the training data.

    Parameters:
        max_depth: Maximum number of splits on any root-to-leaf path.
            ``None`` disables the limit.
        loss: ``"entropy"`` selects Shannon entropy; any other value
            selects Gini impurity.

    Attributes:
        depth: Depth of the deepest node created by the last ``train`` call.
        tree: Root node of the trained tree, or ``None`` before training.
    """

    def __init__(self, max_depth=5, loss="entropy"):
        self.depth = 0
        self.max_depth = max_depth
        self.tree = None
        if loss == "entropy":
            self.loss = self.entropy
        else:
            self.loss = self.gini

    @staticmethod
    def entropy(y):
        """Return the Shannon entropy (natural logarithm) of the labels ``y``."""
        prob = np.unique(y, return_counts=True)[1] / len(y)
        return -np.sum(prob * np.log(prob))

    @staticmethod
    def gini(y):
        """Return the Gini impurity of the labels ``y``."""
        prob = np.unique(y, return_counts=True)[1] / len(y)
        return np.sum(prob * (1 - prob))

    def train(self, data, y):
        """Fit the tree to ``data`` (n_samples x n_features) and labels ``y``.

        Raises:
            ValueError: If ``data`` is not 2-D, is empty, or its length
                differs from the length of ``y``.
        """
        data = np.asarray(data)
        y = np.asarray(y)
        if data.ndim != 2:
            raise ValueError("data must be a 2-D array (n_samples, n_features)")
        if len(data) == 0 or len(data) != len(y):
            raise ValueError("data and y must be non-empty and of equal length")
        # Reset so that re-training the same instance starts from scratch.
        self.depth = 0
        self.tree = self.iterate(data, y)

    def _best_split(self, data, y):
        """Return ``(feature, threshold)`` of the best split, or ``(None, None)``.

        A split is scored by the size-weighted average impurity of its two
        children and is accepted only if that score is strictly lower than
        the impurity of the unsplit node.
        """
        n_samples = len(y)
        best_score = self.loss(y)
        best_f, best_value = None, None
        for f in range(data.shape[1]):
            column = data[:, f]
            # The largest value would leave the right child empty, and
            # duplicate values describe the same split, so neither is tried.
            for f_value in np.unique(column)[:-1]:
                goes_right = column > f_value
                n_right = np.count_nonzero(goes_right)
                if n_right == 0 or n_right == n_samples:
                    continue
                score = (
                    n_right * self.loss(y[goes_right])
                    + (n_samples - n_right) * self.loss(y[~goes_right])
                ) / n_samples
                if score < best_score:
                    best_score = score
                    best_f, best_value = f, f_value
        return best_f, best_value

    def iterate(self, data, y, depth=0):
        """Recursively build and return the subtree for ``data`` / ``y``.

        Parameters:
            data: Samples that reached this node.
            y: Labels matching ``data``.
            depth: Number of splits above this node (0 for the root).

        Returns:
            A ``DecisionNode``; a leaf when the depth limit is reached or
            no split lowers the impurity.
        """
        self.depth = max(self.depth, depth)
        at_limit = self.max_depth is not None and depth >= self.max_depth
        best_f, best_value = (None, None) if at_limit else self._best_split(data, y)
        if best_f is None:
            return DecisionNode(data, y, None, None, None, None, is_leaf=True)
        goes_right = data[:, best_f] > best_value
        return DecisionNode(
            data, y, best_f, best_value,
            right_node=self.iterate(data[goes_right], y[goes_right], depth + 1),
            left_node=self.iterate(data[~goes_right], y[~goes_right], depth + 1),
            is_leaf=False,
        )

    def predict_datum(self, test_datum):
        """Return the predicted label for one sample (1-D feature vector).

        The prediction is the most frequent training label in the leaf the
        sample falls into; ties go to the smallest label.

        Raises:
            RuntimeError: If the tree has not been trained.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.train must be called before predicting")
        node = self.tree
        while not node.is_leaf:
            if test_datum[node.f_index] > node.f_value:
                node = node.right_node
            else:
                node = node.left_node
        labels, counts = np.unique(node.y, return_counts=True)
        return labels[counts.argmax()]

    def predict(self, test_data):
        """Predict labels for one sample (1-D) or a batch of samples (2-D).

        Returns a single label for 1-D input and an array of labels, one
        per row, otherwise.
        """
        test_data = np.asarray(test_data)
        # Dispatch on dimensionality so an (n, 1) batch is not mistaken
        # for a single sample.
        if test_data.ndim == 1:
            return self.predict_datum(test_data)
        return np.array([self.predict_datum(datum) for datum in test_data])
    


In [57]:
class RandomForest:
    """Bagging ensemble of ``DecisionTree`` classifiers with majority voting.

    Parameters:
        n: Number of trees in the forest.
        max_depth: ``max_depth`` passed to every tree.
        loss: ``loss`` passed to every tree.
        random_state: Optional seed for the bootstrap sampling. When
            ``None`` the global ``np.random`` state is used.

    Attributes:
        forest: List of trained trees; empty before ``train`` is called.
    """

    def __init__(self, n, max_depth=5, loss='entropy', random_state=None):
        self.n = n
        self.loss = loss
        self.max_depth = max_depth
        self.random_state = random_state
        self.forest = []

    def train(self, data, y):
        """Train ``n`` trees, each on a bootstrap sample of ``data`` / ``y``.

        Each bootstrap sample draws ``len(data)`` row indices uniformly
        with replacement.
        """
        data = np.asarray(data)
        y = np.asarray(y)
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        n_samples = len(data)
        self.forest = []
        for _ in range(self.n):
            index = rng.randint(0, n_samples, n_samples)
            tree = DecisionTree(max_depth=self.max_depth, loss=self.loss)
            tree.train(data[index], y[index])
            self.forest.append(tree)

    def predict_datum(self, test_datum):
        """Return the majority-vote label for one sample (1-D feature vector).

        Ties go to the smallest label.

        Raises:
            RuntimeError: If the forest has not been trained.
        """
        if not self.forest:
            raise RuntimeError("RandomForest.train must be called before predicting")
        votes = [tree.predict_datum(test_datum) for tree in self.forest]
        labels, counts = np.unique(votes, return_counts=True)
        return labels[counts.argmax()]

    def predict(self, test_data):
        """Predict labels for one sample (1-D) or a batch of samples (2-D).

        Returns a single label for 1-D input and an array of labels, one
        per row, otherwise.
        """
        test_data = np.asarray(test_data)
        # Dispatch on dimensionality so an (n, 1) batch is not mistaken
        # for a single sample.
        if test_data.ndim == 1:
            return self.predict_datum(test_data)
        return np.array([self.predict_datum(datum) for datum in test_data])

In [58]:
# Hold out 20% of the samples for evaluation. No random_state is passed,
# so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

In [59]:
# Single tree using Gini impurity; max_depth keeps its default of 5.
dt = DecisionTree(loss='gini') 

In [60]:
# Fit the tree on the training split.
dt.train(X_train, y_train)

In [61]:
# Predict a class label for every held-out sample.
dt.predict(X_test)

array([1, 0, 0, 0, 1, 1, 0, 0, 2, 2, 2, 0, 2, 0, 2, 1, 0, 2, 2, 2, 0, 0,
       2, 0, 1, 2, 0, 1, 2, 1])

In [62]:
# Forest of 10 trees, each with max_depth=4 and the default entropy loss.
rf = RandomForest(10,max_depth=4)

In [63]:
# Fit every tree of the forest on a bootstrap sample of the training split.
rf.train(X_train,y_train)

In [64]:
# Majority-vote prediction for every held-out sample.
rf.predict(X_test)

array([1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 2, 0, 2, 0, 2, 1, 0, 2, 2, 2, 0, 0,
       2, 0, 1, 2, 0, 1, 2, 1])