In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets
from collections import Counter

In [2]:
iris = datasets.load_iris()
X = iris.data
Y = iris.target
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, random_state = 42)

### Using the Decision Tree algorithm I built previously as the foundation

Gini Impurity

In [3]:
def gini(x): 
    counter = Counter(x)
    total = len(x)
    G = 0
    for value in counter.values():
        prob = value/total
        G += prob * (1 - prob)
    return G

def weighted_gini(a, b):
    return (len(a) * gini(a) + len(b) * gini(b))/(len(a) + len(b))

In [235]:
def identical_values(x):
    test_val = x[0]
    print(test_val)
    if all(x == test_val):
        return True
    else:
        return False

In [195]:
# Used to find the optimal split within a continuous measurement

def split_cont_measure(data, target):
    data = np.array(data)
    
    # sorting the data to find the mid values
    idx = np.argsort(data)
    sorted_data = data[idx]
    
    past_x = sorted_data[0]
    middle_values = [] 
    for x in sorted_data: 
        if x != past_x:
            average_x = 0.5 * (x + past_x)
            middle_values.append(average_x)
            past_x = x
    
    # Finding the gini for each split
    gini_values = np.zeros(len(middle_values))    
    
    for i, mid_val in enumerate(middle_values):
        smaller_idx = data < mid_val
        larger_idx = np.invert(smaller_idx)
        
        g = weighted_gini(target[smaller_idx], target[larger_idx])
        gini_values[i] = g 
    
    # Choosing the best split based on gini value
    print("Gini Values: ")
    print(gini_values)
    if len(gini_values) == 0:
        print(data)
    print(" ")
    best_split = middle_values[np.argmin(gini_values)]
    return best_split, np.min(gini_values)
    

In [196]:
# Used to find the optimal split within a discrete measurement
def split_disc_measures(data, target):
    data = np.array(data)
    
    distinct_data = np.unique(data)
    gini_values = np.zeros(len(distinct_data))
    
    for i, x in enumerate(distinct_data):
        match = data == x
        other = data != x
        
        match_t = target[match]
        other_t = target[other]
    
        gini_values[i] = weighted_gini(match_t, other_t) 
        
    best = np.argmin(gini_values)
    return distinct_data[best], gini_values[best]

In [197]:
def find_probability(targets):
    counts = Counter(targets).most_common()
    total = len(targets)
    results = []
    for key, val in counts:
        results.append([key, val/total])
    return results

In [236]:
class Node:
    
    def __init__(self, data, target, min_node_size = 5):
        self.data = data
        self.target = target
        self.node_gini = gini(self.target)
        self.node_type = "node"
        self.min_node_size = min_node_size
        self.branch_left = None
        self.branch_right = None 
        self.measurement_idx = None 
        self.split_value = None 
        self.split_method = None
    
    
                
    def train(self):
        if self.min_node_size >= len(self.data):
            self.node_type = "leaf"
        
        else:
            split_values = np.zeros(len(self.data.T))
            gini_values = np.zeros(len(self.data.T))
            methods = []

            for i, measure in enumerate(self.data.T):
                
                if identical_values(measure): 
                    gini_values[i] = 1 # Prevents the algorithm from trying to split a group of identical values

                # If measurement is discrete
                elif type(measure[0]) == np.str_:
                    split_values[i], gini_values[i] = split_disc_measures(measure, self.target)
                    methods.append("=")

                # If measurement is continuous
                else:
                    split_values[i], gini_values[i] = split_cont_measure(measure, self.target)
                    methods.append("<")

            best = self.measurement_idx = np.argmin(gini_values)
            self.split_value = split_values[best]
            self.split_method = methods[best]

            # If the split does not improve gini impurity -> make leaf node
            if gini_values[best] >= self.node_gini:
                self.node_type = "leaf"

            # Else split into two new nodes and repeat function 
            else: 
                if self.split_method == "=":
                    left_idx = self.data.T[best] == split_values[best]
                    right_idx = self.data.T[best] != split_values[best]
                else: 
                    left_idx = self.data.T[best] < split_values[best]
                    right_idx = self.data.T[best] >= split_values[best]
                
                
                # checking that neither of the branches are empty
                if len(left_idx) == 0 or len(right_idx) == 0:
                    self.node_type = "leaf"

                else:
                    self.branch_left = Node(self.data[left_idx], self.target[left_idx])
                    self.branch_right = Node(self.data[right_idx], self.target[right_idx])

                    self.branch_left.train()
                    self.branch_right.train()

                    
    
    
    
    def predict(self, query, style = "classification"):
        if self.node_type == "leaf":
            counter = Counter(self.target)

            if style == "regression": 
                return find_probability(self.target)

            else: # classification
                most_common = find_probability(self.target)[0][0]
                return most_common
            
        # If not end of branch
        else: 
            query_val = query[self.measurement_idx]
            
            if self.split_method == "=":
                if query_val == self.split_value:
                    return left_branch.predict(query, style = style)
                else: 
                    return right_branch.predict(query, style = style)
                
            else: 
                if query_val < self.split_value:
                    return self.branch_left.predict(query, style = style)
                else: 
                    return self.branch_right.predict(query, style = style)
    

In [237]:
dt = Node(x_train, y_train)
dt.train()

5.7
Number of data: 100
Gini Values: 
[ 0.65858586  0.64371134  0.63604167  0.61204301  0.60369565  0.59516484
  0.59269103  0.55561772  0.488       0.48444853  0.47195839  0.45392361
  0.46517569  0.48719192  0.5032      0.51131313  0.51873563  0.53356537
  0.53204991  0.54624591  0.57093333  0.5816317   0.59130604  0.58390244
  0.59963455  0.60740937  0.61333333  0.6010989   0.60891304  0.62404255
  0.63136842  0.63854167  0.64556701]
 
2.9
Number of data: 100
Gini Values: 
[ 0.65939394  0.65972509  0.65078341  0.63860806  0.6323588   0.61813008
  0.604       0.59469923  0.57149235  0.54591845  0.55966531  0.55413333
  0.55325     0.58992424  0.62336957  0.63858156  0.64371134  0.65122449
  0.65858586]
 
4.2
Number of data: 100
Gini Values: 
[ 0.65858586  0.64371134  0.61204301  0.5297619   0.45246753  0.41486486
  0.37408451  0.34492754  0.359375    0.38440285  0.40486111  0.41355641
  0.4213073   0.44740922  0.45402576  0.4532      0.44871795  0.44146538
  0.44506732  0.4245      0

In [238]:
predictions = np.zeros(len(x_test))
for i, x in enumerate(x_test):
    predictions[i] = dt.predict(x)
    

In [239]:
np.average(predictions == y_test)

0.97999999999999998

### Creating a Random Forest out of Decision Trees

In [240]:
def random_subset(x, num_data, replace = True):
    data_idx = np.arange(len(x))
    subset_idx = np.random.choice(data_idx, num_data, replace)
    subset_x = x[subset_idx]
    return subset_x

In [241]:
def data_feature_selection(x, y):
    # Bootstrapping the data
    num_data = len(x)
    data_idx = np.arange(num_data)
    subset_idx = np.random.choice(data_idx, num_data, replace = True)
    subset_x = x[subset_idx]
    subset_y = y[subset_idx]
    
    # Random selection of features
    num_features = len(x.T)
    feature_idx = np.arange(num_features)
    subset_idx = np.random.choice(feature_idx, int(np.sqrt(num_features)), replace = False)
    subset_x = x.T[subset_idx]
    
    return subset_x.T, subset_y

In [242]:
class Random_Forest:
    def __init__(self, data, target, min_node_size = 5, num_trees = 50):
        self.data = data
        self.target = target
        self.forest = []
        for tree in range(num_trees):
            data_subset, target_subset = data_feature_selection(self.data, self.target)
            self.forest.append(Node(data_subset, target_subset))

    def train(self):
        for tree in self.forest:
            tree.train()
    
    def predict(self, query_data, style = "classification"):
        self.predictions = np.arange(num_trees)
        for i, tree in enumerate(self.forest): 
            #self.predictions[i] = tree.predict(query_data, style)
            print(tree.predict(query_data, style))

In [243]:
a = np.arange(10)
b = np.ones(10)
print(a)
print(b)

print(a.min())
print(b.min())



[0 1 2 3 4 5 6 7 8 9]
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
0
1.0


In [244]:
print("a: {}".format(different_values(a)))
print("b: {}".format(different_values(b)))

0
a: True
1.0
b: False


In [245]:
trial = Random_Forest(x_train, y_train)
trial.train()

2.9
Number of data: 100
Gini Values: 
[ 0.65252525  0.65814433  0.65517665  0.65294261  0.65541528  0.65815718
  0.65653333  0.65400271  0.64871434  0.64662463  0.65391226  0.6576
  0.65625     0.65810606  0.65608696  0.65475177  0.64920962  0.64163265
  0.65010101]
 
1.3
Number of data: 100
Gini Values: 
[ 0.65814433  0.65659674  0.65555094  0.65820301  0.65733333  0.65769051
  0.65765766  0.65837747  0.65582923  0.65525492  0.65139573  0.65046154
  0.65179557  0.64684432  0.6551049   0.65387914  0.6454902   0.64133333
  0.63443223  0.64210526  0.65428571]
 
2.9
Number of data: 91
Gini Values: 
[ 0.65323565  0.66033966  0.65750916  0.65448643  0.65862709  0.65932393
  0.66028638  0.65760617  0.65381383  0.65697756  0.65998932  0.66049189
  0.65929048  0.65917837  0.65629551  0.65516053  0.6502664   0.64279541
  0.65177045]
 
1.3
Number of data: 91
Gini Values: 
[ 0.66033966  0.66019488  0.65967878  0.65910841  0.65860806  0.65986057
  0.66042966  0.65835447  0.65051955  0.64697666  0.

IndexError: list index out of range