In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets
from collections import Counter
import pandas as pd

In [2]:
iris = datasets.load_iris()
X = iris.data
Y = iris.target
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, random_state = 42)

In [3]:
heart = pd.read_csv("heart.csv")
print(heart.head(3))
heart_data = heart.values

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  


In [4]:
np.shape(heart_data)
X = heart_data[:, :-1]
Y = heart_data[:, -1]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, random_state = 42)

### Using the Decision Tree algorithm I built previously as the foundation

Gini Impurity

In [5]:
def gini(x): 
    counter = Counter(x)
    total = len(x)
#    G = 0
#    for value in counter.values():
#        prob = value/total
#        G += prob * (1 - prob)
#    return G
    GI = 1 
    for value in counter.values():
        prob = value/total 
        GI -= prob**2
    return GI


def weighted_gini(a, b):
    return (len(a) * gini(a) + len(b) * gini(b))/(len(a) + len(b))

def information_gain(original_data, data_left, data_right):
    
    pL = len(data_left)/len(original_data)
    pR = len(data_right)/len(original_data)
    
    return gini(original_data) - pL * gini(data_left) - pR * gini(data_right)  

In [6]:
def identical_values(x):
    test_val = x[0]
    if all(x == test_val):
        return True
    else:
        return False

In [7]:
# Used to find the optimal split within a continuous measurement

def split_cont_measure(data, target):
    data = np.array(data)
    
    # sorting the data to find the mid values
    idx = np.argsort(data)
    sorted_data = data[idx]
    
    past_x = sorted_data[0]
    middle_values = [] 
    for x in sorted_data: 
        if x != past_x:
            average_x = 0.5 * (x + past_x)
            middle_values.append(average_x)
            past_x = x
    
    # Finding the gini for each split
    info_gain = np.zeros(len(middle_values))    
    
    for i, mid_val in enumerate(middle_values):
        smaller_idx = data < mid_val
        larger_idx = np.invert(smaller_idx)
        
        #g = weighted_gini(target[smaller_idx], target[larger_idx])
        info_gain[i] = information_gain(target, target[smaller_idx], target[larger_idx]) 
    
    # Choosing the best split based on gini value
    best_split = middle_values[np.argmax(info_gain)]
    return best_split, np.max(info_gain)
    

In [8]:
# Used to find the optimal split within a discrete measurement
def split_disc_measures(data, target):
    data = np.array(data)
    
    distinct_data = np.unique(data)
    info_gain = np.zeros(len(distinct_data))
    
    for i, x in enumerate(distinct_data):
        match = data == x
        other = data != x
        
        match_t = target[match]
        other_t = target[other]
    
        info_gain[i] = information_gain(target, match_t, other_t) 
        
    best = np.argmax(info_gain)
    return distinct_data[best], info_gain[best]

In [9]:
def find_probability(targets):
    counts = Counter(targets).most_common()
    total = len(targets)
    results = []
    for key, val in counts:
        results.append([key, val/total])
    return results

In [10]:
def mode(x):
    return find_probability(x)[0][0]

In [11]:
class Node:
    
    def __init__(self, data, target, min_node_size = 5):
        self.data = data
        self.target = target
        self.node_gini = gini(self.target)
        self.node_type = "node"
        self.min_node_size = min_node_size
        self.branch_left = None
        self.branch_right = None 
        self.measurement_idx = None 
        self.split_value = None 
        self.split_method = None
                
    def train(self):
        if self.min_node_size >= len(self.data):
            self.node_type = "leaf"
        
        else:
            split_values = np.zeros(len(self.data.T))
            info_gain = np.zeros(len(self.data.T))
            methods = []

            for i, measure in enumerate(self.data.T):
                
                if identical_values(measure): 
                    info_gain[i] = 0 # Prevents the algorithm from trying to split a group of identical values
                    methods.append("FalseSplit")
                
                # If measurement is discrete
                elif type(measure[0]) == np.str_:
                    split_values[i], info_gain[i] = split_disc_measures(measure, self.target)
                    methods.append("=")

                # If measurement is continuous
                else:
                    split_values[i], info_gain[i] = split_cont_measure(measure, self.target)
                    methods.append("<")
            
            best = self.measurement_idx = np.argmax(info_gain)
            self.split_value = split_values[best]
            self.split_method = methods[best]

            # If the split does not provide any more information -> make leaf node
            if info_gain[best] == 0: 
                self.node_type = "leaf"

            # Else split into two new nodes and repeat function 
            else: 
                if self.split_method == "=":
                    left_idx = self.data.T[best] == split_values[best]
                    right_idx = self.data.T[best] != split_values[best]
                else: 
                    left_idx = self.data.T[best] < split_values[best]
                    right_idx = self.data.T[best] >= split_values[best]
                
                
                # checking that neither of the branches are empty
                if len(left_idx) == 0 or len(right_idx) == 0:
                    self.node_type = "leaf"

                else:
                    self.branch_left = Node(self.data[left_idx], self.target[left_idx])
                    self.branch_right = Node(self.data[right_idx], self.target[right_idx])

                    self.branch_left.train()
                    self.branch_right.train()
    
    
    def predict(self, query, style = "c"):
        if self.node_type == "leaf":
            counter = Counter(self.target)

            if style == "r": 
                return find_probability(self.target)

            else: # classification
                most_common = mode(self.target)
                return most_common
            
        # If not end of branch
        else: 
            query_val = query[self.measurement_idx]
            
            if self.split_method == "=":
                if query_val == self.split_value:
                    return left_branch.predict(query, style = style)
                else: 
                    return right_branch.predict(query, style = style)
                
            else: 
                if query_val < self.split_value:
                    return self.branch_left.predict(query, style = style)
                else: 
                    return self.branch_right.predict(query, style = style)
    

In [12]:
dt = Node(x_train, y_train)
dt.train()
predictions = np.zeros(len(x_test))
for i, x in enumerate(x_test):
    predictions[i] = dt.predict(x)
np.average(predictions == y_test)

0.75

### Creating a Random Forest out of Decision Trees

In [33]:
def data_subsample(x, y):
    # Bootstrapping the data
    num_data = len(x)
    data_idx = np.arange(num_data)
    subset_idx = np.random.choice(data_idx, num_data, replace = True)
    subset_x = x[subset_idx]
    subset_y = y[subset_idx]
    
    return subset_x, subset_y

def feature_selection(x):
    # Random selection of features
    num_features = len(x.T)
    feature_idx = np.arange(num_features)
    subset_idx = np.random.choice(feature_idx, int(np.sqrt(num_features)), replace = False)
    x = x.T[subset_idx].T
    return x

In [39]:
a = np.arange(45).reshape(9, 5)
print(a)

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]
 [25 26 27 28 29]
 [30 31 32 33 34]
 [35 36 37 38 39]
 [40 41 42 43 44]]


In [34]:
def initialise_forest(data, target, min_node_size, num_trees):
    forest = []
    for tree in range(num_trees):
        data_subset, target_subset = data_subsample(data, target)
        data_subset = feature_selection(data_subset)
        forest.append(Node(data_subset, target_subset, min_node_size))
    return forest 

In [35]:
class Random_Forest:
    def __init__(self, data, target, min_node_size = 1, num_trees = 100):
        self.data = data
        self.target = target
        self.num_trees = num_trees
        self.forest = initialise_forest(data, target,  min_node_size, num_trees)

    def train(self):
        for tree in self.forest:
            tree.train()
    
    def predict(self, query_data, style = "c"):
        predictions = np.arange(self.num_trees)

        for i, tree in enumerate(self.forest): 
            pred = tree.predict(query_data, style)
            predictions[i] = pred 
            
        most_common = mode(predictions)
        return most_common
        

In [36]:
trial = Random_Forest(x_train, y_train, num_trees = 100)
trial.train()

In [37]:
trial_predictions = np.zeros(len(x_test))
for i, query in enumerate(x_test):
    trial_predictions[i] = trial.predict(query)

In [38]:
success = np.average(trial_predictions == y_test)
print(success)

0.42
