Internet Resources:

[handson-ml/06_decision_trees.ipynb](https://github.com/ageron/handson-ml/blob/master/06_decision_trees.ipynb)  
[Sefik Ilkin Serengil - A Step by Step CART Decision Tree Example](https://sefiks.com/2018/08/27/a-step-by-step-cart-decision-tree-example/)  
[Google Developers - Tree Classifier from Scratch](https://www.youtube.com/watch?v=LDRbO9a6XPU&t=1s)  
[Victor Zhou - A Simple Explanation of Gini Impurity](https://victorzhou.com/blog/gini-impurity/)


Literature:  

Aurélien Géron, *Hands-On Machine Learning*, page 173

In [1]:
%matplotlib inline
from collections import Counter
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import pandas as pd
import numpy as np

# Fix the NumPy RNG seed so the shuffle below is reproducible between runs.
np.random.seed(42)

# Font sizes for axis labels and tick labels in the plots of this notebook.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Load the iris data and drop the two sepal columns.
# `columns=` is used instead of a positional `axis` argument: pandas 2.0 made
# every DataFrame.drop argument except `labels` keyword-only.
df = pd.read_csv("data/iris.csv").drop(columns=["sepal_width", "sepal_length"])

# Encode every species name as an integer id, numbered in order of first appearance.
label_mappings = {label_str: i for i, label_str in enumerate(df["species"].unique())}
# Series.map builds the integer column directly; DataFrame.replace would rely on
# an implicit object -> int downcast, which pandas 2.2 deprecates.
df["species"] = df["species"].map(label_mappings)

# Shuffle features and labels with one shared permutation so rows stay aligned.
permutation = np.random.permutation(df.index)
X = np.array(df.drop(columns=["species"]))[permutation]
y = np.array(df["species"])[permutation]

df.head()

Unnamed: 0,petal_length,petal_width,species
0,1.4,0.2,0
1,1.4,0.2,0
2,1.3,0.2,0
3,1.5,0.2,0
4,1.4,0.2,0


In [2]:
# only has the gini function
class Node_Base:
    """Common base class of the tree nodes; it only provides the Gini impurity."""

    def gini(self, distributions):
        """Return the Gini impurity of a list of per-class sample counts.

        ``distributions[k]`` is the number of samples that belong to class
        ``k``; for example ``[20, 30]`` means 20 samples of class 0 and 30
        samples of class 1. When the counts add up to zero (a node without
        samples) the impurity is reported as 0.
        """
        total = sum(distributions)
        if total == 0:
            return 0

        # Start from 1 and subtract the squared share of every class.
        remaining = 1
        for class_count in distributions:
            remaining -= (class_count / total) ** 2
        return remaining

    
# end of branch
class TreeNodeEnd(Node_Base):
    """Leaf node: answers every query with the majority class of its samples."""

    def __init__(self, labels, distribution, depth):
        """Store the leaf's majority label, class counts, depth and impurity.

        labels:       integer class ids of the samples that reached this leaf
        distribution: per-class sample counts for this leaf
        depth:        depth of the leaf inside the tree (used for indentation)
        """
        # Position of the largest bin count == most frequent label.
        label_counts = np.bincount(labels)
        self.label = int(label_counts.argmax())
        self.distribution = distribution
        self.depth = depth
        self.gini_score = self.gini(distribution)

    def predict(self, value):
        """Return the leaf's class; ``value`` is not inspected."""
        return self.label

    def print_tree(self):
        """Print the leaf's class, indented with one tab per depth level."""
        indent = "\t" * self.depth
        print(indent, "Sample is of class {}".format(self.label))
    
    
# decision node
class TreeNode(Node_Base):
    """Decision node of the tree.

    Constructing a node searches every (feature, unique value) pair for the
    split with the lowest weighted Gini impurity and then builds both child
    nodes recursively.

    Attributes set by the constructor:
        label           most frequent label among the node's samples
        depth           depth of the node inside the tree
        gini_score      weighted Gini impurity of the chosen split
        feature         column index the node asks about
        question_value  value the feature is compared against
        split_data      {True: ..., False: ...} samples per branch
        split_labels    {True: ..., False: ...} labels per branch
        distribution    {True: ..., False: ...} per-class counts per branch
        child           {True: ..., False: ...} child node per branch
    """

    @staticmethod
    def _is_numeric(value):
        """Return True when ``value`` is compared by threshold, not by equality."""
        # np.number is needed in addition to int/float: NumPy scalars such as
        # np.int64 or np.float32 are not subclasses of the built-in types.
        return isinstance(value, (int, float, np.number))

    def ask(self, question_value, ask_value):
        """Decision function of the node.

        Numeric values answer "is ask_value >= question_value?", every other
        value answers "is ask_value == question_value?". Returns a plain bool.
        """
        if self._is_numeric(ask_value):
            return bool(ask_value >= question_value)
        return bool(ask_value == question_value)

    def partition(self, qValue, column, data, labels, unique_label_count):
        """Split the samples into a True and a False group.

        Every sample is sent through ``self.ask`` with ``qValue`` and its value
        in feature ``column``.

        Returns a tuple ``(split_data, split_labels, distrb, gini_score)``:
        the first two are dicts keyed by True/False holding NumPy arrays,
        ``distrb`` holds the per-class counts of each group (lists of length
        ``unique_label_count``) and ``gini_score`` is the Gini impurity of both
        groups weighted by their share of the samples.
        """
        split_data = {True: [], False: []}
        split_labels = {True: [], False: []}
        distrb = {True: [0] * unique_label_count, False: [0] * unique_label_count}

        column_values = data.T[column]  # looked up once instead of per sample
        for i, feature_value in enumerate(column_values):
            branch = self.ask(qValue, feature_value)
            split_data[branch].append(data[i])
            split_labels[branch].append(labels[i])
            distrb[branch][labels[i]] += 1

        # gini score is the weighted sum of both branches
        n_samples = len(data)
        if n_samples:
            gini_score = (sum(distrb[True]) / n_samples * self.gini(distrb[True])) + (sum(distrb[False]) / n_samples * self.gini(distrb[False]))
        else:
            gini_score = 0  # nothing to split, avoid dividing by zero

        split_data = {x: np.array(split_data[x]) for x in split_data}
        split_labels = {x: np.array(split_labels[x]) for x in split_labels}
        return split_data, split_labels, distrb, gini_score

    def __init__(self, data, labels, unique_label_count, max_depth, depth=0):
        """Pick the best split for ``data``/``labels`` and build both children.

        data:               2-D array, one row per sample, one column per feature
        labels:             non-negative integer class id of every sample
        unique_label_count: length of the per-class count lists
        max_depth:          children created at this depth are always leaves
        depth:              depth of this node (0 for the root)

        Raises ValueError when there is no sample or no feature to ask about.
        """
        self.label = int(np.bincount(labels).argmax())  # most frequently occurring label
        self.depth = depth

        # get only unique feature values
        unique_column_values = {i: list(np.unique(column)) for i, column in enumerate(data.T)}

        # Iterate through every feature value and keep the one that produces
        # the lowest gini score. Splits that leave one branch empty separate
        # nothing; they are only remembered as a fallback so that the node can
        # still be built when no real split exists (for example when all rows
        # are identical). Accepting them as regular candidates would recurse
        # on unchanged data and end in a leaf without any labels.
        best_gini = 1
        best = None
        fallback = None
        for column, unique_values in unique_column_values.items():
            for unique_value in unique_values:
                # in self.partition unique_value is passed to self.ask as question_value
                candidate = self.partition(unique_value, column, data, labels, unique_label_count)
                candidate_labels, candidate_gini = candidate[1], candidate[3]
                if len(candidate_labels[True]) == 0 or len(candidate_labels[False]) == 0:
                    if fallback is None:
                        fallback = (unique_value, column, candidate)
                    continue
                if candidate_gini < best_gini:
                    best_gini = candidate_gini
                    best = (unique_value, column, candidate)

        has_real_split = best is not None
        if not has_real_split:
            if fallback is None:
                raise ValueError("TreeNode needs at least one sample and one feature")
            best = fallback

        self.question_value, self.feature, chosen = best
        self.split_data, self.split_labels, self.distribution, self.gini_score = chosen

        # recursive binary splitting
        self.child = {True: None, False: None}
        for branch in (True, False):
            branch_labels = self.split_labels[branch]
            branch_distribution = self.distribution[branch]
            # at most one class still has samples in this branch
            is_pure = branch_distribution.count(0) >= unique_label_count - 1
            if not has_real_split or depth + 1 == max_depth or is_pure:
                # End of recursion. An empty branch has no labels of its own,
                # so its leaf votes with the labels of this node instead.
                leaf_labels = branch_labels if len(branch_labels) else labels
                self.child[branch] = TreeNodeEnd(leaf_labels, branch_distribution, depth + 1)
            else:
                self.child[branch] = TreeNode(self.split_data[branch], branch_labels, unique_label_count, max_depth, depth + 1)

    def predict(self, value):
        """Send ``value`` down the branch chosen by ``ask`` and return that child's prediction."""
        branch = self.ask(self.question_value, value[self.feature])
        return self.child[branch].predict(value)

    def print_tree(self):
        """Print the node's question and both subtrees, indented by depth."""
        # Same numeric test as in ask(), so the printed condition matches the
        # comparison that is actually performed.
        cond = "is greater or equal to" if self._is_numeric(self.question_value) else "is equal to"
        print("\t"*self.depth, "if feature {} {} {}:".format(self.feature, cond, self.question_value))
        self.child[True].print_tree()
        print("\t"*self.depth, "else:")
        self.child[False].print_tree()
        
        
        
class DescisionTreeClassifier:
    """Decision tree classifier that delegates all work to a tree of nodes.

    ``fit`` builds the tree from a ``TreeNode`` root; ``predict`` and
    ``print_tree`` forward to that root and therefore need ``fit`` to have
    been called first.
    """

    def __init__(self, max_depth):
        """Remember ``max_depth``; it is handed to the root node in ``fit``."""
        self.max_depth = max_depth

    def fit(self, data, labels):
        """Build the tree (recursively, inside ``TreeNode``) from the training set.

        data:   2-D array-like, one row per sample, one column per feature
        labels: non-negative integer class id of every sample
        """
        # Accept plain lists as well as NumPy arrays.
        data = np.asarray(data)
        labels = np.asarray(labels)
        # The nodes keep one counter per class and index it with the label
        # value itself, so the counter list must reach up to the largest
        # label. For labels 0..k-1 this equals the number of distinct labels.
        unique_label_count = int(labels.max()) + 1
        self.root = TreeNode(data, labels, unique_label_count, self.max_depth)

    def predict(self, sample):
        """Return the predicted class of ``sample`` (recursive walk from the root)."""
        return self.root.predict(sample)

    def print_tree(self):
        """Print the decision questions of the tree (recursive walk from the root)."""
        self.root.print_tree()
    
    
    
# Train a tree limited to max_depth=2 on the shuffled petal features, print
# the learned decision rules, then classify one sample. The sample is given
# in the column order of X: [petal_length, petal_width].
clf = DescisionTreeClassifier(max_depth=2)
clf.fit(X, y)
clf.print_tree()
# Last expression of the cell, so the notebook displays the predicted class.
clf.predict([3,2])

 if feature 0 is greater or equal to 3.0:
	 if feature 1 is greater or equal to 1.8:
		 Sample is of class 2
	 else:
		 Sample is of class 1
 else:
	 Sample is of class 0


2