* Pure Python implementation of decision trees

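* A decision tree is a binary tree that recursively splits a dataset until it is left with pure leaf nodes (or a stopping criterion such as maximum depth is reached).
* It's a greedy algorithm: at every node it picks the split with the highest information gain for that node alone, without looking ahead.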

$$
H(S) = - \sum_{i=1}^{n} p(x_i) \log_2 p(x_i)
$$

$$
IG(T, A) = H(T) - \sum_{v \in \text{Values}(A)} \frac{|T_v|}{|T|} H(T_v)
$$

In [None]:
# 1. Select a feature
# 2. Select a candidate threshold from that feature's range (e.g. randomly, or by trying every unique value)
# 3. Calculate entropy (impurity of the sample)
#    a. Proportion of +(ve) examples -> p = count(samples where target == 1) / count(all samples)
#    b. Proportion of -(ve) examples -> n = 1 - p
#    c. entropy (binary) = -p*log2(p) - n*log2(n)
#    d. entropy (general) = sum over classes i of -P_i*log2(P_i)
# 4. Split the data on the feature/threshold with the best entropy reduction and repeat step 3 for each side (binary tree)

#### Doubts

* During get_best_split(), should we only consider splits whose sides each have len > min_samples_split?
    * 
* During get_best_split(), should we only consider splits whose sides each have len > 0 (i.e. neither side is empty)?
    * 

In [4]:
import math
from sklearn.datasets import load_iris
import pandas as pd

In [3]:
# Load the Iris dataset
iris = load_iris()

# Convert to a pandas DataFrame for easier analysis
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['target'] = iris.target

# Display the first few rows
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


### Decision Tree Classifier

In [11]:
class Node():
    
    def __init__(self,feature = None, threshold = None, \
                 left = None, right = None, info_gain = None, value = None):
        
        # For decision nodes
        self.feature = feature      # feature name or index
        self.threshold = threshold  # split value
        self.left = left            # left child -> create by calling Node.left = Node
        self.right = right          
        self.info_gain = info_gain  # IG of Node compared to its children
        
        # For leaf node
        self.value = value          # This is only for leaf, its the value of 
                                    # the majority class
        
class DecisionTreeClassifier():
    
    def __init__(self,min_samples_split = 2, max_depth = 2):
        
        self.root = None
        
        self.min_samples_split = min_samples_split # The min number of samples in a node to be treated as leaf
        self.max_depth = max_depth                 # max depth from root 
    
    def build_tree(self, dataset, current_depth = 0):
        '''recursive function to build the binary tree'''
        
        X, y = dataset.iloc[:,:-1], dataset.iloc[:,-1]
        
        best_split = get_best_split(dataset)
        if best_split['gain'] > 0:
            left_subtree = self.build_tree(best_split['dataset_left'],current_dept+1)
            right_subtree = self.build_tree(best_split['dataset_right'],current_dept+1)
            
            return Node(best_split['feature'],best_split['threshold'],\
                       best_split['dataset_left'], best_split['dataset_right'],\
                       best_split['gain'])
        
        leaf_value = calculate_leaf(y)
        
        return Node(value = leaf_value)
            
        
        
        
        

In [9]:
def get_best_split(X:pd.DataFrame, y:pd.Series) -> dict:
    
    split = {'feature': None,'threshold': None,\
             'gain': -math.inf,'dataset_left': None,'dataset_right': None}
    X, y = dataset.iloc[:,:-1],dataset.iloc[:,-1]
    
    for feature in X.columns:
        unique_values = dataset[feature].unique()
        for threshold in unique_values:
            left_y,right_y = y[dataset[feature] <= threshold], y[dataset[feature] > threshold]
            ig = info_gain(y,left_y,right_y)
            if ig > split['gain']:
                split['feature'] = feature
                split['threshold'] = float(threshold)
                split['gain'] = ig
                
    split['dataset_left'] = dataset[dataset[split['feature']] <= split['threshold']]
    split['dataset_right'] = dataset[dataset[split['feature']] > split['threshold']]
    
    return split
            
split = get_best_split(iris_df)    

In [25]:
def info_gain(y:pd.Series, left_y:pd.Series, right_y:pd.Series) -> float:
    
    entropy_y = entropy(y)
    entropy_l = entropy(left_y)
    entropy_r = entropy(right_y)
    entropy_child = ((len(left_y)/len(y)) * entropy_l) + ((len(right_y)/len(y)) * entropy_r)
    
    return entropy_y - entropy_child
    
    
    

In [8]:
def entropy(y:pd.Series) -> float:
    
    prob = y.value_counts(normalize = True).reset_index()
    
    return sum([-p * math.log2(p) for p in prob.proportion])
    

In [45]:
def calculate_leaf(y:pd.Series) -> float:
    
    y = list(y)
    
    return max(y, key = y.count)

### Akhil's version

In [69]:
class Node():
    
    ## Give to DecisionTree class
    min_samples_split = 10
    max_depth = 10
    
    def __init__(self,X: pd.DataFrame = None, y: pd.Series = None ,\
                 feature: str = None, threshold: float = None , \
                 left: Node = None , right: Node = None, \
                 info_gain: float = -math.inf, value: float = None, current_depth: int = 0):
        
        self.X = X
        self.y = y
        self.feature = feature      
        self.threshold = threshold  
        self.left = left            
        self.right = right          
        self.info_gain = info_gain  
        self.value = value
        self.current_depth = current_depth
        
    def build_tree(self):

        
        if len(self.y) >= Node.min_samples_split and self.current_depth <= Node.max_depth:
            
            left_X, left_y, right_X, right_y = self.get_best_split()
            self.left = Node(X = left_X, y = left_y, current_depth = self.current_depth + 1)
            self.right = Node(X = right_X, y = right_y, current_depth = self.current_depth + 1)
            self.left.build_tree()
            self.right.build_tree()
            
        else:
            #print(self.y)
            self.value = calculate_leaf(self.y)
        
        
        
    def get_best_split(self):

        X, y = self.X, self.y


        for feature in X.columns:
            unique_values = X[feature].unique()
            for threshold in unique_values:
                left_y,right_y = y[X[feature] <= threshold], y[X[feature] > threshold]
                # Implementing a check for null left or right
                if len(left_y) > 0 and len(right_y) > 0:
                    ig = info_gain(y,left_y,right_y)
                    if ig > self.info_gain:
                        self.feature = feature
                        self.threshold = threshold
                        self.info_gain = ig

        left_X = X[X[self.feature] <= self.threshold]
        left_y = y[X[self.feature] <= self.threshold]
        right_X = X[X[self.feature] > self.threshold]
        right_y = y[X[self.feature] > self.threshold]
        

        return left_X, left_y, right_X, right_y

In [32]:
X = iris_df.iloc[:,:-1]
y = iris_df.iloc[:,-1]
best_split = get_best_split(X, y)

In [71]:
root = Node(X = X, y = y)
# root.get_best_split()
root.build_tree()