* Pure python implementation of decision trees

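* A decision tree (as implemented here) is a binary tree that recursively splits a dataset until it is left with pure leaf nodes or a stopping condition is met.
* It's a greedy algorithm: at each node it picks the split with the highest information gain, without revisiting earlier splits.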

$$
H(S) = - \sum_{i=1}^{n} p(x_i) \log_2 p(x_i)
$$

$$
IG(T, A) = H(T) - \sum_{v \in \text{Values}(A)} \frac{|T_v|}{|T|} H(T_v)
$$

In [None]:
# 1. Select a feature
# 2. Select a candidate threshold from that feature's range of values
#    (the implementation below tries every unique value of the feature)
# 3. Calculate entropy (impurity of the sample)
#    a. Proportion of +(ve) examples -> p = count(rows where target == 1) / count(all rows)
#    b. Proportion of -(ve) examples -> n = 1 - p
#    c. entropy (binary) = -p*log2(p) - n*log2(n)
#    d. entropy (general) = sum over classes i of -p_i * log2(p_i)
# 4. Split the samples on the feature/threshold and repeat step 3 for each side of the split (binary tree)

### Doubts

#### What are the stopping conditions for splitting?

* Maximum Depth Reached:

    * The tree is allowed to grow only up to a predefined maximum depth. Once this limit is reached, the node is not split further.
* Minimum Samples for a Split:

    * A node is split only if it contains more than a certain number of samples. If the number of samples in a node falls below this threshold, the split will not occur.
* Minimum Samples in a Leaf Node:

    * After splitting, the resulting child nodes must have at least a minimum number of samples. If this condition is violated, no split occurs.
* Impurity Improvement Below a Threshold:

    * Splitting is stopped when the improvement in the chosen impurity metric (like Gini Impurity or Entropy) is less than a predefined threshold. If the split does not significantly improve the purity of the node, it is not performed.
* All Features Used:

    * If all features have been used up or if no further features provide better splits, the process stops.
* Pure Nodes:

    * If all the samples in a node belong to the same class, the node is considered "pure," and further splitting is unnecessary.


In [5]:
import math
from sklearn.datasets import load_iris
import pandas as pd

In [6]:
# Fetch the Iris dataset that ships with scikit-learn
iris = load_iris()

# Wrap the measurements in a DataFrame (one column per feature name) and
# append the labels from `iris.target` as a final `target` column
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Last expression of the cell, so the notebook renders the frame
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


### Decision Tree Classifier

In [11]:
# class Node():
    
#     def __init__(self,feature = None, threshold = None, \
#                  left = None, right = None, info_gain = None, value = None):
        
#         # For decision nodes
#         self.feature = feature      # feature name or index
#         self.threshold = threshold  # split value
#         self.left = left            # left child -> create by calling Node.left = Node
#         self.right = right          
#         self.info_gain = info_gain  # IG of Node compared to its children
        
#         # For leaf node
#         self.value = value          # This is only for leaf, its the value of 
#                                     # the majority class
        
# class DecisionTreeClassifier():
    
#     def __init__(self,min_samples_split = 2, max_depth = 2):
        
#         self.root = None
        
#         self.min_samples_split = min_samples_split # The min number of samples in a node to be treated as leaf
#         self.max_depth = max_depth                 # max depth from root 
    
#     def build_tree(self, dataset, current_depth = 0):
#         '''recursive function to build the binary tree'''
        
#         X, y = dataset.iloc[:,:-1], dataset.iloc[:,-1]
        
#         best_split = get_best_split(dataset)
#         if best_split['gain'] > 0:
#             left_subtree = self.build_tree(best_split['dataset_left'],current_dept+1)
#             right_subtree = self.build_tree(best_split['dataset_right'],current_dept+1)
            
#             return Node(best_split['feature'],best_split['threshold'],\
#                        best_split['dataset_left'], best_split['dataset_right'],\
#                        best_split['gain'])
        
#         leaf_value = calculate_leaf(y)
        
#         return Node(value = leaf_value)
            
        
        
        
        

In [9]:
# def get_best_split(X:pd.DataFrame, y:pd.Series) -> dict:
    
#     split = {'feature': None,'threshold': None,\
#              'gain': -math.inf,'dataset_left': None,'dataset_right': None}
#     X, y = dataset.iloc[:,:-1],dataset.iloc[:,-1]
    
#     for feature in X.columns:
#         unique_values = dataset[feature].unique()
#         for threshold in unique_values:
#             left_y,right_y = y[dataset[feature] <= threshold], y[dataset[feature] > threshold]
#             ig = info_gain(y,left_y,right_y)
#             if ig > split['gain']:
#                 split['feature'] = feature
#                 split['threshold'] = float(threshold)
#                 split['gain'] = ig
                
#     split['dataset_left'] = dataset[dataset[split['feature']] <= split['threshold']]
#     split['dataset_right'] = dataset[dataset[split['feature']] > split['threshold']]
    
#     return split
            
# split = get_best_split(iris_df)    

In [13]:
def info_gain(y: pd.Series, left_y: pd.Series, right_y: pd.Series, node: 'Node' = None) -> float:
    """Return the information gain of splitting ``y`` into ``left_y`` and ``right_y``.

    The gain is the entropy of the parent labels minus the size-weighted
    average entropy of the two children:

        IG = H(y) - (|left_y| / |y|) * H(left_y) - (|right_y| / |y|) * H(right_y)

    Args:
        y: Labels of the parent node.
        left_y: Labels sent to the left child.
        right_y: Labels sent to the right child.
        node: Unused; kept (now optional) so existing four-argument calls
            keep working.

    Returns:
        The information gain, or ``0.0`` when ``y`` is empty.
    """
    n_parent = len(y)
    if n_parent == 0:
        # The child weights would divide by zero; an empty parent has
        # nothing to gain from a split.
        return 0.0

    w_left = len(left_y) / n_parent
    w_right = len(right_y) / n_parent
    entropy_children = w_left * entropy(left_y) + w_right * entropy(right_y)

    return entropy(y) - entropy_children
    
    
    

In [14]:
def entropy(y: pd.Series) -> float:
    """Return the Shannon entropy (base 2) of the labels in ``y``.

    Computes ``sum(-p * log2(p))`` over the relative frequency ``p`` of each
    distinct label.

    Args:
        y: Series of class labels.

    Returns:
        The entropy in bits; ``0.0`` for an empty or single-class series.
    """
    # Iterate the normalized counts directly instead of going through
    # reset_index() and a column called `proportion`: that column name
    # depends on the pandas version and collides when `y` itself is named
    # "proportion".
    proportions = y.value_counts(normalize=True)

    # Categorical series report unobserved categories with p == 0, and
    # log2(0) is undefined; such classes contribute nothing to the entropy.
    return float(sum(-p * math.log2(p) for p in proportions if p > 0))
    

In [15]:
def calculate_leaf(y: pd.Series) -> float:
    """Return the most frequent label in ``y`` (the value a leaf predicts).

    When several labels share the highest count, the one that appears first
    in ``y`` is returned.

    Args:
        y: Labels of the samples that reached the leaf.

    Returns:
        The majority label, as it appears in ``y``.

    Raises:
        ValueError: If ``y`` is empty.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    from collections import Counter

    labels = list(y)
    if not labels:
        raise ValueError("calculate_leaf() requires at least one label")

    # One counting pass instead of calling list.count() once per element.
    # Counter keeps first-seen order, so ties resolve to the earliest label.
    return Counter(labels).most_common(1)[0][0]

In [156]:
# NOTE(review): `root` is not defined in any cell shown in this notebook --
# presumably left over from an earlier session; confirm before re-running.
root.right.right.value

np.int64(2)

### Akhil's version

In [219]:
class Node():
    """A single node of the decision tree.

    The node stores the samples that reached it (``X``, ``y``), its position
    in the tree (``current_depth``, ``path``, ``left``, ``right``), the
    chosen split (``feature``, ``threshold``) together with the entropy
    bookkeeping of that split, and ``value``, which is left as ``None``
    unless the node is a leaf.
    """

    def __init__(self, X: pd.DataFrame = None, y: pd.Series = None,
                 feature: str = None, threshold: float = None,
                 left: 'Node' = None, right: 'Node' = None,
                 entropy_node: float = None, entropy_left_child: float = None,
                 length_node: int = None, length_left_child: int = None, length_right_child: int = None,
                 w_left: float = None, w_right: float = None,
                 entropy_right_child: float = None, entropy_children: float = None, path: str = None,
                 max_info_gain: float = -math.inf, value: float = None, current_depth: int = 0,
                 info_gain: float = None):
        """Store the given attributes on the node.

        Every argument is optional and is stored under the attribute of the
        same name. ``info_gain`` is appended at the end of the signature so
        existing positional calls are unaffected.
        """
        # Basic node params
        self.X = X                          # samples that reached this node
        self.y = y                          # labels of those samples
        self.feature = feature              # column used for the split
        self.threshold = threshold          # split value for `feature`
        self.left = left
        self.right = right
        self.current_depth = current_depth
        self.path = path

        # Information-gain params
        self.entropy_node = entropy_node
        self.entropy_left_child = entropy_left_child
        self.entropy_right_child = entropy_right_child
        self.entropy_children = entropy_children
        self.length_node = length_node
        self.length_left_child = length_left_child
        self.length_right_child = length_right_child
        self.w_left = w_left
        self.w_right = w_right
        # Previously this read an undeclared name, so it picked up whatever
        # module-level `info_gain` existed (or raised NameError); it is now
        # a regular constructor argument.
        self.info_gain = info_gain
        self.max_info_gain = max_info_gain  # best gain found so far

        # Leaf params
        self.value = value
        
        
#     def build_tree(self):

        
#         if len(self.y) >= self.min_samples_split and self.current_depth <= self.max_depth:
            
#             left_X, left_y, right_X, right_y = self.get_best_split()
#             if self.info_gain > self.min_info_gain :
#                 self.left = Node(X = left_X, y = left_y, current_depth = self.current_depth + 1)
#                 self.right = Node(X = right_X, y = right_y, current_depth = self.current_depth + 1)
#                 self.left.build_tree()
#                 self.right.build_tree()
#             else:
#                 self.value = self.y.mode()[0]
            
#         else:
        
#             self.value = self.y.mode()[0]
            



In [237]:
class DecisionTreeClassifier():
    """Binary decision-tree classifier built on entropy / information gain.

    A node is split on the (feature, threshold) pair with the highest
    information gain. Growth stops when a node has fewer than
    ``min_samples_split`` samples, when it sits at ``max_depth``, or when the
    best gain does not exceed ``min_info_gain``; such nodes become leaves
    whose ``value`` is the mode of their labels.
    """

    def __init__(self, min_samples_split=3, max_depth=15, min_info_gain=.001):
        """Store the stopping criteria.

        Args:
            min_samples_split: Minimum number of samples a node needs in
                order to be considered for splitting.
            max_depth: Depth (root is 0) at which nodes are no longer split.
            min_info_gain: A split is kept only if its gain is strictly
                greater than this value.
        """
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.min_info_gain = min_info_gain

        self.root = None

    def build_tree(self, node):
        """Recursively grow the subtree below ``node``, modifying it in place.

        Either attaches ``left``/``right`` children to ``node`` and recurses
        into them, or marks ``node`` as a leaf by setting ``node.value``.
        """
        # Strict `<`: a node sitting at max_depth must not be split, otherwise
        # the tree ends up one level deeper than requested.
        splittable = (len(node.y) >= self.min_samples_split
                      and node.current_depth < self.max_depth)

        if splittable:
            left_X, left_y, right_X, right_y = self.get_best_split(node)
            # max_info_gain stays at its initial value when no valid split
            # exists, in which case the node falls through to a leaf.
            if node.max_info_gain > self.min_info_gain:
                child_depth = node.current_depth + 1
                node.left = Node(left_X, left_y, current_depth=child_depth, path='left')
                node.right = Node(right_X, right_y, current_depth=child_depth, path='right')
                self.build_tree(node.left)
                self.build_tree(node.right)
                return

        # Leaf: predict the most common label among the node's samples.
        node.value = node.y.mode()[0]

    def get_best_split(self, node):
        """Find the best (feature, threshold) split for ``node``.

        Every unique value of every column of ``node.X`` is tried as a
        threshold (``<=`` goes left, ``>`` goes right). The best candidate and
        its entropy bookkeeping are recorded on ``node``.

        Returns:
            ``(left_X, left_y, right_X, right_y)`` for the best split, or
            ``(None, None, None, None)`` when ``node.feature`` is still
            ``None`` because no threshold separates the samples into two
            non-empty groups.
        """
        X, y = node.X, node.y

        # The parent entropy is identical for every candidate split, so
        # compute it once here; info_gain() reuses it.
        node.entropy_node = self.entropy(y)

        for feature in X.columns:
            column = X[feature]
            for threshold in column.unique():
                left_y, right_y = y[column <= threshold], y[column > threshold]

                # A threshold that leaves one side empty is not a split.
                if len(left_y) == 0 or len(right_y) == 0:
                    continue

                (entropy_left, entropy_right, length_node, length_left,
                 length_right, w_left, w_right, entropy_children,
                 gain) = self.info_gain(left_y, right_y, node)

                if gain > node.max_info_gain:
                    node.max_info_gain = gain
                    node.feature = feature
                    node.threshold = threshold
                    node.entropy_left_child = entropy_left
                    node.entropy_right_child = entropy_right
                    node.length_node = length_node
                    node.length_left_child = length_left
                    node.length_right_child = length_right
                    node.w_left = w_left
                    node.w_right = w_right
                    node.entropy_children = entropy_children

        if node.feature is None:
            # No usable split (e.g. all feature values identical); indexing
            # X with a None feature would raise.
            return None, None, None, None

        goes_left = X[node.feature] <= node.threshold
        goes_right = X[node.feature] > node.threshold

        return X[goes_left], y[goes_left], X[goes_right], y[goes_right]

    def info_gain(self, left_y: pd.Series, right_y: pd.Series, node: 'Node') -> tuple:
        """Compute the information gain of one candidate split of ``node``.

        ``node.entropy_node`` is reused when it is already set and computed
        from ``node.y`` (and stored on the node) otherwise.

        Returns:
            A 9-tuple ``(entropy_left, entropy_right, length_node,
            length_left, length_right, w_left, w_right, entropy_children,
            gain)``.
        """
        if node.entropy_node is None:
            node.entropy_node = self.entropy(node.y)

        # Use the class's own entropy so the method does not depend on a
        # module-level function of the same name being defined.
        entropy_left = self.entropy(left_y)
        entropy_right = self.entropy(right_y)

        length_node = len(node.y)
        length_left = len(left_y)
        length_right = len(right_y)
        w_left = length_left / length_node
        w_right = length_right / length_node

        entropy_children = w_left * entropy_left + w_right * entropy_right
        gain = node.entropy_node - entropy_children

        return (entropy_left, entropy_right, length_node, length_left,
                length_right, w_left, w_right, entropy_children,
                gain)

    def entropy(self, y: pd.Series) -> float:
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        # Iterate the normalized counts directly rather than through
        # reset_index() and a `proportion` column, whose name depends on the
        # pandas version and collides with a series named "proportion".
        proportions = y.value_counts(normalize=True)

        # Skip p == 0 (unobserved categories of a categorical series);
        # log2(0) is undefined and such classes contribute nothing.
        return float(sum(-p * math.log2(p) for p in proportions if p > 0))

    def fit(self, X, y):
        """Build the tree from features ``X`` and labels ``y``.

        Raises:
            ValueError: If ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")

        self.root = Node(X, y, path='root')
        self.build_tree(self.root)

    def print_tree(self, node=None):
        """Print the subtree below ``node`` (default: the root) in pre-order.

        Each line has the form ``parent_depth->path->depth`` followed by the
        node kind and either the leaf value or the split feature/threshold.
        """
        if node is None:
            node = self.root

        # The root has no parent level; report 0 rather than -1 for it,
        # for leaves as well as decision nodes.
        parent_depth = max(node.current_depth - 1, 0)
        prefix = f"{parent_depth}->{node.path}->{node.current_depth}"

        if node.value is not None:
            print(f"{prefix}:`Leaf`[[{node.value}]]")
            return

        print(f"{prefix}:`Decision`({node.feature})[{node.threshold}]")
        self.print_tree(node.left)
        self.print_tree(node.right)
                
        
        

        
        
        
            

In [11]:
# Features are every column except the last; the last column holds the labels
X, y = iris_df.iloc[:, :-1], iris_df.iloc[:, -1]


In [238]:
# Create a classifier with its default hyper-parameters, fit it on the
# feature frame `X` and label series `y`, then print the resulting tree.
tree = DecisionTreeClassifier()
tree.fit(X,y)
tree.print_tree()

0->root->0:`Decision`(petal length (cm))[1.9]
0->left->1:`Leaf`[[0]]
0->right->1:`Decision`(petal width (cm))[1.7]
1->left->2:`Decision`(petal length (cm))[4.9]
2->left->3:`Decision`(petal width (cm))[1.6]
3->left->4:`Leaf`[[1]]
3->right->4:`Leaf`[[2]]
2->right->3:`Decision`(petal width (cm))[1.5]
3->left->4:`Leaf`[[2]]
3->right->4:`Decision`(sepal length (cm))[6.7]
4->left->5:`Leaf`[[1]]
4->right->5:`Leaf`[[2]]
1->right->2:`Decision`(petal length (cm))[4.8]
2->left->3:`Decision`(sepal length (cm))[5.9]
3->left->4:`Leaf`[[1]]
3->right->4:`Leaf`[[2]]
2->right->3:`Leaf`[[2]]
