In [1]:
import numpy as np
from collections import Counter

In [2]:
class Node:
    """A single node of a binary decision tree.

    Internal nodes carry ``feature``/``threshold`` together with their
    ``left`` and ``right`` children; leaf nodes carry only ``value``.
    Every field defaults to ``None``.
    """

    def __init__(self, left=None, right=None, threshold=None, feature=None, value=None):
        # Decision rule stored on this node.
        self.feature = feature
        self.threshold = threshold
        # Sub-trees hanging below this node.
        self.left = left
        self.right = right
        # Payload stored on the node (the tree builder sets it on leaves).
        self.value = value

In [3]:
class Decision_Tree:
    """Binary decision-tree classifier grown by greedy impurity reduction.

    Parameters
    ----------
    min_sample_split : int
        A node holding fewer rows than this becomes a leaf.
    max_depth : int
        A node at this depth (root is depth 0) becomes a leaf.
    n_features : int or None
        Number of columns sampled (without replacement) at every node when
        searching for a split. ``None`` means "use all columns"; the value is
        filled in from ``X`` on the first call to :meth:`fit`.
    tech : str
        Key into ``self.criteria`` selecting the impurity measure
        (``"gini"`` or ``"entropy"``).
    """

    def __init__(self, min_sample_split=2, max_depth=100, n_features=None, tech="gini"):
        # Stored as a plain value: a trailing comma here would turn it into a
        # 1-tuple and break the integer comparison in grow_tree.
        self.min_sample_split = min_sample_split
        self.max_depth = max_depth
        self.n_features = n_features
        self.tech = tech
        self.criteria = {"gini": self.gini, "entropy": self.entropy}

    def fit(self, X, y):
        """Grow the tree from a 2-D feature matrix ``X`` and label vector ``y``.

        Inputs are converted with ``np.asarray`` so lists and DataFrames are
        accepted. The grown tree is stored in ``self.root``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if self.n_features is None:
            self.n_features = X.shape[1]
        self.root = self.grow_tree(X, y, depth=0)

    def grow_tree(self, X, y, depth):
        """Recursively build and return the sub-tree for the rows in ``X``/``y``."""
        n_rows, n_cols = X.shape
        n_labels = len(np.unique(y))

        # Stop condition: depth limit hit, too few rows, or the node is pure.
        if depth >= self.max_depth or n_rows < self.min_sample_split or n_labels <= 1:
            return Node(value=self.most_frequent(y))

        # Best split over a random subset of columns. The subset size is
        # clamped because np.random.choice(..., replace=False) raises when
        # asked for more items than exist.
        n_select = min(self.n_features, n_cols)
        feature_index = np.random.choice(n_cols, n_select, replace=False)
        best_threshold, best_feature = self.best_split(X, y, feature_index)

        # No threshold separates the rows (every sampled column is constant):
        # recursing would only rebuild this same node, so emit a leaf instead.
        if best_feature is None:
            return Node(value=self.most_frequent(y))

        # Child nodes
        left_index, right_index = self.split(X[:, best_feature], best_threshold)
        left = self.grow_tree(X[left_index, :], y[left_index], depth + 1)
        right = self.grow_tree(X[right_index, :], y[right_index], depth + 1)

        return Node(left, right, best_threshold, best_feature)

    def most_frequent(self, y):
        """Return the most common label in ``y``, or ``None`` when ``y`` is empty."""
        if len(y) == 0:
            return None
        return Counter(y).most_common(1)[0][0]

    def split(self, X_col, thres):
        """Partition row indices of ``X_col`` around ``thres``.

        Returns ``(left_index, right_index)`` where *left* holds the rows with
        ``X_col >= thres`` and *right* the rows with ``X_col < thres``.
        """
        X_col = np.asarray(X_col)
        left_index = np.flatnonzero(X_col >= thres)
        right_index = np.flatnonzero(X_col < thres)
        return left_index, right_index

    @staticmethod
    def _class_probabilities(y):
        """Return the relative frequency of every distinct label in ``y``."""
        y = np.asarray(y)
        if y.size == 0:
            return np.array([])
        # np.unique (rather than np.bincount) so labels need not be
        # non-negative integers; all returned probabilities are > 0.
        _, counts = np.unique(y, return_counts=True)
        return counts / y.size

    def entropy(self, y):
        """Shannon entropy of the labels in ``y``, in nats (natural log)."""
        probs = self._class_probabilities(y)
        if probs.size == 0:
            return 0.0
        return -np.sum(probs * np.log(probs))

    def gini(self, y):
        """Gini impurity of the labels in ``y``: ``1 - sum(p_k ** 2)``."""
        probs = self._class_probabilities(y)
        if probs.size == 0:
            return 0.0
        return 1 - np.sum(probs ** 2)

    def info_gain_ratio(self, X_col, thres, y=None):
        """Information gain of the split divided by its split information.

        ``y`` holds the labels aligned with ``X_col``. It is required (the gain
        cannot be computed without labels) but is declared as a trailing
        keyword so the original ``(X_col, thres)`` positional order is kept.

        Returns 0 when one side of the split is empty. Split information uses
        the natural log, the same base as :meth:`entropy`, so with
        ``tech="entropy"`` the ratio does not depend on the log base.

        Raises
        ------
        ValueError
            If ``y`` is not supplied.
        """
        if y is None:
            raise ValueError("info_gain_ratio requires the labels `y`")

        left_index, right_index = self.split(X_col, thres)
        n_left, n_right = len(left_index), len(right_index)
        # An empty side would give 0 * log(0) = nan below.
        if n_left == 0 or n_right == 0:
            return 0

        n = n_left + n_right
        p_left = n_left / n
        p_right = n_right / n
        split_info = -(p_left * np.log(p_left) + p_right * np.log(p_right))
        if split_info == 0:
            return 0
        return self.info_gain(X_col, y, thres) / split_info

    def chi_square(self, left_y, right_y):
        """Pearson chi-square statistic of a split's (side x class) table.

        ``left_y`` and ``right_y`` are the labels that fell on each side. The
        expected count of a cell is ``row_total * column_total / total``; cells
        whose expected count is 0 are skipped. Returns 0.0 when both sides are
        empty.
        """
        left_y = np.asarray(left_y)
        right_y = np.asarray(right_y)
        total = left_y.size + right_y.size
        if total == 0:
            return 0.0

        # Encode labels as 0..k-1 over the union of both sides so the two
        # count vectors line up class by class.
        classes, codes = np.unique(np.concatenate([left_y, right_y]), return_inverse=True)
        codes = codes.ravel()
        n_classes = len(classes)
        left_count = np.bincount(codes[:left_y.size], minlength=n_classes)
        right_count = np.bincount(codes[left_y.size:], minlength=n_classes)

        observed = np.vstack([left_count, right_count]).astype(float)
        row_totals = observed.sum(axis=1, keepdims=True)
        col_totals = observed.sum(axis=0, keepdims=True)
        expected = row_totals * col_totals / total

        valid = expected > 0
        return float(np.sum((observed[valid] - expected[valid]) ** 2 / expected[valid]))

    def info_gain(self, X_col, y, thres):
        """Impurity reduction obtained by splitting ``y`` on ``X_col`` at ``thres``.

        The impurity measure is ``self.criteria[self.tech]``. Returns 0 when
        the split leaves one side empty.
        """
        y = np.asarray(y)
        impurity = self.criteria[self.tech]
        parent = impurity(y)

        left_index, right_index = self.split(X_col, thres)
        if len(left_index) == 0 or len(right_index) == 0:
            return 0

        # Weighted average of the children's impurity
        n = len(y)
        weight_left, weight_right = len(left_index) / n, len(right_index) / n
        child = weight_left * impurity(y[left_index]) + weight_right * impurity(y[right_index])

        return parent - child

    def best_split(self, X, y, feature_index):
        """Search ``feature_index`` columns for the split with the highest gain.

        Returns ``(threshold, feature)``; both are ``None`` when no candidate
        threshold puts rows on both sides (e.g. every column is constant).
        """
        best_info_gain = -1

        split_index = None
        split_threshold = None

        for index in feature_index:
            X_col = X[:, index]
            # Skip the smallest distinct value: with the ``>=`` rule in split()
            # it sends every row left and leaves the right side empty.
            for thres in np.unique(X_col)[1:]:
                gain = self.info_gain(X_col, y, thres)
                if gain > best_info_gain:
                    split_index = index
                    split_threshold = thres
                    best_info_gain = gain

        return split_threshold, split_index


In [5]:
import pandas as pd

# Toy dataset: two categorical features and a binary target, five rows.
data = dict(
    Feature1=list("AABBA"),
    Feature2=list("XYXYX"),
    Target=[1, 0, 1, 0, 1],
)
df = pd.DataFrame(data)
print(df)

  Feature1 Feature2  Target
0        A        X       1
1        A        Y       0
2        B        X       1
3        B        Y       0
4        A        X       1


In [6]:
def calculate_contingency_table(df, feature, target):
    """Cross-tabulate two columns of ``df``.

    Rows of the result are the distinct values of ``df[feature]``, columns the
    distinct values of ``df[target]``, and each cell is the number of rows of
    ``df`` with that combination.
    """
    feature_values = df[feature]
    target_values = df[target]
    table = pd.crosstab(index=feature_values, columns=target_values)
    return table

# Counts of each Target value for every level of Feature1
contingency_table = calculate_contingency_table(df, feature='Feature1', target='Target')
print(contingency_table)


Target    0  1
Feature1      
A         1  2
B         1  1
