In [6]:
import numpy as np
import pandas as pd
from graphviz import Digraph

In [15]:
train = pd.read_csv("../Decision Trees/train_2v.csv")
test = pd.read_csv("../Decision Trees/test_2v.csv")

In [16]:
train.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


## Shannon's entropy

$$S = -\sum^N_{i=1} p_i \log_2 p_i$$

In [28]:
def entropy(freqs):
    total = sum(freqs)
    ent = 0
    for f in freqs:
        prob = f * 1.0 / total
        if abs(prob) > 1e-8:
            ent += -prob * np.log2(prob)
            
    return ent

In [30]:
entropy([5, 5])

1.0

## Information gain
$$IG(Q) = S_0 - \sum^q_{i=1} \frac{N_i}{N} S_i$$

In [31]:
def information_gain(before_split_freq, after_split_freq):
    gain = entropy(before_split_freq)
    total = sum(before_split_freq)
    
    for freq in after_split_freq:
        ratio = sum(freq) / total
        gain -= ratio * entropy(freq)
    
    return gain

In [36]:
class Node:
    col_name: str
    split_value: float
    label: bool
        
    left = None
    right = None
        
    def __init__(self, col_name, split_value, label):
        self.col_name = col_name
        self.split_value = split_value
        self.label = label
        
    def __repr__(self):
        return f"{self.col_name} <= {self.split_value}"

In [37]:
train = pd.read_csv("../Decision Trees/train_2v.csv")

In [38]:
train.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [39]:
classes = ['hypertension', 'heart_disease']

In [45]:
def build_tree(df):
    best_split_class = None
    best_split_value = None
    best_split_infogain = 0
    
    for class_ in classes:
        unique = df[class_].unique().tolist()
        unique.sort()
        
        for i in range(len(unique) - 1):
            split_value = (unique[i] + unique[i + 1]) / 2
            
            split_1 = df[df[class_] <= split_value]
            split_2 = df[df[class_] > split_value]
            
            infogain = information_gain(
                df.stroke.value_counts().tolist(),
                [
                    split_1.stroke.value_counts().tolist(),
                    split_2.stroke.value_counts().tolist()
                ]
            )
            
            if infogain > best_split_infogain:
                best_split_class = class_
                best_split_value = split_value
                best_split_infogain = infogain
    
    node = Node(best_split_class, best_split_value, None)
    
    split_1 = df[df[best_split_class] <= best_split_value]
    split_2 = df[df[best_split_class] > best_split_value]
    
    node.left = build_tree(split_1)
    node.right = build_tree(split_2)
    
    return node

In [46]:
build_tree(train)

0.5 heart_disease 0.00516828835043
