In [71]:
import numpy as np
import pandas as pd
import math

In [72]:
df = pd.read_csv('datasets/q2_data.csv')

In [73]:
df

Unnamed: 0,Id,Age,Blood_Pressure,Cholesterol,Diagnosis
0,1,30,High,High,Sick
1,2,45,Low,Normal,Healthy
2,3,50,High,High,Sick
3,4,35,Low,Normal,Healthy
4,5,60,High,High,Sick
5,6,55,Low,Normal,Healthy
6,7,40,High,High,Sick
7,8,25,Low,Normal,Healthy
8,9,65,High,High,Sick
9,10,45,Low,Normal,Healthy


In [74]:
def entropy(data):
    total = len(data)
    if total == 0:
        return 0
    counts = data['Diagnosis'].value_counts()
    entropy = 0
    for count in counts:
        p = count / total
        entropy -= p * math.log2(p)
    return entropy

def info_gain(df, feature):
    total_entropy = entropy(df)
    values = df[feature].unique()
    weighted_entropy = 0
    for val in values:
        subset = df[df[feature] == val]
        weighted_entropy += (len(subset) / len(df)) * entropy(subset)
    return total_entropy - weighted_entropy

In [75]:
print("Entropy(Diagnosis):", entropy(df))

Entropy(Diagnosis): 1.0


In [76]:
for feature in ['Age', 'Blood_Pressure', 'Cholesterol']:
    if feature == 'Age':
        df['AgeGroup'] = df['Age'].apply(lambda x: '<=45' if x <= 45 else '>45')
        ig = info_gain(df, 'AgeGroup')
        print(f"Information Gain ({feature}):", ig)
    else:
        ig = info_gain(df, feature)
        print(f"Information Gain ({feature}):", ig)

Information Gain (Age): 0.12451124978365313
Information Gain (Blood_Pressure): 1.0
Information Gain (Cholesterol): 1.0


In [77]:
class Node:
    def __init__(self, feature):
        self.feature = feature
        self.children = {}



def recursion(data, cols):
    if len(cols) == 0:
        return Node(data['Diagnosis'].value_counts().idxmax())
    info_gains = [info_gain(data, col) for col in cols]
    ftr_idx = np.argmax(info_gains)
    feature = cols[ftr_idx]
    tree = Node(feature)
    
    # Children nodes
    cols.remove(feature)
    for child in data[feature].unique():
        df_child = df[df[feature] == child]
        tree.children[child] = recursion(df_child ,cols.copy())
    
    return tree


In [78]:
def predict(tree, data):
    while True:
        if len(tree.children) == 0:
            print(tree.feature)
            break
        feature = tree.feature
        tree = tree.children[data[feature]]

In [79]:
data = {'Blood_Pressure': 'Low', 'Cholesterol': 'Normal', 'AgeGroup': '>45'}
predict(tree, data)

Sick
