# Q2 A. Medical Diagnosis Decision

A dataset is provided to classify patients as "Healthy" or "Sick" based on their Age, Blood Pressure, and
Cholesterol levels.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [11]:
# One tuple per patient, in the order given by `column_names`.
patient_rows = [
    (30, 'high', 'high', 'sick'),
    (45, 'low', 'normal', 'healthy'),
    (50, 'high', 'high', 'sick'),
    (35, 'low', 'normal', 'healthy'),
    (60, 'high', 'high', 'sick'),
    (55, 'low', 'normal', 'healthy'),
    (40, 'high', 'high', 'sick'),
    (25, 'low', 'normal', 'healthy'),
    (65, 'high', 'high', 'sick'),
    (45, 'low', 'normal', 'healthy'),
]
column_names = ['age', 'bp', 'cholesterol', 'diagnosis']

# Transpose the rows into the column-oriented dict that pd.DataFrame consumes.
data = {
    name: list(column)
    for name, column in zip(column_names, zip(*patient_rows))
}

df = pd.DataFrame(data)
df

Unnamed: 0,age,bp,cholesterol,diagnosis
0,30,high,high,sick
1,45,low,normal,healthy
2,50,high,high,sick
3,35,low,normal,healthy
4,60,high,high,sick
5,55,low,normal,healthy
6,40,high,high,sick
7,25,low,normal,healthy
8,65,high,high,sick
9,45,low,normal,healthy


In [12]:
def calculate_entropy(target):
    """Return the Shannon entropy, in bits, of the class labels in ``target``.

    Args:
        target: Label column; must provide ``value_counts(normalize=True)``
            (called here as on a pandas Series).

    Returns:
        ``-sum(p * log2(p))`` over the classes that actually occur.
        A single-class or empty input yields ``0.0``.
    """
    # Relative frequency of each class.
    probabilities = target.value_counts(normalize=True)

    # A categorical column reports unobserved categories with probability 0;
    # 0 * log2(0) evaluates to NaN, so drop those classes before the sum.
    probabilities = probabilities[probabilities > 0]

    entropy = -np.sum(probabilities * np.log2(probabilities))

    # Negating a zero sum gives -0.0; adding 0.0 normalises it to plain 0.0.
    return entropy + 0.0

In [13]:
# Impurity of the label column before any split is applied.
diagnosis_labels = df['diagnosis']
entropy_diagnosis = calculate_entropy(diagnosis_labels)
print(f"Entropy of Diagnosis: {entropy_diagnosis}")

Entropy of Diagnosis: 1.0


In [14]:
def calculate_information_gain(df, feature, target):
    """Return the information gain from partitioning ``df`` on ``feature``.

    The gain is the entropy of ``df[target]`` minus the size-weighted average
    entropy of ``target`` within each group of rows sharing one value of
    ``feature``. Values are matched by exact equality.

    Args:
        df: DataFrame containing both ``feature`` and ``target`` columns.
        feature: Name of the column to partition on.
        target: Name of the label column.

    Returns:
        The entropy reduction achieved by the partition.
    """
    baseline_entropy = calculate_entropy(df[target])

    column = df[feature]
    n_rows = len(df)

    # Each partition contributes its entropy scaled by its share of the rows.
    partitions = (df[column == level] for level in column.unique())
    conditional_entropy = sum(
        (len(partition) / n_rows) * calculate_entropy(partition[target])
        for partition in partitions
    )

    return baseline_entropy - conditional_entropy

In [15]:
features = ['age', 'bp', 'cholesterol']

# Score every candidate feature against the diagnosis label.
# NOTE: calculate_information_gain matches feature values by exact equality,
# so a numeric column such as 'age' is treated as categorical here.
information_gains = {}
for feature in features:
    information_gains[feature] = calculate_information_gain(df, feature, 'diagnosis')

print("Information Gains:")
for feature, gain in information_gains.items():
    print(f"{feature}: {gain}")

Information Gains:
age: 1.0
bp: 1.0
cholesterol: 1.0


In [16]:
# Pick the highest-scoring feature; on a tie max() keeps the first one
# encountered, i.e. the earliest key inserted into information_gains.
best_feature = max(information_gains.items(), key=lambda item: item[1])[0]
print(f"Best feature to split on: {best_feature}")

Best feature to split on: age


In [17]:
class Node:
    """A decision-tree node: either a leaf or a split on one feature.

    Attributes:
        feature: Name of the column this node splits on (``None`` for a leaf).
        value: Feature value on the branch leading from the parent to this
            node (``None`` for the root).
        left: Legacy alias for the first child branch created.
        right: Legacy alias for the most recently created later branch.
        prediction: Class label returned by a leaf (``None`` for a split).
        children: Dict mapping each observed value of ``feature`` to the
            subtree built for the rows carrying that value.
        majority: Most frequent target label among the rows that reached this
            split; usable as a fallback for feature values never seen in
            training.
    """

    def __init__(self, feature=None, value=None, left=None, right=None,
                 prediction=None, children=None, majority=None):
        self.feature = feature
        self.value = value
        self.left = left
        self.right = right
        self.prediction = prediction
        # Fresh dict per instance; never share a mutable default.
        self.children = {} if children is None else children
        self.majority = majority


def _majority_label(labels):
    """Return the most frequent label in ``labels`` (``None`` if empty)."""
    modes = labels.mode()
    return modes.iloc[0] if len(modes) else None


def build_tree(df, target):
    """Recursively build an ID3-style decision tree for ``target``.

    At each level the column with the highest information gain is chosen
    (ties go to the earliest column in ``df``), the rows are partitioned by
    every distinct value of that column, and the column is removed before
    recursing into each partition.

    Args:
        df: DataFrame holding the candidate feature columns and ``target``.
        target: Name of the label column.

    Returns:
        The root ``Node``. Leaves carry ``prediction``; split nodes carry
        ``feature``, ``children`` (one entry per observed value) and
        ``majority``.
    """
    labels = df[target]

    # Pure partition: every remaining row has the same label.
    if len(labels.unique()) == 1:
        return Node(prediction=labels.iloc[0])

    # Candidates come from the columns still present in this frame, so a
    # feature consumed higher up the tree is never evaluated again.
    candidates = [column for column in df.columns if column != target]
    if not candidates:
        return Node(prediction=_majority_label(labels))

    best_feature = max(
        candidates,
        key=lambda feature: calculate_information_gain(df, feature, target),
    )
    tree = Node(feature=best_feature, majority=_majority_label(labels))

    for value in df[best_feature].unique():
        subset = df[df[best_feature] == value]
        # NaN never compares equal to itself, so it yields an empty subset.
        if subset.empty:
            continue

        child = build_tree(subset.drop(columns=[best_feature]), target)
        child.value = value
        tree.children[value] = child

        # Keep the legacy two-slot view populated exactly as before so code
        # that still reads left/right continues to work.
        if tree.left is None:
            tree.left = child
        else:
            tree.right = child

    return tree

# Fit the tree on the full patient table, using 'diagnosis' as the label.
decision_tree = build_tree(df, target='diagnosis')

In [18]:
def predict(tree, sample):
    """Predict the class label of ``sample`` by walking ``tree``.

    Args:
        tree: Root node of a (sub)tree. A node with a non-``None``
            ``prediction`` is treated as a leaf; otherwise ``feature`` names
            the key of ``sample`` to branch on.
        sample: Mapping from feature name to the sample's value.

    Returns:
        The label stored on the leaf that is reached. When a split node has a
        ``children`` mapping but no branch for the sample's value, the node's
        ``majority`` label is returned (``None`` if it has none).

    Raises:
        KeyError: If ``sample`` lacks the feature a split node tests.
    """
    if tree.prediction is not None:
        return tree.prediction

    feature_value = sample[tree.feature]

    # Preferred routing: follow the branch that was built for exactly this
    # feature value, however many branches the split has.
    children = getattr(tree, 'children', None)
    if children:
        child = children.get(feature_value)
        if child is None:
            # Value never seen while building this split.
            return getattr(tree, 'majority', None)
        return predict(child, sample)

    # Legacy two-slot nodes carry no value-to-branch mapping; keep the
    # original convention of 'low' -> left, anything else -> right.
    if feature_value == 'low':
        return predict(tree.left, sample)
    else:
        return predict(tree.right, sample)

In [19]:
# Classify one unseen patient with the fitted tree.
new_patient = dict(age=50, bp='low', cholesterol='normal')
result = predict(decision_tree, new_patient)
print(f"The prediction for the new patient is: {result}")

The prediction for the new patient is: sick
