In [2]:

import pandas as pd
import numpy as np

In [3]:
# Toy dataset: 15 records with four attribute columns and a CLASS label column.
# Each literal is laid out five values per line, one line per AGE group.
data = {
    'AGE': ['Young'] * 5 + ['Middle'] * 5 + ['Old'] * 5,
    'JOB_STATUS': [
        False, False, True, True, False,
        False, False, True, False, False,
        False, False, True, True, False,
    ],
    'OWNS_HOUSE': [
        False, False, False, True, False,
        False, False, True, True, True,
        True, True, False, False, False,
    ],
    'CREDIT_RATING': [
        'Fair', 'Good', 'Good', 'Fair', 'Fair',
        'Fair', 'Good', 'Good', 'Excellent', 'Excellent',
        'Excellent', 'Good', 'Good', 'Excellent', 'Fair',
    ],
    'CLASS': [
        'No', 'No', 'Yes', 'Yes', 'No',
        'No', 'No', 'Yes', 'Yes', 'Yes',
        'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}

# One DataFrame column per dict key, in insertion order.
df = pd.DataFrame(data)
df

Unnamed: 0,AGE,JOB_STATUS,OWNS_HOUSE,CREDIT_RATING,CLASS
0,Young,False,False,Fair,No
1,Young,False,False,Good,No
2,Young,True,False,Good,Yes
3,Young,True,True,Fair,Yes
4,Young,False,False,Fair,No
5,Middle,False,False,Fair,No
6,Middle,False,False,Good,No
7,Middle,True,True,Good,Yes
8,Middle,False,True,Excellent,Yes
9,Middle,False,True,Excellent,Yes


In [4]:
# Function to calculate entropy
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Labels to measure; anything ``np.unique`` accepts.

    Returns
    -------
    float
        Sum of ``-p * log2(p)`` over the distinct values, where ``p`` is the
        relative frequency of each value.
    """
    # Only the per-value frequencies matter; the distinct values themselves are unused.
    _, freq = np.unique(target_col, return_counts=True)
    probs = freq / np.sum(freq)
    return np.sum(-probs * np.log2(probs))

# Function to calculate Information Gain
def info_gain(data, split_attribute_name, target_name="CLASS"):
    """Return the information gain from splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted average
    of the target entropies within each distinct value of the split
    attribute, with weights equal to each value's share of the rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the split attribute and the target column.
    split_attribute_name : str
        Name of the column to split on.
    target_name : str, default "CLASS"
        Name of the label column.

    Returns
    -------
    float
        ``entropy(target) - sum(weight_v * entropy(target | attribute == v))``.
    """
    # Entropy of the target before any split.
    total_entropy = entropy(data[target_name])

    # Distinct values of the split attribute and how many rows carry each.
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    weights = counts / np.sum(counts)

    # Select the target values of each partition with a boolean mask. Filtering
    # via ``where(...).dropna()`` would additionally discard rows that merely
    # have a missing value in some unrelated column, so the partition would no
    # longer match the row counts used for the weights.
    weighted_entropy = np.sum([
        weight * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, weight in zip(vals, weights)
    ])

    return total_entropy - weighted_entropy


In [5]:
# Candidate attributes to evaluate as the first split.
attributes = ['AGE', 'JOB_STATUS', 'OWNS_HOUSE', 'CREDIT_RATING']
gains = {}

print("Information Gain for each attribute:\n")

# Score each attribute against the full frame and report it as it is computed.
for attr in attributes:
    score = info_gain(df, attr)
    gains[attr] = score
    print(f"{attr}: {score:.4f}")

# The root is the attribute with the largest gain (first one wins on ties).
root_node = max(gains, key=lambda name: gains[name])
print("\nRoot Node:", root_node)

Information Gain for each attribute:

AGE: 0.0830
JOB_STATUS: 0.3237
OWNS_HOUSE: 0.4200
CREDIT_RATING: 0.3630

Root Node: OWNS_HOUSE
