# Q1: Information Gain for Splitting on CreditScore at 650

In [1]:
import numpy as np
import pandas as pd

def entropy(labels):
    """Compute the Shannon entropy (in bits) of a collection of labels.

    Args:
        labels: Array-like of class labels (e.g. a list or a pandas Series).
            It is passed to ``np.unique`` to count how often each class occurs.

    Returns:
        The entropy ``-sum(p * log2(p))`` taken over the class proportions
        ``p``. Empty input and input containing a single class both yield
        exactly ``0.0``.
    """
    # Only the per-class counts matter; the class values themselves are unused.
    _, counts = np.unique(labels, return_counts=True)

    # With fewer than two classes the sum below is 0.0, and negating it would
    # produce IEEE negative zero (printed as "-0.0"). Return a clean zero.
    if counts.size < 2:
        return np.float64(0.0)

    probabilities = counts / counts.sum()
    return -np.sum(probabilities * np.log2(probabilities))

# Training data: eight records, each with a RiskLevel label
data = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8],
    'Age': [35, 28, 45, 31, 52, 29, 42, 33],
    'CreditScore': [720, 650, 750, 600, 780, 630, 710, 640],
    'Education': [16, 14, None, 12, 18, 14, 16, 12],
    'RiskLevel': ['Low', 'High', 'Low', 'High', 'Low', 'High', 'Low', 'High'],
}

df = pd.DataFrame(data)

# Entropy of the labels before any split (the parent node)
parent_entropy = entropy(df['RiskLevel'])
print("Parent Entropy:", parent_entropy)

# Partition the rows with a boolean mask on the CreditScore threshold of 650:
# Group A holds scores >= 650, Group B holds the remaining rows (< 650).
is_high_score = df['CreditScore'] >= 650
group_A = df[is_high_score]
group_B = df[~is_high_score]

# Label entropy inside each child group
entropy_A, entropy_B = (entropy(g['RiskLevel']) for g in (group_A, group_B))

print("Entropy of Group A (CreditScore >= 650):", entropy_A)
print("Entropy of Group B (CreditScore < 650):", entropy_B)

# Each child's entropy is weighted by its share of the rows
n = len(df)
weight_A = len(group_A) / n
weight_B = len(group_B) / n
weighted_entropy = weight_A * entropy_A + weight_B * entropy_B
print("Weighted Entropy after split:", weighted_entropy)

# Information gain = reduction in entropy achieved by the split
information_gain = parent_entropy - weighted_entropy
print("Information Gain:", information_gain)

Parent Entropy: 1.0
Entropy of Group A (CreditScore >= 650): 0.7219280948873623
Entropy of Group B (CreditScore < 650): -0.0
Weighted Entropy after split: 0.4512050593046014
Information Gain: 0.5487949406953986
