# In [6]:
import numpy as np 
import pandas as pd
from sklearn.datasets import load_iris 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import accuracy_score

# --------------------- Step 1: Data Preparation & ASM Functions --------------------- 

# Load the Iris dataset, then pull out the feature matrix (X) and class labels (y).
iris = load_iris()
X, y = iris.data, iris.target

def entropy(labels):
    """Return the Shannon entropy, in bits, of a collection of class labels.

    Args:
        labels: Array-like of class labels.

    Returns:
        ``-sum(p * log2(p))`` over the classes present in ``labels``.
        The result is 0 for an empty or single-class input.
    """
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / counts.sum()
    # np.unique only reports classes that actually occur, so every probability
    # is strictly positive and log2 needs no epsilon. Adding one skews the
    # result and makes a pure node come out slightly negative.
    # The trailing "+ 0.0" turns a possible -0.0 into 0.0.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

def gini_index(labels):
    """Return the Gini index of a collection of class labels.

    Args:
        labels: Array-like of class labels.

    Returns:
        One minus the sum of squared class proportions.
    """
    class_counts = np.unique(labels, return_counts=True)[1]
    class_shares = class_counts / class_counts.sum()
    return 1 - np.square(class_shares).sum()

def information_gain(parents_labels, left_labels, right_labels):
    """Return the information gain of splitting a parent node into two children.

    Args:
        parents_labels: Class labels of the node before the split.
        left_labels: Class labels sent to the left child.
        right_labels: Class labels sent to the right child.

    Returns:
        Parent entropy minus the size-weighted entropy of the two children.
        Returns 0.0 when ``parents_labels`` is empty.

    Note:
        The child weights are ``len(child) / len(parents_labels)``. The function
        does not check that the children actually partition the parent.
    """
    n = len(parents_labels)
    if n == 0:
        # Nothing to split; this also avoids dividing by zero below.
        return 0.0
    weighted_entropy = (
        (len(left_labels) / n) * entropy(left_labels)
        + (len(right_labels) / n) * entropy(right_labels)
    )
    return entropy(parents_labels) - weighted_entropy
def split_information(left_labels, right_labels):
    """Return the split information (intrinsic value) of a binary split.

    This is the entropy, in bits, of the left/right size proportions.

    Args:
        left_labels: Class labels sent to the left child.
        right_labels: Class labels sent to the right child.

    Returns:
        ``-sum(p * log2(p))`` over the non-empty sides. The result is exactly 0
        when one side, or both sides, are empty.
    """
    n_left = len(left_labels)
    n_right = len(right_labels)
    total = n_left + n_right
    if total == 0:
        # No samples at all: there is no split to describe.
        return 0.0
    si = 0.0
    for count in (n_left, n_right):
        # Empty sides are skipped, so log2 never sees 0 and needs no epsilon.
        # That keeps a degenerate split at exactly 0, which callers test for.
        if count > 0:
            fraction = count / total
            si -= fraction * np.log2(fraction)
    return si
    
def gain_ratio(parent_labels, left_labels, right_labels):
    """Return the gain ratio of a binary split.

    Args:
        parent_labels: Class labels of the node before the split.
        left_labels: Class labels sent to the left child.
        right_labels: Class labels sent to the right child.

    Returns:
        Information gain divided by split information, or 0 when the split
        information is 0.
    """
    gain = information_gain(parent_labels, left_labels, right_labels)
    split_info = split_information(left_labels, right_labels)
    return gain / split_info if split_info != 0 else 0

# --------------------- Step 2: Explore a Sample Split ---------------------


# Demonstration split: feature column 0 (sepal length) cut at a threshold of 5.5.
feature_index, threshold = 0, 5.5

# Rows strictly below the threshold go left; every other row goes right.
left_mask = X[:, feature_index] < threshold
right_mask = np.logical_not(left_mask)
left_labels, right_labels = y[left_mask], y[right_mask]

# Entropy-based measures for this split.
parent_ent = entropy(y)
ig = information_gain(y, left_labels, right_labels)
si = split_information(left_labels, right_labels)
gr = gain_ratio(y, left_labels, right_labels)

# Gini index of the parent node and of each child.
gini_parent, gini_left, gini_right = (
    gini_index(subset) for subset in (y, left_labels, right_labels)
)

# Report every measure to three decimal places, one per line.
print("Sample Split on Iris Dataset (Feature: Sepal Length, Threshold: 5.5)")
print(
    *(
        template.format(value)
        for template, value in (
            ("Parent Entropy: {:.3f}", parent_ent),
            ("Information Gain: {:.3f}", ig),
            ("Split Information: {:.3f}", si),
            ("Gain Ratio: {:.3f}", gr),
            ("Parent Gini Index: {:.3f}", gini_parent),
            ("Left Gini Index: {:.3f}", gini_left),
            ("Right Gini Index: {:.3f}", gini_right),
        )
    ),
    sep="\n",
)
print("\n")

# --------------------- Step 3: Train and Compare Decision Tree Classifiers ---------------------

# Hold out 20% of the samples for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit one tree per split criterion: entropy (information gain) and Gini index.
# fit() returns the fitted estimator, so construction and training are chained.
dt_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)
dt_gini = DecisionTreeClassifier(criterion='gini', random_state=42).fit(X_train, y_train)

# Predict on the held-out set with each tree.
y_pred_entropy = dt_entropy.predict(X_test)
y_pred_gini = dt_gini.predict(X_test)

# Score both sets of predictions against the true test labels.
acc_entropy = accuracy_score(y_test, y_pred_entropy)
acc_gini = accuracy_score(y_test, y_pred_gini)

print("Decision Tree using Entropy (Information Gain) - Accuracy: {:.3f}".format(acc_entropy))
print("Decision Tree using Gini Index - Accuracy: {:.3f}".format(acc_gini))


# Output:
# Sample Split on Iris Dataset (Feature: Sepal Length, Threshold: 5.5)
# Parent Entropy: 1.585
# Information Gain: 0.551
# Split Information: 0.931
# Gain Ratio: 0.592
# Parent Gini Index: 0.667
# Left Gini Index: 0.237
# Right Gini Index: 0.546
#
#
# Decision Tree using Entropy (Information Gain) - Accuracy: 1.000
# Decision Tree using Gini Index - Accuracy: 1.000
