# Lab 2: Decision Trees and Information Gain

In [None]:
import pandas as pd
import numpy as np

## 1. Entropy Function

This function calculates the entropy of a given dataset based on the target column 'hospital_death'.
Entropy is a measure of impurity or disorder in a dataset.

**Formula:** E = - sum(p_i * log2(p_i)) for all classes i.

For a binary classification problem with target 'hospital_death' (classes 0 and 1):
E = - (p_0 * log2(p_0) + p_1 * log2(p_1))
where p_0 is the proportion of class 0 (e.g., survived) and p_1 is the proportion of class 1 (e.g., died in hospital).

If a proportion p_i is 0, then p_i * log2(p_i) is taken as 0.

In [None]:
def entropy(data):
    """
    Calculates the Shannon entropy (in bits) of the 'hospital_death' target column.

    Args:
        data (pd.DataFrame): DataFrame containing the dataset with a 'hospital_death' column.

    Returns:
        float: The calculated entropy value.
               Returns 0.0 if the dataset is empty, the target column is missing,
               or at most one class is present among the labelled rows.

    Notes:
        Rows whose target is missing (NaN) are ignored: the class proportions are
        taken over the labelled rows only, so they always sum to 1.
    """
    target_column = 'hospital_death'

    if data.empty or target_column not in data.columns:
        return 0.0

    # value_counts() drops NaN labels by default, so the denominator has to be
    # the sum of these counts. Dividing by the row count instead would make the
    # proportions sum to less than 1 whenever labels are missing.
    counts = data[target_column].value_counts()

    # A categorical target reports unused categories with a count of 0; they
    # contribute nothing (0 * log2(0) is taken as 0) and would break log2.
    counts = counts[counts > 0]

    if len(counts) <= 1:  # No labelled rows, or no variance
        return 0.0

    proportions = counts / counts.sum()
    return float(-(proportions * np.log2(proportions)).sum())

### Example Usage for Entropy

In [None]:
# Create a sample DataFrame (reused by the later example cells)
sample_data_for_lab2 = {
    'age': [25, 30, 35, 40, 45, 50, 55, 60],
    'gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'hospital_death': [0, 1, 0, 0, 1, 1, 0, 1] # 4 rows with label 1 (died), 4 rows with label 0 (survived)
}
sample_df_lab2 = pd.DataFrame(sample_data_for_lab2)

entropy_val_lab2 = entropy(sample_df_lab2)
print(f"Sample DataFrame for Lab 2:\n{sample_df_lab2}")
print(f"\nCalculated Entropy for sample_df_lab2: {entropy_val_lab2:.4f}") # Should be 1.0 for a 50/50 split

# Example with all same class (entropy should be 0)
sample_data_all_same = {
    'hospital_death': [0, 0, 0, 0]
}
sample_df_all_same = pd.DataFrame(sample_data_all_same)
entropy_all_same = entropy(sample_df_all_same)
print(f"\nSample DataFrame (all same class):\n{sample_df_all_same}")
print(f"Calculated Entropy (all same class): {entropy_all_same:.4f}")

# Example with empty data (entropy() returns 0.0 for an empty DataFrame);
# empty_df is reused by the later example cells
empty_df = pd.DataFrame({'hospital_death': []})
entropy_empty = entropy(empty_df)
print(f"\nSample DataFrame (empty):\n{empty_df}")
print(f"Calculated Entropy (empty): {entropy_empty:.4f}")

## 2. Information Gain Function

This function calculates the information gain of a split on a dataset. 
Information gain is the reduction in entropy achieved by partitioning the data according to a given attribute (represented by the mask).

**Formula:** IG(S, A) = Entropy(S) - sum over v in Values(A) ( |S_v| / |S| * Entropy(S_v) )

Where:
- S is the original dataset.
- A is the attribute (or mask) used for splitting.
- Values(A) are the possible values of attribute A (here, True/False from the mask representing the two subsets).
- S_v is the subset of S for which attribute A has value v.
- |S_v| is the number of instances in subset S_v.
- |S| is the total number of instances in S.

For a binary split (left and right subsets based on the mask):
IG = Entropy(parent) - [ (weight_left * Entropy(left_subset)) + (weight_right * Entropy(right_subset)) ]

In [None]:
def information_gain(data, mask):
    """
    Measures how much entropy drops when the data is split in two by a boolean mask.

    Args:
        data (pd.DataFrame): The parent node rows, including the 'hospital_death' column.
        mask (pd.Series): Boolean Series of the same length as data. Rows where it is
                          True form the left subset; all other rows form the right subset.

    Returns:
        float: Parent entropy minus the size-weighted entropy of the two subsets.
               Returns 0.0 when data is empty.
    """
    # An empty parent cannot be split; this also guarantees a non-zero row count below.
    if data.empty:
        return 0.0

    n_rows = len(data)
    left_rows = data[mask]
    right_rows = data[~mask]

    # An empty child has entropy 0 and weight 0, so it drops out of the sum on its own.
    children_entropy = (
        (len(left_rows) / n_rows * entropy(left_rows))
        + (len(right_rows) / n_rows * entropy(right_rows))
    )

    return entropy(data) - children_entropy

### Example Usage for Information Gain

In [None]:
# Using sample_df_lab2 from the entropy example
print(f"Original DataFrame for Information Gain Example:\n{sample_df_lab2}")
parent_entropy_for_ig_example = entropy(sample_df_lab2)
print(f"Parent Entropy: {parent_entropy_for_ig_example:.4f}")

# Create a sample mask, e.g., based on 'age' > 37
# (True rows form the left subset, False rows the right subset)
sample_mask = sample_df_lab2['age'] > 37
print(f"\nSample Mask (age > 37):\n{sample_mask.values}")

# Build the two subsets by hand to show the intermediate entropies
# that information_gain() combines internally
left_subset_example = sample_df_lab2[sample_mask]
right_subset_example = sample_df_lab2[~sample_mask]

print(f"\nLeft Subset (age > 37):\n{left_subset_example}")
entropy_left_example = entropy(left_subset_example)
print(f"Entropy of Left Subset: {entropy_left_example:.4f}")

print(f"\nRight Subset (age <= 37):\n{right_subset_example}")
entropy_right_example = entropy(right_subset_example)
print(f"Entropy of Right Subset: {entropy_right_example:.4f}")

ig_value = information_gain(sample_df_lab2, sample_mask)
print(f"\nCalculated Information Gain for 'age > 37': {ig_value:.4f}")

# Another mask example: gender == 'Male'
sample_mask_gender = sample_df_lab2['gender'] == 'Male'
print(f"\nSample Mask (gender == 'Male'):\n{sample_mask_gender.values}")
ig_value_gender = information_gain(sample_df_lab2, sample_mask_gender)
print(f"Calculated Information Gain for 'gender == \"Male\"': {ig_value_gender:.4f}")

# Example with a mask that perfectly separates classes (if possible with this data)
# For sample_df_lab2, 'hospital_death' is [0, 1, 0, 0, 1, 1, 0, 1]
# Let's try a mask that might give high IG, e.g., age > 40 (targets: [1,1,0,1]) vs age <=40 (targets: [0,1,0,0])
# (neither side is pure, so this split is informative but not perfect)
perfect_ish_mask = sample_df_lab2['age'] > 40
print(f"\nSample Mask (age > 40):\n{perfect_ish_mask.values}")
ig_value_perfect_ish = information_gain(sample_df_lab2, perfect_ish_mask)
left_subset_p = sample_df_lab2[perfect_ish_mask]
right_subset_p = sample_df_lab2[~perfect_ish_mask]
print(f"Left (age > 40) 'hospital_death': {left_subset_p['hospital_death'].tolist()}, Entropy: {entropy(left_subset_p):.4f}")
print(f"Right (age <= 40) 'hospital_death': {right_subset_p['hospital_death'].tolist()}, Entropy: {entropy(right_subset_p):.4f}")
print(f"Calculated Information Gain for 'age > 40': {ig_value_perfect_ish:.4f}")

# Example with an empty dataframe for information_gain
# (information_gain() returns 0.0 early when data is empty)
ig_empty = information_gain(empty_df, pd.Series([], dtype=bool))
print(f"\nInformation Gain for empty DataFrame: {ig_empty:.4f}")

## 3. Find Best Split Function (Basic Implementation)

This function iterates through all features (excluding the target 'hospital_death') and their possible split points to find the split that maximizes information gain.

For the 'basic' implementation:
- It processes features that are numeric (or can be meaningfully compared for thresholds).
- For each such feature, it sorts its unique values.
- Potential thresholds are calculated as the midpoint between consecutive unique sorted values.
- It uses the `information_gain` function to evaluate each split.
- It returns a tuple `(best_ig, best_threshold, best_feature)`: the best information gain found, with the corresponding threshold and feature, or `(-1, None, None)` if no valid split exists.

In [None]:
def find_best_split(data, impl_part='basic'):
    """
    Finds the best feature and threshold to split the data on, maximizing information gain.
    'basic' implementation: Considers only numeric features for splitting.

    Candidate thresholds for a feature are the midpoints between consecutive
    distinct non-missing values; a split sends rows with `feature <= threshold`
    to the left subset. Rows whose feature value is NaN compare False and
    therefore always land in the right subset.

    Args:
        data (pd.DataFrame): The dataset, including 'hospital_death' target column.
        impl_part (str): Implementation part, currently 'basic'. The value is not
                         inspected; it is accepted for interface compatibility.

    Returns:
        tuple: (best_ig, best_threshold, best_feature)
               Returns (-1.0, None, None) if no valid split is found or data is unsuitable
               (empty, fewer than 2 rows, or no 'hospital_death' column).
               On ties the first split encountered is kept.
    """
    best_ig = -1.0  # Sentinel: lower than the gain of any split that gets evaluated
    best_threshold = None
    best_feature = None

    if data.empty or 'hospital_death' not in data.columns or len(data) < 2:
        return best_ig, best_threshold, best_feature

    # Exclude the target column from features to split on
    features = data.columns.drop('hospital_death', errors='ignore')

    for feature in features:
        column = data[feature]

        # For 'basic' part, only consider numeric features for threshold splitting
        if not pd.api.types.is_numeric_dtype(column):
            continue

        # Drop NaN before sorting: NaN compares False with everything, which
        # leaves sorted() with an inconsistent order and silently hides valid
        # midpoints between the real values.
        unique_values = sorted(column.dropna().unique())

        # Fewer than two distinct values yields no pairs, hence no candidate split.
        for lower, upper in zip(unique_values, unique_values[1:]):
            # Convert to float before adding so narrow integer dtypes
            # (e.g. int8/uint8) cannot wrap around in the sum.
            threshold = (float(lower) + float(upper)) / 2.0

            mask = column <= threshold

            # A one-sided mask is not a split (possible if the midpoint rounds
            # onto one of its neighbours).
            if mask.all() or not mask.any():
                continue

            current_ig = information_gain(data, mask)

            if current_ig > best_ig:
                best_ig = current_ig
                best_threshold = threshold
                best_feature = feature

    return best_ig, best_threshold, best_feature

### Example Usage for find_best_split (Basic)

In [None]:
# 'gender' is non-numeric, so in basic mode only 'age' is considered for splitting
print("Using sample_df_lab2 for find_best_split example:")
print(sample_df_lab2)

best_ig_basic, best_thresh_basic, best_feat_basic = find_best_split(sample_df_lab2, 'basic')

print(f"\nBest Information Gain (Basic): {best_ig_basic:.4f}")
print(f"Best Threshold (Basic): {best_thresh_basic}")
print(f"Best Feature (Basic): {best_feat_basic}")

# Example with a slightly more complex numeric dataset
complex_data = {
    'feature1': [10, 15, 20, 25, 30, 35, 40, 45, 50, 55], # Clear split potential
    'feature2': [1, 1, 0, 0, 1, 1, 0, 0, 1, 1], # Less clear
    'hospital_death': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1] # Target flips between feature1 = 25 and 30 (midpoint 27.5)
}
complex_df = pd.DataFrame(complex_data)
print(f"\nComplex DataFrame:\n{complex_df}")

best_ig_complex, best_thresh_complex, best_feat_complex = find_best_split(complex_df, 'basic')
print(f"\nBest Information Gain (Complex): {best_ig_complex:.4f}")
print(f"Best Threshold (Complex): {best_thresh_complex}")
print(f"Best Feature (Complex): {best_feat_complex}")

# Example with no possible split yielding positive IG (or only one class in data)
no_gain_data = {
    'feature1': [10, 20, 30],
    'hospital_death': [0, 0, 0]
}
no_gain_df = pd.DataFrame(no_gain_data)
print(f"\nNo Gain DataFrame:\n{no_gain_df}")
best_ig_no_gain, _, _ = find_best_split(no_gain_df, 'basic')
print(f"Best IG (No Gain Data): {best_ig_no_gain:.4f}") # Parent entropy is 0, so IG will be 0

# Example with empty data for find_best_split
# (no split is possible, so the -1.0 sentinel is returned as the best IG)
best_ig_empty_fbs, _, _ = find_best_split(empty_df, 'basic')
print(f"\nBest IG (Empty Data for find_best_split): {best_ig_empty_fbs:.4f}")