# In [None]:
import math
from collections import Counter

def calculate_entropy(data):
    """Compute the Shannon entropy (base 2) of the class labels in a dataset.

    Args:
        data: Sized collection of samples, e.g.
            ``[(feature1, feature2, ..., class_label), ...]``. The last
            element of each sample is used as its class label.

    Returns:
        The entropy of the class-label distribution as a float. An empty
        dataset yields ``0.0``.
    """
    num_samples = len(data)
    if num_samples == 0:
        # Return a float so the result type matches the non-empty case.
        return 0.0

    # Tally how many samples carry each class label.
    class_counts = Counter(sample[-1] for sample in data)

    # H = -sum(p * log2(p)). Every tallied count is >= 1, so p > 0 and
    # log2(p) is always defined.
    entropy = 0.0
    for count in class_counts.values():
        probability = count / num_samples
        entropy -= probability * math.log2(probability)

    return entropy

def calculate_information_gain(data, feature_index):
    """Compute the information gain of splitting a dataset on one feature.

    Args:
        data: Sized collection of samples, e.g.
            ``[(feature1, feature2, ..., class_label), ...]``.
        feature_index: Index, within each sample, of the feature whose
            values are used to partition the dataset.

    Returns:
        The entropy of the whole dataset minus the size-weighted sum of the
        entropies of the subsets produced by the split.
    """
    base_entropy = calculate_entropy(data)
    num_samples = len(data)

    # Group the samples by feature value in a single pass instead of
    # re-scanning the whole dataset once per distinct value. A dict keeps
    # insertion order, so the weighted sum below is accumulated in a
    # deterministic order (first appearance of each value in ``data``).
    partitions = {}
    for sample in data:
        partitions.setdefault(sample[feature_index], []).append(sample)

    # Each subset's entropy is weighted by the share of samples it holds.
    # For empty ``data`` there are no partitions, so no division happens.
    new_entropy = 0.0
    for subset in partitions.values():
        weight = len(subset) / num_samples
        new_entropy += weight * calculate_entropy(subset)

    return base_entropy - new_entropy

# Example usage.
# The last column of every sample is the class label; the first column is
# presumably just a running sample id.
data = [
    (1, 'Sunny', 'No'),
    (2, 'Overcast', 'Yes'),
    (3, 'Sunny', 'Yes'),
    (4, 'Rainy', 'No'),
    (5, 'Rainy', 'No'),
    (6, 'Overcast', 'Yes'),
    (7, 'Sunny', 'No'),
    (8, 'Sunny', 'Yes'),
    (9, 'Rainy', 'No'),
    (10, 'Sunny', 'Yes'),
]

# Split on the second column ('Outlook') and report the resulting gain.
feature_index = 1
information_gain = calculate_information_gain(data, feature_index=feature_index)
print("信息增益:", information_gain)
