In [1]:
import numpy as np
from mnist import Mnist

### 经验熵：H(D) = -Sum(|C_k|/|D| * log2(|C_k|/|D|))
### 信息增益：g(D, A) = H(D) - H(D|A)
### 基尼指数（不纯度）：Gini(D) = 1 - Sum((|C_k|/|D|)^2)
- ID3：通过信息增益来进行分支，基础公式：g(D, A) = H(D) - H(D|A)
- C4.5：通过信息增益比来进行分支：g_R(D, A) = g(D, A) / H_A(D)
- CART：通过基尼指数（Gini）来进行分支

### H(D|A) = Sum(|D_i|/|D| * H(D_i))，D_i 表示特征 A 取值为 i 的样本子集

In [2]:
class Node:
    """A single node of the decision tree.

    An internal node records the attribute it splits on and maps each value
    of that attribute to a child node; a leaf node carries a class label.
    """

    def __init__(self, attribute=None, value=None, label=None):
        # attribute: the attribute this node splits on.
        # value:     the value recorded for the split attribute.
        # label:     the class label when the node is a leaf.
        self.attribute, self.value, self.label = attribute, value, label
        # Child nodes keyed by attribute value ({attribute value: child node}).
        self.children = {}

In [3]:
class DecisionTreeID3:
    def __init__(self):
        self.root = None

    def fit(self, X, y):
        self.root = self._build_tree(X,y)

    def _build_tree(self, X, y):
        attributes = list(range(X.shape[1])) # 获取所有属性的索引表

        # 递归构建决策树
        return self._recursive_build_tree(X, y, attributes)

    def _recursive_build_tree(self, X, y, attributes):
        """Recursively grow the subtree for the samples ``(X, y)``.

        Args:
            X: 2-D array of samples; columns are addressed by attribute index.
            y: 1-D array of class labels for the rows of ``X``. The majority
                vote uses ``np.bincount``, so labels must be non-negative
                integers.
            attributes: List of column indices still available for splitting.
                It is not modified.

        Returns:
            Node: A leaf with ``label`` set, or an internal node with
            ``attribute``/``value`` set and one child per distinct value of
            the chosen attribute among the samples.
        """
        node = Node()

        # All samples share one class: this is a leaf carrying that class.
        if np.all(y == y[0]):
            node.label = y[0]
            return node

        # No attribute left to split on: fall back to the majority class.
        if not attributes:
            node.label = np.argmax(np.bincount(y))
            return node

        best_attr, best_value = self._choose_best_attr(X, y, attributes)

        # Record the split attribute and the value reported alongside it.
        node.attribute = best_attr
        node.value = best_value

        # The chosen attribute is not reused further down this path. The
        # remaining list is identical for every branch, so build it once.
        remaining_attrs = [attr for attr in attributes if attr != best_attr]

        attr_col = X[:, best_attr]
        for value in np.unique(attr_col):
            # Samples whose split attribute equals this value form one branch.
            mask = attr_col == value
            node.children[value] = self._recursive_build_tree(
                X[mask], y[mask], remaining_attrs
            )

        return node

    def _choose_best_attr(self, X, y, attributes):
        best_attr = None
        best_value = None

        best_info_gain = -np.inf

        # 计算初始信息熵
        initial_entropy = self._calculate_entropy(y)

        # 计算每个属性的信息增益，选择最大的信息增益
        for attr in attributes:
            attr_col = X[:, attr]
            unique_values = np.unique(attr_col)
            for value in unique_values:
                # 根据属性和取值划分数据集
                mask = attr_col == value
                X_subset, y_subset = X[mask], y[mask]

                # 计算划分后的加权信息熵
                subset_entropy = self._calculate_entropy(y_subset)

                info_gain = initial_entropy - subset_entropy

                # 计算信息增益
                if info_gain > best_info_gain:
                    best_info_gain = best_info_gain
                    best_attr = attr
                    best_value = value

        return best_attr, best_value

    def _calculate_entropy(self, y):
        # 计算给定标签的信息熵
        class_cnt = np.bincount(y)
        class_probs = class_cnt / len(y)
        entropy = -np.sum(class_probs * np.log2(class_probs + 1e-8))
        return entropy

    def _traverse_tree(self, node:Node, instance):
        # 遍历决策树，根据实例的属性值预测类别
        if node.label is not None:
            return node.label

        attr_value = instance[node.attribute]
        if attr_value in node.children:
            chile_node = node.children[attr_value]
            return self._traverse_tree(chile_node, instance)

        return np.random.choice(np.unique(instance))