In [3]:
import numpy as np
from mnist import Mnist

In [4]:
# Instantiate the project-local MNIST loader imported from the `mnist` module.
dataloadeer = Mnist()
# Default call; presumably yields the training split as (data, labels) — TODO confirm against Mnist.get_data.
train_data, train_label = dataloadeer.get_data()
# Passing False presumably selects the held-out test split — TODO confirm against Mnist.get_data.
test_data, test_label = dataloadeer.get_data(False)

### 经验熵：$H(D) = -\sum_{k} \frac{|C_k|}{|D|} \log_2 \frac{|C_k|}{|D|}$
### 信息增益：$g(D, A) = H(D) - H(D|A)$
### 基尼指数（衡量信息不纯度）：$\mathrm{Gini}(D) = 1 - \sum_{k} \left(\frac{|C_k|}{|D|}\right)^2$
- ID3：通过信息增益来进行分支，基础公式：$g(D, A) = H(D) - H(D|A)$
- C4.5：通过信息增益比来进行分支，基础公式：$g_R(D, A) = g(D, A) / H_A(D)$
- CART：通过基尼指数（Gini）来进行分支

In [4]:
def cal_entropy(label_arr):
    """Return the empirical (Shannon) entropy, in bits, of a label collection.

    Computes ``H(D) = -sum_k p_k * log2(p_k)`` where ``p_k`` is the fraction
    of entries equal to the k-th distinct label.

    Args:
        label_arr: Class labels as a NumPy array or any array-like (list,
            tuple) accepted by ``np.asarray``.

    Returns:
        float: Entropy in bits; ``0.0`` for empty input or a single class.
    """
    labels = np.asarray(label_arr)
    if labels.size == 0:
        # No samples means no uncertainty; this also avoids dividing by zero.
        return 0.0

    # Count every distinct label in one call instead of building one boolean
    # mask per label.
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / labels.size  # |C_k| / |D|; never 0, so log2 stays finite

    # Adding 0.0 normalizes the -0.0 produced by a single-class input to 0.0.
    return float(-np.sum(probs * np.log2(probs))) + 0.0

### 条件熵：$H(D|A) = \sum_{i} \frac{|D_i|}{|D|} H(D_i)$，其中 $D_i$ 表示特征 $A$ 取第 $i$ 个值的样本子集

In [8]:
def cal_conditional_entroy(feature_array: np.ndarray, label_array: np.ndarray):
    """Return the conditional entropy ``H(D|A)`` of the labels given a feature.

    ``H(D|A) = sum_i |D_i| / |D| * H(D_i)``, where ``D_i`` holds the samples
    whose feature value equals the i-th distinct value of ``feature_array``.

    Args:
        feature_array: Values of a single feature, one entry per sample
            (NumPy array or array-like).
        label_array: Class labels aligned element-wise with ``feature_array``.

    Returns:
        float: Entropy of the labels within each feature-value subset,
        weighted by subset size; ``0.0`` when there are no samples.
    """
    features = np.asarray(feature_array)
    labels = np.asarray(label_array)
    total = features.size

    H_D_A = 0.0
    for value in np.unique(features):
        # Build the subset mask once and reuse it for both the subset's
        # labels and its size.
        in_subset = features == value
        subset_size = np.count_nonzero(in_subset)
        H_D_A += cal_entropy(labels[in_subset]) * subset_size / total

    return float(H_D_A)

In [9]:
def cal_info_gain(feature_array, label_array):
    """Return the information gain ``g(D, A) = H(D) - H(D|A)`` for one feature.

    Args:
        feature_array: Values of the candidate feature, one per sample.
        label_array: Class labels aligned with ``feature_array``.

    Returns:
        The label entropy minus the conditional entropy given the feature.
    """
    entropy_before_split = cal_entropy(label_array)
    entropy_after_split = cal_conditional_entroy(feature_array, label_array)
    return entropy_before_split - entropy_after_split

In [2]:
class Node:
    """A single node of the decision tree.

    Attributes are plain fields filled in by the tree builder:
        attribute: the split attribute of this node (``None`` by default).
        value: value associated with the split attribute.
        label: class label, used when the node is a leaf.
        children: dict mapping an attribute value to the child ``Node``.
    """

    def __init__(self, attribute=None, value=None, label=None):
        # Every node starts without children; each instance gets its own dict.
        self.children = dict()
        # Class label carried by a leaf node.
        self.label = label
        # Value of the split attribute.
        self.value = value
        self.attribute = attribute

In [None]:
class DecisionTreeID3:
    def __init__(self):
        self.root = None

    def fit(self, X, y):
        self.root = self._build_tree(X,y)

    def _build_tree(self, X, y):
        attributes = list(range(X.shape[1])) # 获取所有属性的索引表

        # 递归构建决策树
        return self._recursive_build_tree(X, y, attributes)

    def _recursive_build_tree(self, X, y, attributes):
        """Recursively grow the subtree for the samples ``(X, y)``.

        Recursion stops with a leaf when every label in ``y`` is identical, or
        when no candidate attribute is left (the leaf then carries the most
        frequent label). Otherwise the attribute returned by
        ``self._choose_best_attr`` becomes the split attribute and one child
        is built for each distinct value found in that column.

        Args:
            X: 2-D array indexed as ``X[:, attr]``; rows are samples.
            y: 1-D array of labels aligned with the rows of ``X``. The
                majority vote uses ``np.bincount``, which requires
                non-negative integers.
            attributes: List of column indices still available for splitting.
                It is not modified.

        Returns:
            Node: Root of the constructed subtree.
        """
        node = Node()

        # All samples share one class: make a leaf with that class label.
        if np.all(y == y[0]):
            node.label = y[0]
            return node

        # No attribute left to split on: make a leaf with the majority class.
        if not attributes:
            node.label = np.argmax(np.bincount(y))
            return node

        best_attr, best_value = self._choose_best_attr(X, y, attributes)

        # Record the split attribute and the value reported alongside it.
        node.attribute = best_attr
        node.value = best_value

        attr_col = X[:, best_attr]

        # The remaining attributes are the same for every child, so compute
        # them once rather than once per attribute value. Children copy the
        # list before changing it, so sharing one list is safe.
        remaining_attributes = attributes.copy()
        remaining_attributes.remove(best_attr)

        for value in np.unique(attr_col):
            # Keep only the samples whose best_attr column equals this value.
            mask = attr_col == value
            X_subset, y_subset = X[mask], y[mask]

            child_node = self._recursive_build_tree(
                X_subset, y_subset, remaining_attributes
            )
            # Fix: the dict created in Node.__init__ is `children`; the
            # previous `chilren` spelling raised AttributeError.
            node.children[value] = child_node

        return node