In [27]:
import pandas as pd
import random
import numpy as np

random.seed(42)
np.random.seed(42)

In [28]:
train_df = pd.read_csv('iris.data', header=None)
train_df.rename(columns={4:'class'}, inplace=True)
train_df.head()

Unnamed: 0,0,1,2,3,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [29]:
from sklearn.model_selection import train_test_split

X = train_df.drop('class', axis=1)
y = train_df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3)

In [33]:
import numpy as np

class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index  # 分割特征的索引
        self.threshold = threshold          # 分割阈值
        self.left = left                    # 左子节点
        self.right = right                  # 右子节点
        self.value = value                  # 如果节点是叶子节点，则为其值

class DecisionTree:
    def __init__(self, max_depth=None, criterion='id3'):
        self.max_depth = max_depth    # 树的最大深度
        self.criterion = criterion    # 分割标准（id3或c45）
        
    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.n_features = X.shape[1]
        self.tree_ = self._grow_tree(X, y)
        
    def _calculate_entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy
        
    def _calculate_information_gain(self, X, y, feature_index, threshold):
        # 根据阈值分割数据
        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask
        left_y, right_y = y[left_mask], y[right_mask]
        
        # 计算父节点的熵
        parent_entropy = self._calculate_entropy(y)
        
        # 计算子节点的熵
        left_entropy = self._calculate_entropy(left_y)
        right_entropy = self._calculate_entropy(right_y)
        
        # 计算信息增益
        n = len(y)
        child_entropy = (len(left_y) / n) * left_entropy + (len(right_y) / n) * right_entropy
        information_gain = parent_entropy - child_entropy
        return information_gain
        
    def _find_best_split(self, X, y):
        best_gain = -1
        best_feature_index = None
        best_threshold = None
        
        for feature_index in range(self.n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                gain = self._calculate_information_gain(X, y, feature_index, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_threshold = threshold
        
        return best_feature_index, best_threshold
    
    def _grow_tree(self, X, y, depth=0):
        if len(np.unique(y)) == 1:  # 如果只剩下一个类别，返回叶子节点
            return Node(value=np.unique(y)[0])
        
        if self.max_depth is not None and depth >= self.max_depth:  # 如果达到最大深度，返回叶子节点
            return Node(value=np.argmax(np.bincount(y)))
        
        if len(X) == 0:  # 如果没有样本了，返回叶子节点
            return Node(value=np.argmax(np.bincount(y)))
        
        # 寻找最佳分割
        best_feature_index, best_threshold = self._find_best_split(X, y)
        if best_feature_index is None:  # 如果找不到分割点，返回叶子节点
            return Node(value=np.argmax(np.bincount(y)))
        
        # 根据最佳分割点分割数据
        left_mask = X[:, best_feature_index] <= best_threshold
        right_mask = ~left_mask
        left_X, left_y = X[left_mask], y[left_mask]
        right_X, right_y = X[right_mask], y[right_mask]
        
        # 递归生长左右子树
        left_node = self._grow_tree(left_X, left_y, depth + 1)
        right_node = self._grow_tree(right_X, right_y, depth + 1)
        
        # 返回当前节点
        return Node(feature_index=best_feature_index, threshold=best_threshold, left=left_node, right=right_node)
        
    def predict(self, X):
        return np.array([self._predict_tree(x, self.tree_) for x in X])
    
    def _predict_tree(self, x, node):
        if node.value is not None:  # 如果是叶子节点，返回值
            return node.value
        
        if x[node.feature_index] <= node.threshold:
            return self._predict_tree(x, node.left)
        else:
            return self._predict_tree(x, node.right)

# 示例用法
tree = DecisionTree(criterion='id3')
tree.fit(X_train.to_numpy(), y_train.to_numpy())

# 计算准确率
def accuracy_score(y_true, y_pred):
    return np.mean(y_true == y_pred)

# 使用测试数据进行预测
predictions = tree.predict(X_test.to_numpy())

# 计算准确率
accuracy = accuracy_score(y_test.to_numpy(), predictions)
print("准确率:", accuracy)

Accuracy: 0.9333333333333333


In [31]:
class DecisionTreeC45(DecisionTree):
    def _calculate_split_info(self, X, y, feature_index):
        # 计算特征的分割信息
        classes, counts = np.unique(X[:, feature_index], return_counts=True)
        probabilities = counts / len(y)
        split_info = -np.sum(probabilities * np.log2(probabilities))
        return split_info
        
    def _calculate_gain_ratio(self, X, y, feature_index, threshold):
        # 计算增益比
        information_gain = self._calculate_information_gain(X, y, feature_index, threshold)
        split_info = self._calculate_split_info(X, y, feature_index)
        gain_ratio = information_gain / split_info if split_info != 0 else 0
        return gain_ratio
        
    def _find_best_split(self, X, y):
        # 寻找最佳分割点
        best_gain_ratio = -1
        best_feature_index = None
        best_threshold = None
        
        for feature_index in range(self.n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                gain_ratio = self._calculate_gain_ratio(X, y, feature_index, threshold)
                if gain_ratio > best_gain_ratio:
                    best_gain_ratio = gain_ratio
                    best_feature_index = feature_index
                    best_threshold = threshold
        
        return best_feature_index, best_threshold

# C4.5决策树示例用法
tree_c45 = DecisionTreeC45()
tree_c45.fit(X_train.to_numpy(), y_train.to_numpy())
predictions_c45 = tree_c45.predict(X_test.to_numpy())

# 计算准确率
accuracy_c45 = accuracy_score(y_test.to_numpy(), predictions_c45)
print("准确率 (C4.5):", accuracy_c45)

Accuracy (C4.5): 0.8888888888888888


In [34]:
class DecisionTreeCART(DecisionTree):
    def _calculate_gini(self, y):
        # 计算基尼不纯度
        classes, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        gini = 1 - np.sum(probabilities ** 2)
        return gini
        
    def _calculate_gini_impurity(self, X, y, feature_index, threshold):
        # 根据阈值分割数据
        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask
        left_y, right_y = y[left_mask], y[right_mask]
        
        # 计算左右子节点的基尼不纯度
        left_gini = self._calculate_gini(left_y)
        right_gini = self._calculate_gini(right_y)
        
        # 计算基尼不纯度的加权平均值
        n = len(y)
        gini_impurity = (len(left_y) / n) * left_gini + (len(right_y) / n) * right_gini
        return gini_impurity
        
    def _find_best_split(self, X, y):
        # 寻找最佳分割点
        best_gini_impurity = float('inf')
        best_feature_index = None
        best_threshold = None
        
        for feature_index in range(self.n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                gini_impurity = self._calculate_gini_impurity(X, y, feature_index, threshold)
                if gini_impurity < best_gini_impurity:
                    best_gini_impurity = gini_impurity
                    best_feature_index = feature_index
                    best_threshold = threshold
        
        return best_feature_index, best_threshold

# CART决策树示例用法
tree_cart = DecisionTreeCART()
tree_cart.fit(X_train.to_numpy(), y_train.to_numpy())
predictions_cart = tree_cart.predict(X_test.to_numpy())

# 计算准确率
accuracy_cart = accuracy_score(y_test.to_numpy(), predictions_cart)
print("准确率 (CART):", accuracy_cart)

准确率 (CART): 0.9111111111111111
