In [18]:
import pandas as pd

# 导入数据
# 构建数据集
df = pd.DataFrame(pd.read_csv("../Data/watermelon2.0.csv", encoding="ansi"))
df.drop(labels=["编号"], axis=1, inplace=True)  # 删除编号这一列，inplace=True表示直接在原对象修改
# 转化为列表
dataset = df.values.tolist()
# 第4，5，8，9，11，12，13行作为测试集
dataset_test = [dataset[i-1] for i in [4,5,8,9,11,12,13]]

# 其余作为训练集
dataset_train = [dataset[i-1] for i in range(len(dataset)) if i not in [4,5,8,9,11,12,13]]

# 打印数据集
for i in dataset:
    print(i)
    
# 属性
features = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']

['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是']
['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是']
['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是']
['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是']
['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是']
['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '是']
['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '是']
['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '是']
['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '否']
['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '否']
['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '否']
['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '否']
['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '否']
['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '否']
['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '否']
['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '否']
['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '否']


In [19]:
import numpy as np

# 计算某个特征下某个取值的基尼系数
def Gini_index(dataset):
    '''
    @brief:calculate the Gini index of a dataset
    @param dataset: the dataset to be calculated
    @return: the Gini index of the dataset
    '''
    num_samples  = len(dataset)
    if(num_samples == 0):
        return 0
    # 统计该取值下对应每个标签的数量
    label_count = {}
    for sample in dataset:
        if(sample[-1] not in label_count):
            label_count[sample[-1]] = 0
        label_count[sample[-1]] += 1
    
    # 计算Gini指数
    Gini = 1.0
    for i in label_count:
        Pk = label_count[i] / num_samples
        Gini -= Pk**2

    return Gini
    
    


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import operator

# 创建决策树（不含裁剪）
def createTree(dataset, features):
    '''
    @brief: create a decision tree by using the ID3 algorithm
    @param dataset: the dataset to be used for training
    @param features: the features to be used for training
    @return: the decision tree
    '''
    # 取出所有样本的标签
    classList = [example[-1] for example in dataset]
    # 如果所有样本的标签相同，则返回该标签
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # 如果特征集为空，则返回出现次数最多的标签
    if len(dataset[0]) == 1:
        return majorityCnt(classList)

    # 选择最优特征进行数据集划分
    bestfeatureIndex, bestValue = chooseBestFeatureToSplit(dataset)
    bestFeatLabel = features[bestfeatureIndex]

    # 创建节点
    myTree = {bestFeatLabel: {}}
    # 使用副本避免修改原始列表
    subfeatures = features.copy()  
    # 连续特征
    if type(bestValue).__name__ == 'float':
        myTree[bestFeatLabel]['<=' + str(bestValue)] = createTree(splitDataSetByValue(dataset, bestfeatureIndex, bestValue, True), subfeatures)
        myTree[bestFeatLabel]['>' + str(bestValue)] = createTree(splitDataSetByValue(dataset, bestfeatureIndex, bestValue, False), subfeatures)
    # 离散特征
    else:
        # 去除当前特征
        del subfeatures[bestfeatureIndex] 
        # 取出当前特征的取值
        featValue = [example[bestfeatureIndex] for example in dataset]
        uniqueVals = set(featValue)
        # 递归每一个特征值
        for value in uniqueVals:
            myTree[bestFeatLabel][value] = createTree(splitDataSet(dataset, bestfeatureIndex, value), subfeatures)

    return myTree

#创建决策树(预裁剪)
def createTree_prepruning(dataset, features, dataset_test):
    '''
    @brief: create a decision tree by using the ID3 algorithm
    @param dataset: the dataset to be used for training
    @param features: the features to be used for training
    @return: the decision tree
    '''
    # 取出所有样本的标签
    classList = [example[-1] for example in dataset]
    # 如果所有样本的标签相同，则返回该标签
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # 如果特征集为空，则返回出现次数最多的标签
    if len(dataset[0]) == 1:
        return majorityCnt(classList)
    
    # 计算不分裂时的准确率（叶节点多数类）
    majority_class = majorityCnt(classList)
    accuracy_before = sum(1 for ex in dataset_test if ex[-1] == majority_class) / len(dataset_test)

    # 选择最优特征进行数据集划分
    bestfeatureIndex, bestValue = chooseBestFeatureToSplit(dataset)
    bestFeatLabel = features[bestfeatureIndex]

    # 创建节点
    myTree = {bestFeatLabel: {}}
    # 使用副本避免修改原始列表
    subfeatures = features.copy()  
    # 连续特征
    if type(bestValue).__name__ == 'float':
        myTree[bestFeatLabel]['<=' + str(bestValue)] = createTree_prepruning(splitDataSetByValue(dataset, bestfeatureIndex, bestValue, True), subfeatures,dataset_test)
        myTree[bestFeatLabel]['>' + str(bestValue)] = createTree_prepruning(splitDataSetByValue(dataset, bestfeatureIndex, bestValue, False), subfeatures,dataset_test)
    # 离散特征
    else:
        # 去除当前特征
        del subfeatures[bestfeatureIndex] 
        # 取出当前特征的取值
        featValue = [example[bestfeatureIndex] for example in dataset]
        uniqueVals = set(featValue)
        # 递归每一个特征值
        for value in uniqueVals:
            myTree[bestFeatLabel][value] = createTree_prepruning(splitDataSet(dataset, bestfeatureIndex, value), subfeatures,dataset_test)
    # 计算分裂后的准确率
    accuracy_after = calculateAccuracy(myTree, dataset_test)
    if accuracy_after > accuracy_before:
        return myTree
    return majority_class
    
def postPrune(tree, train_data, test_data, features):
    '''
    @brief: post-prune the decision tree
    @param tree: the decision tree to be pruned
    @param train_data: the training dataset 
    @param test_data: the testing dataset
    @param features: the features to be used for training
    @return: the pruned decision tree
    '''
    if not isinstance(tree, dict):  # 到达叶节点，无需剪枝
        return tree
    
    # 深度优先遍历子树
    current_feat = list(tree.keys())[0]
    sub_tree = tree[current_feat]
    feat_idx = features.index(current_feat)
    
    # 递归剪枝所有子节点
    for key in list(sub_tree.keys()):
        # 处理连续特征（如 "<=0.5"）
        if isinstance(key, str) and ('<=' in key or '>' in key):
            # 提取阈值
            threshold = float(key.split('=')[1])
            # 划分训练集用于计算多数类
            subset_train = [ex for ex in train_data if (ex[feat_idx] <= threshold and key.startswith('<=')) or (ex[feat_idx] > threshold and key.startswith('>'))]
            # 递归剪枝子节点
            sub_tree[key] = postPrune(sub_tree[key], subset_train, test_data, features)
        # 处理离散特征（如 "硬滑"）
        else:
            subset_train = [ex for ex in train_data if ex[feat_idx] == key]
            sub_tree[key] = postPrune(sub_tree[key], subset_train, test_data, features)
    
    # 尝试剪枝当前节点
    accuracy_before = calculateAccuracy(tree, test_data)
    
    # 计算当前节点下训练集的多数类
    class_list = [ex[-1] for ex in train_data]
    majority_class = majorityCnt(class_list)
    
    # 计算替换为叶节点后的准确率
    accuracy_after = sum(1 for ex in test_data if ex[-1] == majority_class) / len(test_data) if test_data else 0
    
    # 若剪枝后准确率不下降，则剪枝
    if accuracy_after >= accuracy_before:
        return majority_class
    else:
        return tree
    
# 计算类别中出现次数最多的元素
def majorityCnt(classList):
    # 创建一个字典{类标签:出现次数}
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys(): classCount[vote] = 0
        classCount[vote] += 1
    # 降序排序[(类标签,出现次数),(),()]
    sortedclassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclassCount[0][0]

# 选择最优特征进行数据集划分
def chooseBestFeatureToSplit(dataset):
    numFeatures = len(dataset[0]) - 1
    minGini = 1
    bestFeature = -1
    bestValue = 0
    # 遍历所有特征
    for i in range(numFeatures):
        # 取出第i个特征
        featList = [example[i] for example in dataset]
        # 连续特征
        if type(featList[0]).__name__ == 'float':
            # 排序
            sortedfeatList = sorted(featList)
            splitList = []
            # 计算切分点
            for j in range(len(sortedfeatList) - 1):
                splitVal = (sortedfeatList[j] + sortedfeatList[j + 1]) / 2.0
                splitList.append(splitVal)
            # 计算Gini指数
            for val in set(splitList):
                Gini = 0
                subDataSet1 = splitDataSetByValue(dataset, i, val, True)
                subDataSet2 = splitDataSetByValue(dataset, i, val, False)
                prob1 = len(subDataSet1) / float(len(dataset))
                Gini1 = Gini_index(subDataSet1)
                prob2 = len(subDataSet2) / float(len(dataset))
                Gini2 = Gini_index(subDataSet2)
                Gini += prob1 * Gini1 + prob2 * Gini2
                
                if (Gini < minGini):
                    minGini = Gini
                    bestFeature = i
                    bestValue = val
        # 离散特征
        else:
            uniqueVals = set(featList)
            Gini = 0
            # 遍历所有取值
            for val in uniqueVals:
                subDataSet = splitDataSet(dataset, i, val)
                prob = len(subDataSet) / float(len(dataset))
                Gini += prob * Gini_index(subDataSet)
            
            if (Gini < minGini):
                minGini = Gini
                bestFeature = i
                bestValue = None
    return bestFeature, bestValue

# 根据特征值划分数据集
def splitDataSet(dataset, axis, val):
    retDataSet = []
    for featVec in dataset:
        if featVec[axis] == val:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

# 根据特征值和方向划分数据集
def splitDataSetByValue(dataset, axis, val, direction):
    retDataSet = []
    for featVec in dataset:
        if direction:
            if featVec[axis] <= val:
                reducedFeatVec = featVec[:axis]
                reducedFeatVec.extend(featVec[axis + 1:])
                retDataSet.append(reducedFeatVec)
        else:
            if featVec[axis] > val:
                reducedFeatVec = featVec[:axis]
                reducedFeatVec.extend(featVec[axis + 1:])
                retDataSet.append(reducedFeatVec)
    return retDataSet

def predict(inputTree, features, testVec):
    '''
    @brief: predict the label of a test vector using a decision tree
    @param inputTree: the decision tree to be used for prediction
    @param features: the features to be used for training
    @param testVec: the test vector to be predicted
    @return: the predicted label of the test vector
    '''
    # 若为叶节点，则返回标签
    if not isinstance(inputTree, dict):
        return inputTree
    
    # 提取当前节点
    firstStr = list(inputTree.keys())[0]
    # 提取当前节点下的子节点
    secondDict = inputTree[firstStr]
    # 获取当前节点的特征标签
    featureIndex = features.index(firstStr)

    for key in secondDict.keys():
        # 处理连续特征（如 "<=0.5"）
        if type(key).__name__ == 'str' and ('<=' in key or '>' in key):
            # 移除字符串中的符号，取出阈值
            threshold = float(key.replace('<=', '').replace('>', ''))
            # 当前特征值小于等于阈值，则进入左子树
            if key.startswith('<=') and testVec[featureIndex] <= threshold:
                childTree = secondDict[key]
                # 判断是否为内部节点，若是，则表示不是叶子节点，继续递归
                if isinstance(childTree, dict):
                    return predict(childTree, features, testVec)
                else:
                    return childTree
            elif key.startswith('>') and testVec[featureIndex] > threshold:
                childTree = secondDict[key]
                if isinstance(childTree, dict):
                    return predict(childTree, features, testVec)
                else:
                    return childTree
        # 处理离散特征（如 "硬滑"）
        else:
            if testVec[featureIndex] == key:
                childTree = secondDict[key]
                if isinstance(childTree, dict):
                    return predict(childTree, features, testVec)
                else:
                    return childTree
    # 若未匹配任何分支
    return "未知类别"  

# 计算准确率
def calculateAccuracy(inputTree,testData):
    current_accuracy = 0
    for sample in testData:
        # 去除标签
        testVec = sample[:-1]
        # 预测
        prediction = predict(inputTree, features, testVec)
        # 计算准确率
        if prediction == sample[-1]:
            current_accuracy += 1
    return current_accuracy / float(len(testData))



In [21]:
# 创建决策树
myWholeTree = createTree(dataset_train,features)
# 打印决策树
print("完整决策树:")
print(myWholeTree)
# 计算准确率
accuracy = calculateAccuracy(myWholeTree,dataset_test)
print("完整决策树的准确率:",accuracy)

myPrepruningTree = createTree_prepruning(dataset_train,features,dataset_test)
print("预剪枝决策树:")
print(myPrepruningTree)
accuracy = calculateAccuracy(myPrepruningTree,dataset_test)
print("预剪枝决策树的准确率:",accuracy) 

myPostpruningTree = postPrune(myWholeTree,dataset_train,dataset_test,features)
print("后剪枝决策树:")
print(myPostpruningTree)
accuracy = calculateAccuracy(myPostpruningTree,dataset_test)
print("后剪枝决策树的准确率:",accuracy)



完整决策树:
{'色泽': {'浅白': '否', '青绿': {'敲声': {'沉闷': '否', '浊响': '是', '清脆': '否'}}, '乌黑': {'根蒂': {'稍蜷': {'纹理': {'清晰': '否', '稍糊': '是'}}, '蜷缩': '是'}}}}
完整决策树的准确率: 0.2857142857142857
预剪枝决策树:
否
预剪枝决策树的准确率: 0.5714285714285714
后剪枝决策树:
否
后剪枝决策树的准确率: 0.5714285714285714
