In [1]:
%matplotlib inline

# 	3. 决策树实验

## 课程目标
1. 基于理论进一步掌握决策树学习算法。
1. 编程实现基于信息增益、增益率和基尼指数进行划分选择的未剪枝决策树学习算法。

## 课程要求
1. 将本次课程内容写在实验报告上。


## 内容1
编程实现基于信息增益进行划分选择的决策树算法，并为书中表4.2中的训练数据生成一棵决策树。

提示：
1. 树的结点可能需要定义一个类TreeNode，包含了父节点、子结点、属性选择，训练数据等有效信息；
2. 分别定义函数计算结点的信息熵、信息增益。

实验要求：
1. 读懂下面的代码，并根据自己的理解为下面的代码写上注释；
1. 将下面代码中的数据集DataSet读取改为从文件“3.0.csv”文件中读取；
1. 将以下代码缺失的部分补充完成；
4. 输出的结果如代码下一致；

In [6]:
import math
# import utils
import pandas as pd
import numpy as np


DataSet = [
        # 1
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        # 2
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        # 3
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        # 6
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],
        # 7
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],
        # ----------------------------------------------------
        # 10
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],
        # 14
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],
        # 15
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],
        # 16
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],
        # 17
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜']
    ]

wm_data = pd.read_csv(r"C:\Users\lenovo\Desktop\3.0.csv")
wm_data = wm_data.replace({"好瓜":{"是":'好瓜', '否':"坏瓜"}})
wm_data = wm_data.drop(['编号', '含糖率', '密度'], axis=1).values


Attributes = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']


class TreeNode:
    """
    决策树结点类
    """
    current_index = 0
 
    def __init__(self, parent=None, attr_name=None, children=None, judge=None,  data_index=None,
                 attr_value=None, rest_attribute=None):
        """
        决策树结点类初始化方法
        :param parent: 父节点
        """
        self.parent = parent  # 父节点，根节点的父节点为 None
        self.attribute_name = attr_name  # 本节点上进行划分的属性名
        self.attribute_value = attr_value  # 本节点上划分属性的值，是与父节点的划分属性名相对应的
        self.children = children  # 孩子结点列表
        self.judge = judge  # 叶子结点判断最终的分类（好瓜？坏瓜？）
        self.data_index = data_index  # 对应训练数据集的训练索引号
        self.index = TreeNode.current_index  # 当前结点的索引号，方便输出时查看
        self.rest_attribute = rest_attribute  # 尚未使用的属性列表
        TreeNode.current_index += 1
 
    def to_string(self):
        """
        用一个字符串来描述当前结点信息
        一个普通结点的描述--------------------------------------------
        current index : x;
        parent index : xx;
        parent node's attribute(父亲结点的属性划分（比方说色泽）)：青黑（此结点的色泽）
        data：包含的训练数据
        selected attribute(选择的属性)：xx
        children(子节点)：x x x 
        """
        this_string = 'current index : ' + str(self.index) + ";\n"
        if not (self.parent is None):
            parent_node = self.parent
            this_string = this_string + 'parent index : ' + str(parent_node.index) + ";\n"
            this_string = this_string + str(parent_node.attribute_name) + " : " + str(self.attribute_value) + ";\n"
        this_string = this_string + "data : " + str(self.data_index) + ";\n"
        if not(self.children is None):
            this_string = this_string + 'selected attribute is : ' + str(self.attribute_name) + ";\n"
            child_list = []
            for child in self.children:
                child_list.append(child.index)
            this_string = this_string + 'children : ' + str(child_list)
        if not (self.judge is None):
            this_string = this_string + 'label : ' + self.judge
        return this_string
 


    def ent(labels):
        """
        样本集合的信息熵
        :param labels: 样本集合中数据的类别标签
        :return:
        """
        label_name = []
        label_count = {}

        for label in labels:
            label_count[label] = label_count.get(label, 0) + 1
            entropy = 0

        for label in label_count:
            prob = label_count[label] / len(labels)
            entropy -= prob * math.log(prob, 2)
        
        
        return entropy

    def gain(attribute, labels):
        """
        计算信息增益
        :param attribute: 集合中所有样本该属性的值列表（例如青绿，乌黑，浅白）
        :param labels: 集合中样本的数据标签
        :return:
        """

        info_gain = TreeNode.ent(labels)
        n = len(labels)
        attr_count = {}
        for i in range(len(attribute)):
            attr = attribute[i]
            label = labels[i]
            attr_count[attr] = attr_count.get(attr, {})
            attr_count[attr][label] = attr_count[attr].get(label, 0) + 1

            total_entropy = TreeNode.ent(labels)
            weighted_entropy = 0
        for attr in attr_count:
            attr_prob = len(attr_count[attr]) / n
            entropy = 0
            for label in attr_count[attr]:
                p = attr_count[attr][label] / len(attr_count[attr])
                entropy -= p * math.log(p, 2)
            weighted_entropy += attr_prob * entropy

            info_gain = total_entropy - weighted_entropy
            
        print('信息增益：', info_gain)
        return info_gain



    def finish_node(current_node, data, label):
        """
        完成当前结点的后续计算，包括选择属性，划分子节点等
        :param current_node: 当前的结点
        :param data: 数据集
        :param label: 数据集的 label
        :param rest_title: 剩余的可用属性名
        :return:
        """
        # n = len(label)

        # 1.判断当前结点的数据是否属于同一类，如果是，直接标记为叶子结点并返回
        one_class = True

        this_data_index = current_node.data_index #训练数据在训练集中的索引序号

        for i in this_data_index:
            for j in this_data_index:
                if label[i] != label[j]:
                    one_class = False
                    break
            if not one_class:
                break
        if one_class:
            current_node.judge = label[this_data_index[0]]
            return

        # 2. 如果当前结点的数据不是同一类，但候选属性为空
        rest_title = current_node.rest_attribute  # 候选属性
        if len(rest_title) == 0:  # 如果候选属性为空，则是个叶子结点。需要选择最多的那个类作为该结点的类
            label_count = {}
            temp_data = current_node.data_index 
            for index in temp_data:
                if label_count.__contains__(label[index]):
                    label_count[label[index]] += 1
                else:
                    label_count[label[index]] = 1
            final_label = max(label_count)
            current_node.judge = final_label
            return

        # 3. 如果剩余有多个属性
        title_gain = {}  # 记录每个属性的信息增益
        for title in rest_title: #挑选属性
            attr_values = []
            current_label = []
            for index in current_node.data_index:
                this_data = data[index]
                attr_values.append(this_data[title]) # 记录此样本的属性值，直至结点所有样本的属性都记录
                current_label.append(label[index])   # 记录此样本的标签，直至结点所有样本的标签都记录
            temp_data = data[0] # temp_data中是第一个样本数据包含特征属性
            this_gain = TreeNode.gain(attr_values, current_label)  
            title_gain[title] = this_gain

        best_attr = max(title_gain, key=title_gain.get)  # 信息增益最大的属性名
        current_node.attribute_name = best_attr
        rest_title.remove(best_attr)

        # a_data = data[0] # 记录第一个数据的特征属性有什么

        # 属性划分
        best_titlevalue_dict = {}  # key是属性值的取值，value是个list记录所包含的样本序号
        for index in current_node.data_index:
            this_data = data[index] #取出该序号下的数据 --
            if best_titlevalue_dict.__contains__(this_data[best_attr]):
                temp_list = best_titlevalue_dict[this_data[best_attr]]
                temp_list.append(index)
            else:                                 #新属性值出现
                temp_list = [index]
                best_titlevalue_dict[this_data[best_attr]] = temp_list

        children_list = []
        for key, index_list in best_titlevalue_dict.items():
            a_child = TreeNode(parent=current_node, data_index=index_list, attr_value=key, rest_attribute=rest_title.copy())
            children_list.append(a_child)
        current_node.children = children_list

        # print(current_node.to_string())
        for child in current_node.children:  # 递归
            TreeNode.finish_node(child, data, label)
            
        
def id3_tree(Data, title, label):
    """
    id3方法构造决策树，使用的标准是信息增益（信息熵）
    :param Data: 数据集，每个样本是一个 dict(属性名：属性值)，整个 Data 是个大的 list
    :param title: 每个属性的名字，如 色泽、根蒂等
    :param label: 存储的是每个样本的类别
    :return:
    """
    n = len(Data)
    root_data = []
    for i in range(0, n):
        root_data.append(i)
        
    root_node = TreeNode(data_index=root_data, rest_attribute=title.copy())
    TreeNode.finish_node(root_node, Data, label)
    
    return root_node
 
    
def print_tree(root=TreeNode()):
    """
    打印输出一颗树
    :param root: 根节点
    :return:
    """
    node_list = [root]
    while(len(node_list)>0):
        current_node = node_list[0]
        print('--------------------------------------------')
        print(current_node.to_string())
        print('--------------------------------------------')
        children_list = current_node.children
        if not (children_list is None):
            for child in children_list:
                node_list.append(child)
        node_list.remove(current_node)
        
        
        
data = []  # 存放数据
label = []  # 存放标签
for sample in wm_data:
    a_dict = {}
    dim = len(sample) - 1
    for i in range(0, dim):
        a_dict[Attributes[i]] = sample[i]
    data.append(a_dict)
    label.append(sample[dim])
    
 

decision_tree = id3_tree(data, Attributes, label)
print_tree(decision_tree)

信息增益： 1.6157246054471703
信息增益： 1.94570345688463
信息增益： 2.013070929103845
信息增益： 2.1978773673436267
信息增益： 2.063350515708159
信息增益： 2.2195282822995472
信息增益： 1.1258145836939115
信息增益： 1.6137106647166901
信息增益： 1.4795739585136225
信息增益： 3.125814583693911
信息增益： 1.6341367062030678
信息增益： 0.17095059445466854
信息增益： 0.7709505944546686
信息增益： 0.9709505944546686
信息增益： 0.7709505944546686
信息增益： 1.1709505944546685
信息增益： 0.2516291673878229
信息增益： 0.5849625007211563
信息增益： 0.5849625007211563
信息增益： 0.2516291673878229
信息增益： 0.2516291673878229
信息增益： 0.5849625007211563
信息增益： 0.2516291673878229
信息增益： 0.2516291673878229
信息增益： 0.2516291673878229
信息增益： 1.0
--------------------------------------------
current index : 1;
data : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
selected attribute is : 触感;
children : [2, 3]
--------------------------------------------
--------------------------------------------
current index : 2;
parent index : 1;
触感 : 硬滑;
data : [0, 1, 2, 3, 4, 7, 8, 10, 12, 13, 15, 16];
select

## 内容2

结合搜索引擎阅读并理解以下代码。

并回答：以下代码和课上所授决策树内容有什么区别？

**注意，请将以下代码中random_state替换为自己学号的后五位。**

In [8]:
%reset -f
# 导入所需的库
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# 加载鸢尾花数据集
iris = load_iris()
X = iris.data # 特征矩阵
y = iris.target # 类别向量
feature_names = iris.feature_names # 特征名称

# 划分训练集和测试集
random_state = 60090
np.random.seed(random_state) # 设置随机种子，保证可复现性
indices = np.random.permutation(len(X)) # 生成随机索引   # np.random.permutation生成随机排序序列,多维数据的话，只是对第一维进行了操作
split = int(len(X) * 0.8) # 设置划分比例为80%
X_train = X[indices[:split]] # 训练集特征
y_train = y[indices[:split]] # 训练集类别
X_test = X[indices[split:]] # 测试集特征
y_test = y[indices[split:]] # 测试集类别

# 定义计算基尼指数的函数
def gini(y):
    """
    输入：类别向量y
    输出：基尼指数
    """
    # np.unique是去重，并按从小到大的顺序输出列表元素；return_counts=True,返回去重数组元素出现的次数；得到freqs即每个类别的频率，表示为所有类比构成的一个数组
    unique, counts = np.unique(y, return_counts=True)
    freqs = counts / len(y)  
    # 根据基尼公式--由于freqs存储当前节点处每个待分类属性的频率，使用np.sum求出在该样本中的抽取两个数，类别不一样的概率
    gini = 1 - np.sum(freqs**2)
    return gini

# 定义根据特征和阈值划分数据集的函数
def split_dataset(X, y, feature_index, threshold):
    """
    输入：特征矩阵X，类别向量y，特征索引feature_index，阈值threshold
    输出：划分后的左右子集(X_left, y_left), (X_right, y_right)
    """
    # 根据特征和阈值对数据进行二分划分
    left_indices = X[:, feature_index] <= threshold # 左子集的索引
    right_indices = X[:, feature_index] > threshold # 右子集的索引
    X_left = X[left_indices] # 左子集特征
    y_left = y[left_indices] # 左子集类别
    X_right = X[right_indices] # 右子集特征
    y_right = y[right_indices] # 右子集类别
    return (X_left, y_left), (X_right, y_right)

# 定义寻找最佳划分特征和阈值的函数
def best_split(X, y):
    """
    输入：特征矩阵X，类别向量y
    输出：最佳划分特征best_feature，最佳划分阈值best_threshold，最佳划分基尼指数best_gini
    """
    # 初始化最佳划分参数
    best_feature = None 
    best_threshold = None 
    best_gini = 1 # 最大可能的基尼指数为1
    
    n_features = X.shape[1] # 特征的数量
    
    for feature_index in range(n_features): # 遍历每个特征
        feature_values = X[:, feature_index] # 获取该特征的所有取值
        possible_thresholds = np.unique(feature_values) # 获取该特征的所有可能的阈值
        
        for threshold in possible_thresholds: # 遍历每个阈值
            # 根据该特征和阈值划分数据集为左右两个子集
            (X_left, y_left), (X_right, y_right) = split_dataset(X, y, feature_index, threshold)
            if len(y_left) == 0 or len(y_right) == 0: # 如果某个子集为空，则跳过该划分
                continue
            # 计算左右子集的权重和基尼指数
            weight_left = len(y_left) / len(y)
            weight_right = len(y_right) / len(y)
            gini_left = gini(y_left)
            gini_right = gini(y_right)
            # 计算该划分的加权基尼指数
            weighted_gini = weight_left * gini_left + weight_right * gini_right
            # 如果该划分的基尼指数小于当前最佳划分的基尼指数，则更新最佳划分参数
            if weighted_gini < best_gini:
                best_feature = feature_index
                best_threshold = threshold
                best_gini = weighted_gini
    
    return best_feature, best_threshold, best_gini


# 定义构建决策树的函数
def build_tree(X, y, max_depth=5, min_samples_split=2):
    """
    输入：特征矩阵X，类别向量y，最大深度max_depth，最小划分样本数min_samples_split
    输出：决策树，以字典的形式表示
    """
    # 创建一个空字典，用于存储决策树的信息
    tree = {}
    
    # 如果节点中的数据属于同一类别，则将节点标记为叶子节点，并返回其类别标签
    if len(np.unique(y)) == 1:
        tree["type"] = "leaf"
        tree["class"] = y[0]
        return tree
    
    # 如果节点的深度达到了最大深度，则将节点标记为叶子节点，并返回其数据中出现最多的类别标签
    if max_depth == 0:
        tree["type"] = "leaf"
        tree["class"] = np.bincount(y).argmax()
        return tree
    
    # 如果节点的数据量小于最小划分样本数，则将节点标记为叶子节点，并返回其数据中出现最多的类别标签
    if len(y) < min_samples_split:
        tree["type"] = "leaf"
        tree["class"] = np.bincount(y).argmax()
        return tree
    
    # 否则，寻找最佳划分特征和阈值，并将节点分为左右两个子节点
    best_feature, best_threshold, best_gini = best_split(X, y)
    
    # 如果没有找到合适的划分，则将节点标记为叶子节点，并返回其数据中出现最多的类别标签
    if best_feature is None or best_threshold is None:
        tree["type"] = "leaf"
        tree["class"] = np.bincount(y).argmax()
        return tree
    
    # 否则，根据最佳划分特征和阈值划分数据集为左右两个子集
    (X_left, y_left), (X_right, y_right) = split_dataset(X, y, best_feature, best_threshold)
    
    # 将节点标记为内部节点，并存储其划分特征和阈值
    tree["type"] = "internal"
    tree["feature"] = feature_names[best_feature]
    tree["threshold"] = best_threshold
    
    # 递归地对左右子节点进行同样的操作，减少最大深度
    tree["left"] = build_tree(X_left, y_left, max_depth-1, min_samples_split)
    tree["right"] = build_tree(X_right, y_right, max_depth-1, min_samples_split)
    
    return tree


# 定义根据决策树对新数据进行预测的函数
def predict(tree, x):
    """
    输入：决策树tree，单个样本x（一维数组）
    输出：预测结果y_pred（整数）
    """
    
    # 如果当前节点是叶子节点，则返回其类别标签作为预测结果
    if tree["type"] == "leaf":
        return tree["class"]
    
    # 否则，根据当前节点的划分特征和阈值将样本分配到左右子节点中
    feature_index = feature_names.index(tree["feature"]) # 获取划分特征的索引
    threshold = tree["threshold"] # 获取划分阈值
    
    if x[feature_index] <= threshold: # 如果样本的特征值小于等于阈值，则分配到左子节点
        return predict(tree["left"], x)
    else: # 否则，分配到右子节点
        return predict(tree["right"], x)

# 使用训练数据集构建决策树
tree = build_tree(X_train, y_train, max_depth=3, min_samples_split=5)

# 使用测试数据集进行预测，并计算准确率
y_pred = [predict(tree, x) for x in X_test]
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy of the decision tree is:", accuracy)
print(tree)

The accuracy of the decision tree is: 0.9333333333333333
{'type': 'internal', 'feature': 'petal length (cm)', 'threshold': 1.9, 'left': {'type': 'leaf', 'class': 0}, 'right': {'type': 'internal', 'feature': 'petal width (cm)', 'threshold': 1.6, 'left': {'type': 'internal', 'feature': 'petal length (cm)', 'threshold': 4.9, 'left': {'type': 'leaf', 'class': 1}, 'right': {'type': 'leaf', 'class': 2}}, 'right': {'type': 'leaf', 'class': 2}}}


## 内容3 在理解内容1和2的基础上，不借助sklearn编写程序，编程实现基于增益率和基尼指数进行划分选择的决策树算法，并为书中表4.2中的训练数据生成一棵决策树。

In [28]:
# 此处完成内容3的代码实现
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.model_selection import train_test_split

wm_data = pd.read_csv(r"C:\Users\lenovo\Desktop\3.0.csv")
wm_data = wm_data.drop(['编号'], axis=1).values



Attributes = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '含糖率', '密度']


def Gini(y):
    unique, counts = np.unique(y, return_counts=True)
    freqs = counts / len(y)
    
    gini = 1 - np.sum(freqs**2)
    
    return gini


def split_selection(attribute, y, split_type):
    if split_type == 1:
        return Gini(y)
    
    if split_type == 2:
        return gain(attribute, y)
        
    if split_type == 3:
        return gain_rate(attribute, y)
    
    
def Ent(labels):
    label_name = []
    label_count = {}

    for label in labels:
        label_count[label] = label_count.get(label, 0) + 1
        entropy = 0

    for label in label_count:
        prob = label_count[label] / len(labels)
        entropy -= prob * np.log2(prob)
    

    return entropy

def gain(attribute, labels):
    info_gain = Ent(labels)
    s = set(attribute)
    gain = 0
    atr_index = list(enumerate(attribute))
    
    
    for atr in s:
        p_atr = attribute.count(atr) / len(attribute)
        label_atr = []
        for j in range(len(labels)):
            if atr_index[j][1] == atr:
                label_atr.append(labels[j])
                
        ent_atr = Ent(label_atr)
        gain += p_atr*ent_atr
    

    return info_gain - gain

def gain_rate(attribute, labels):
    info_gain = Ent(labels)
    s = set(attribute)
    gain = 0
    IV_atr = 0
    atr_index = list(enumerate(attribute))
    
    
    for atr in s:
        p_atr = attribute.counts(atr) / len(attribute)
        label_atr = []
        for j in range(len(labels)):
            if atr_index[j][1] == atr:
                label_atr.append(labels[j])
                
        ent_atr = Ent(label_atr)
        gain += p_atr*ent_atr
        IV_atr += p_atr*np.log2(p_atr)
    
    gain_ratio = (info_gain - gain) / IV_atr
    
    return gain_ratio



    
class TreeNode:
    current_index = 0
    def __init__(self, parent=None, attr_name=None, children=None, judge=None,  data_index=None,
                 attr_value=None, rest_attribute=None):
        """
        决策树结点类初始化方法
        :param parent: 父节点
        """
        self.parent = parent  
        self.attribute_name = attr_name 
        self.attribute_value = attr_value  
        self.children = children 
        self.judge = judge  
        self.data_index = data_index  
        self.index = TreeNode.current_index  
        self.rest_attribute = rest_attribute  
        TreeNode.current_index += 1
 
    def to_string(self):
        """
        用一个字符串来描述当前结点信息
        一个普通结点的描述--------------------------------------------
        current index : x;
        parent index : xx;
        parent node's attribute(父亲结点的属性划分（比方说色泽）)：青黑（此结点的色泽）
        data：包含的训练数据
        selected attribute(选择的属性)：xx
        children(子节点)：x x x 
        """
        this_string = 'current index : ' + str(self.index) + ";\n"
        if not (self.parent is None):
            parent_node = self.parent
            this_string = this_string + 'parent index : ' + str(parent_node.index) + ";\n"
            this_string = this_string + str(parent_node.attribute_name) + " : " + str(self.attribute_value) + ";\n"
        this_string = this_string + "data : " + str(self.data_index) + ";\n"
        if not(self.children is None):
            this_string = this_string + 'selected attribute is : ' + str(self.attribute_name) + ";\n"
            child_list = []
            for child in self.children:
                child_list.append(child.index)
            this_string = this_string + 'children : ' + str(child_list)
        if not (self.judge is None):
            this_string = this_string + 'label : ' + self.judge
        return this_string
    
    

def finish_node(current_node, X, label, split_type):
    one_class = True
 
    this_data_index = current_node.data_index
    for i in this_data_index:
        for j in this_data_index:
            if label[i] != label[j]:
                one_class = False
                break
        if not one_class:
            break
    if one_class:
        
        current_node.judge = label[this_data_index[0]]
        return
    
    rest_title = current_node.rest_attribute 
    if len(rest_title) == 0: 
        label_count = {}
        temp_data = current_node.data_index 
        for index in temp_data:
            if label[index] in label_count:
                label_count[label[index]] += 1
            else:
                label_count[label[index]] = 1
        final_label = max(label_count)
        current_node.judge = final_label
        return
    
    
    attribute_eva = {}  # 记录每个属性的评价(1.信息增益 2.增益率 3.基尼指数)
    for attr in rest_title: #挑选属性
        attr_values = []
        current_label = []
        for index in current_node.data_index:
            this_data = X[index]
            attr_values.append(this_data[attr]) 
            current_label.append(label[index])   
        
        
        if attr not in ['密度', '含糖率']:
            this_gain = split_selection(attr_values, current_label, split_type)  
            attribute_eva[attr] = this_gain
        else:
            x_feature = sorted(set(attr_values)) 
            feature_values = []
            for i in range(len(x_feature) - 1):  # 计算划分点
                feature_values.append((float(x_feature[i]) + float(
                    x_feature[i + 1])) / 2)

            
            for threshold in feature_values:
                attr_left = []
                label_left = []
                label_right = []
                attr_right = []
                min_entropy = 100
                best_threshold = 0
                for i in range(len(attr_values)):
                    if attr_values[i] < threshold:
                        attr_left.append(attr_values[i])
                        label_left.append(current_label[i])
                    else:
                        attr_right.append(attr_values[i])
                        label_right.append(current_label[i])
                
                prob_left = len(attr_left) / len(attr_values)
                prob_right = len(attr_right) / len(attr_values)
                entropy = prob_left * Ent(label_left) + prob_right * Ent(label_right)
                
                if entropy < min_entropy:
                    min_entropy = entropy
                    best_threshold = threshold
                    
            this_gain = Ent(current_label) - min_entropy
            attribute_eva[attr] = this_gain
                
    print(attribute_eva)
    best_attr = max(attribute_eva, key=attribute_eva.get)  
    current_node.attribute_name = best_attr
    rest_title.remove(best_attr)
 
    

    # 属性划分
    best_titlevalue_dict = {}  # key是属性值的取值，value是个list记录所包含的样本序号
    for index in current_node.data_index:
        this_data = X[index] 
        if this_data[best_attr] in best_titlevalue_dict:
            temp_list = best_titlevalue_dict[this_data[best_attr]]
            temp_list.append(index)
        else:                                
            temp_list = [index]
            best_titlevalue_dict[this_data[best_attr]] = temp_list
 
    children_list = []
    for key, index_list in best_titlevalue_dict.items():
        a_child = TreeNode(parent=current_node, data_index=index_list, attr_value=key, rest_attribute=rest_title.copy())
        children_list.append(a_child)
    current_node.children = children_list
 
   
    for child in current_node.children:  
        finish_node(child, X, label, split_type)

def id3_tree(X, title, label, split_type):
    n = len(X)
    root_data = []
    for i in range(0, n):
        root_data.append(i)
        
    root_node = TreeNode(data_index=root_data, rest_attribute=title.copy())
    finish_node(root_node, X, label, split_type)
    
    return root_node
 
    
def print_tree(root=TreeNode()):
    """
    打印输出一颗树
    :param root: 根节点
    :return:
    """
    node_list = [root]
    while(len(node_list)>0):
        current_node = node_list[0]
        print('--------------------------------------------')
        print(current_node.to_string())
        print('--------------------------------------------')
        children_list = current_node.children
        if not (children_list is None):
            for child in children_list:
                node_list.append(child)
        node_list.remove(current_node)
        
        
        
data = [] 
label = []  
for sample in wm_data:
    a_dict = {}
    dim = len(sample) - 1
    for i in range(0, dim):
        a_dict[Attributes[i]] = sample[i]
    data.append(a_dict)
    label.append(sample[dim])

decision_tree = id3_tree(data, Attributes, label, 1)
print_tree(decision_tree)

{'色泽': 0.4982698961937716, '根蒂': 0.4982698961937716, '敲声': 0.4982698961937716, '纹理': 0.4982698961937716, '脐部': 0.4982698961937716, '触感': 0.4982698961937716, '含糖率': 0.06696192680347068, '密度': 0.06696192680347068}
{'根蒂': 0.5, '敲声': 0.5, '纹理': 0.5, '脐部': 0.5, '触感': 0.5, '含糖率': 0.19087450462110944, '密度': 0.19087450462110944}
{'敲声': 0.4444444444444444, '纹理': 0.4444444444444444, '脐部': 0.4444444444444444, '触感': 0.4444444444444444, '含糖率': 0.9182958340544896, '密度': 0.2516291673878229}
{'敲声': 0.5, '纹理': 0.5, '脐部': 0.5, '触感': 0.5, '含糖率': 1.0, '密度': 1.0}
{'根蒂': 0.4444444444444444, '敲声': 0.4444444444444444, '纹理': 0.4444444444444444, '脐部': 0.4444444444444444, '触感': 0.4444444444444444, '含糖率': 0.109170338675599, '密度': 0.109170338675599}
{'敲声': 0.5, '纹理': 0.5, '脐部': 0.5, '触感': 0.5, '含糖率': 0.31127812445913283, '密度': 0.31127812445913283}
{'纹理': 0.4444444444444444, '脐部': 0.4444444444444444, '触感': 0.4444444444444444, '含糖率': 0.2516291673878229, '密度': 0.9182958340544896}
{'根蒂': 0.31999999999999984, '敲声': 0.3

In [34]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.model_selection import train_test_split

wm_data = pd.read_csv(r"C:\Users\lenovo\Desktop\3.0.csv")
wm_data = wm_data.drop(['编号'], axis=1).values



Attributes = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '含糖率', '密度']


def Gini(y):
    unique, counts = np.unique(y, return_counts=True)
    freqs = counts / len(y)
    
    gini = 1 - np.sum(freqs**2)
    
    return gini


def split_selection(attribute, y, split_type):
    if split_type == 1:
        return Gini(y)
    
    if split_type == 2:
        return gain(attribute, y)
        
    if split_type == 3:
        return gain_rate(attribute, y)
    
    
def Ent(labels):
    label_name = []
    label_count = {}

    for label in labels:
        label_count[label] = label_count.get(label, 0) + 1
        entropy = 0

    for label in label_count:
        prob = label_count[label] / len(labels)
        entropy -= prob * np.log2(prob)
        
    return entropy

def gain(attribute, labels):
    info_gain = Ent(labels)
    s = set(attribute)
    gain = 0
    atr_index = list(enumerate(attribute))
    
    
    for atr in s:
        p_atr = attribute.count(atr) / len(attribute)
        label_atr = []
        for j in range(len(labels)):
            if atr_index[j][1] == atr:
                label_atr.append(labels[j])
                
        ent_atr = Ent(label_atr)
        gain += p_atr*ent_atr
    
    return info_gain - gain

def gain_rate(attribute, labels):
    info_gain = Ent(labels)
    s = set(attribute)
    gain = 0
    IV_atr = 0
    atr_index = list(enumerate(attribute))
    
    
    for atr in s:
        p_atr = attribute.counts(atr) / len(attribute)
        label_atr = []
        for j in range(len(labels)):
            if atr_index[j][1] == atr:
                label_atr.append(labels[j])
                
        ent_atr = Ent(label_atr)
        gain += p_atr*ent_atr
        IV_atr += p_atr*np.log2(p_atr)
    
    gain_ratio = (info_gain - gain) / IV_atr
    
    return gain_ratio



    
class TreeNode:
    current_index = 0
    def __init__(self, parent=None, attr_name=None, children=None, judge=None,  data_index=None,
                 attr_value=None, rest_attribute=None):
        """
        决策树结点类初始化方法
        :param parent: 父节点
        """
        self.parent = parent  
        self.attribute_name = attr_name 
        self.attribute_value = attr_value  
        self.children = children 
        self.judge = judge  
        self.data_index = data_index  
        self.index = TreeNode.current_index  
        self.rest_attribute = rest_attribute  
        TreeNode.current_index += 1
 
    def to_string(self):
        """
        用一个字符串来描述当前结点信息
        一个普通结点的描述--------------------------------------------
        current index : x;
        parent index : xx;
        parent node's attribute(父亲结点的属性划分（比方说色泽）)：青黑（此结点的色泽）
        data：包含的训练数据
        selected attribute(选择的属性)：xx
        children(子节点)：x x x 
        """
        this_string = 'current index : ' + str(self.index) + ";\n"
        if not (self.parent is None):
            parent_node = self.parent
            this_string = this_string + 'parent index : ' + str(parent_node.index) + ";\n"
            this_string = this_string + str(parent_node.attribute_name) + " : " + str(self.attribute_value) + ";\n"
        this_string = this_string + "data : " + str(self.data_index) + ";\n"
        if not(self.children is None):
            this_string = this_string + 'selected attribute is : ' + str(self.attribute_name) + ";\n"
            child_list = []
            for child in self.children:
                child_list.append(child.index)
            this_string = this_string + 'children : ' + str(child_list)
        if not (self.judge is None):
            this_string = this_string + 'label : ' + self.judge
        return this_string
    
    

def finish_node(current_node, X, label, split_type):
    one_class = True
 
    this_data_index = current_node.data_index
    for i in this_data_index:
        for j in this_data_index:
            if label[i] != label[j]:
                one_class = False
                break
        if not one_class:
            break
    if one_class:
        
        current_node.judge = label[this_data_index[0]]
        return
    
    rest_title = current_node.rest_attribute 
    if len(rest_title) == 0: 
        label_count = {}
        temp_data = current_node.data_index 
        for index in temp_data:
            if label[index] in label_count:
                label_count[label[index]] += 1
            else:
                label_count[label[index]] = 1
        final_label = max(label_count)
        current_node.judge = final_label
        return
    
    
    attribute_eva = {}  # 记录每个属性的评价(1.信息增益 2.增益率 3.基尼指数)
    for attr in rest_title: #挑选属性
        attr_values = []
        current_label = []
        for index in current_node.data_index:
            this_data = X[index]
            attr_values.append(this_data[attr]) 
            current_label.append(label[index])   
        
        
        if attr not in ['密度', '含糖率']:
            this_gain = split_selection(attr_values, current_label, split_type)  
            attribute_eva[attr] = this_gain
        else:
            x_feature = sorted(set(attr_values)) 
            feature_values = []
            for i in range(len(x_feature) - 1):  # 计算划分点
                feature_values.append((float(x_feature[i]) + float(
                    x_feature[i + 1])) / 2)

            for threshold in feature_values:
                attr_left = []
                label_left = []
                label_right = []
                attr_right = []
                min_entropy = 100
                best_threshold = 0
                for i in range(len(attr_values)):
                    if attr_values[i] < threshold:
                        attr_left.append(attr_values[i])
                        label_left.append(current_label[i])
                    else:
                        attr_right.append(attr_values[i])
                        label_right.append(current_label[i])
                
                prob_left = len(attr_left) / len(attr_values)
                prob_right = len(attr_right) / len(attr_values)
                entropy = prob_left * Ent(label_left) + prob_right * Ent(label_right)
                
                if entropy < min_entropy:
                    min_entropy = entropy
                    best_threshold = threshold
                    
            this_gain = Ent(current_label) - min_entropy
            attribute_eva[attr] = this_gain
            
    print(attribute_eva)
    best_attr = max(attribute_eva, key=attribute_eva.get)  
    current_node.attribute_name = best_attr
    rest_title.remove(best_attr)
 
    

    # 属性划分
    best_titlevalue_dict = {}  # key是属性值的取值，value是个list记录所包含的样本序号
    for index in current_node.data_index:
        this_data = X[index] 
        if this_data[best_attr] in best_titlevalue_dict:
            temp_list = best_titlevalue_dict[this_data[best_attr]]
            temp_list.append(index)
        else:                                
            temp_list = [index]
            best_titlevalue_dict[this_data[best_attr]] = temp_list
 
    children_list = []
    for key, index_list in best_titlevalue_dict.items():
        a_child = TreeNode(parent=current_node, data_index=index_list, attr_value=key, rest_attribute=rest_title.copy())
        children_list.append(a_child)
    current_node.children = children_list
 
   
    for child in current_node.children:  
        finish_node(child, X, label, split_type)

def id3_tree(X, title, label, split_type):
    n = len(X)
    root_data = []
    for i in range(0, n):
        root_data.append(i)
        
    root_node = TreeNode(data_index=root_data, rest_attribute=title.copy())
    finish_node(root_node, X, label, split_type)
    
    return root_node
 
    
def print_tree(root=TreeNode()):
    """
    打印输出一颗树
    :param root: 根节点
    :return:
    """
    node_list = [root]
    while(len(node_list)>0):
        current_node = node_list[0]
        print('--------------------------------------------')
        print(current_node.to_string())
        print('--------------------------------------------')
        children_list = current_node.children
        if not (children_list is None):
            for child in children_list:
                node_list.append(child)
        node_list.remove(current_node)
        
        
        
data = [] 
label = []  
for sample in wm_data:
    a_dict = {}
    dim = len(sample) - 1
    for i in range(0, dim):
        a_dict[Attributes[i]] = sample[i]
    data.append(a_dict)
    label.append(sample[dim])
    
    
decision_tree = id3_tree(data, Attributes, label, 2)
print_tree(decision_tree)

{'色泽': 0.10812516526536531, '根蒂': 0.14267495956679288, '敲声': 0.14078143361499584, '纹理': 0.3805918973682686, '脐部': 0.28915878284167895, '触感': 0.006046489176565584, '含糖率': 0.06696192680347068, '密度': 0.06696192680347068}
{'色泽': 0.04306839587828004, '根蒂': 0.45810589515712374, '敲声': 0.33085622540971754, '脐部': 0.45810589515712374, '触感': 0.45810589515712374, '含糖率': 0.04306839587828004, '密度': 0.04306839587828004}
{'色泽': 0.2516291673878229, '敲声': 0.0, '脐部': 0.0, '触感': 0.2516291673878229, '含糖率': 0.2516291673878229, '密度': 0.9182958340544896}
{'色泽': 0.3219280948873623, '根蒂': 0.07290559532005603, '敲声': 0.3219280948873623, '脐部': 0.17095059445466865, '触感': 0.7219280948873623, '含糖率': 0.07290559532005603, '密度': 0.07290559532005603}
--------------------------------------------
current index : 1;
data : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
selected attribute is : 纹理;
children : [2, 3, 4]
--------------------------------------------
--------------------------------------------
curre

In [43]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.model_selection import train_test_split

wm_data = pd.read_csv(r"C:\Users\lenovo\Desktop\3.0.csv")
wm_data = wm_data.drop(['编号'], axis=1).values



Attributes = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '含糖率', '密度']


def Gini(y):
    unique, counts = np.unique(y, return_counts=True)
    freqs = counts / len(y)
    
    gini = 1 - np.sum(freqs**2)
    
    return gini


def split_selection(attribute, y, split_type):
    if split_type == 1:
        return Gini(y)
    
    if split_type == 2:
        return gain(attribute, y)
        
    if split_type == 3:
        return gain_rate(attribute, y)
    
    
def Ent(labels):
    label_name = []
    label_count = {}

    for label in labels:
        label_count[label] = label_count.get(label, 0) + 1
        entropy = 0

    for label in label_count:
        prob = label_count[label] / len(labels)
        entropy -= prob * np.log2(prob)
        
    return entropy

def gain(attribute, labels):
    info_gain = Ent(labels)
    s = set(attribute)
    gain = 0
    atr_index = list(enumerate(attribute))
    
    
    for atr in s:
        p_atr = attribute.count(atr) / len(attribute)
        label_atr = []
        for j in range(len(labels)):
            if atr_index[j][1] == atr:
                label_atr.append(labels[j])
                
        ent_atr = Ent(label_atr)
        gain += p_atr*ent_atr

        
    return info_gain - gain

def gain_rate(attribute, labels):
    info_gain = Ent(labels)
    s = set(attribute)
    gain = 0
    IV_atr = 0
    atr_index = list(enumerate(attribute))
  
    
    for atr in s:
        p_atr = attribute.count(atr) / len(attribute)
        label_atr = []
        for j in range(len(labels)):
            if atr_index[j][1] == atr:
                label_atr.append(labels[j])
                
        ent_atr = Ent(label_atr)
        gain += p_atr*ent_atr
        IV_atr -= p_atr*np.log2(p_atr)
    
    gain_ratio = (info_gain - gain) / IV_atr
    
    return gain_ratio



    
class TreeNode:
    current_index = 0
    def __init__(self, parent=None, attr_name=None, children=None, judge=None,  data_index=None,
                 attr_value=None, rest_attribute=None):
        """
        决策树结点类初始化方法
        :param parent: 父节点
        """
        self.parent = parent  
        self.attribute_name = attr_name 
        self.attribute_value = attr_value  
        self.children = children 
        self.judge = judge  
        self.data_index = data_index  
        self.index = TreeNode.current_index  
        self.rest_attribute = rest_attribute  
        TreeNode.current_index += 1
 
    def to_string(self):
        """
        用一个字符串来描述当前结点信息
        一个普通结点的描述--------------------------------------------
        current index : x;
        parent index : xx;
        parent node's attribute(父亲结点的属性划分（比方说色泽）)：青黑（此结点的色泽）
        data：包含的训练数据
        selected attribute(选择的属性)：xx
        children(子节点)：x x x 
        """
        this_string = 'current index : ' + str(self.index) + ";\n"
        if not (self.parent is None):
            parent_node = self.parent
            this_string = this_string + 'parent index : ' + str(parent_node.index) + ";\n"
            this_string = this_string + str(parent_node.attribute_name) + " : " + str(self.attribute_value) + ";\n"
        this_string = this_string + "data : " + str(self.data_index) + ";\n"
        if not(self.children is None):
            this_string = this_string + 'selected attribute is : ' + str(self.attribute_name) + ";\n"
            child_list = []
            for child in self.children:
                child_list.append(child.index)
            this_string = this_string + 'children : ' + str(child_list)
        if not (self.judge is None):
            this_string = this_string + 'label : ' + self.judge
        return this_string
    
    

def finish_node(current_node, X, label, split_type):
    one_class = True
 
    this_data_index = current_node.data_index
    for i in this_data_index:
        for j in this_data_index:
            if label[i] != label[j]:
                one_class = False
                break
        if not one_class:
            break
    if one_class:
        
        current_node.judge = label[this_data_index[0]]
        return
    
    rest_title = current_node.rest_attribute 
    if len(rest_title) == 0: 
        label_count = {}
        temp_data = current_node.data_index 
        for index in temp_data:
            if label[index] in label_count:
                label_count[label[index]] += 1
            else:
                label_count[label[index]] = 1
        final_label = max(label_count)
        current_node.judge = final_label
        return
    
    
    attribute_eva = {}  # 记录每个属性的评价(1.信息增益 2.增益率 3.基尼指数)
    for attr in rest_title: #挑选属性
        attr_values = []
        current_label = []
        for index in current_node.data_index:
            this_data = X[index]
            attr_values.append(this_data[attr]) 
            current_label.append(label[index])   
        
        
        if attr not in ['密度', '含糖率']:
            this_gain = split_selection(attr_values, current_label, split_type)  
            attribute_eva[attr] = this_gain
        else:
            x_feature = sorted(set(attr_values)) 
            feature_values = []
            for i in range(len(x_feature) - 1):  # 计算划分点
                feature_values.append((float(x_feature[i]) + float(
                    x_feature[i + 1])) / 2)

            
            for threshold in feature_values:
                attr_left = []
                label_left = []
                label_right = []
                attr_right = []
                min_entropy = 100
                best_threshold = 0
                for i in range(len(attr_values)):
                    if attr_values[i] < threshold:
                        attr_left.append(attr_values[i])
                        label_left.append(current_label[i])
                    else:
                        attr_right.append(attr_values[i])
                        label_right.append(current_label[i])
                
                prob_left = len(attr_left) / len(attr_values)
                prob_right = len(attr_right) / len(attr_values)
                entropy = prob_left * Ent(label_left) + prob_right * Ent(label_right)
                
                if entropy < min_entropy:
                    min_entropy = entropy
                    best_threshold = threshold
                    
            this_gain = Ent(current_label) - min_entropy
            attribute_eva[attr] = this_gain
                

    best_attr = max(attribute_eva, key=attribute_eva.get)  
    current_node.attribute_name = best_attr
    rest_title.remove(best_attr)
 
    

    # 属性划分
    best_titlevalue_dict = {}  # key是属性值的取值，value是个list记录所包含的样本序号
    for index in current_node.data_index:
        this_data = X[index] 
        if this_data[best_attr] in best_titlevalue_dict:
            temp_list = best_titlevalue_dict[this_data[best_attr]]
            temp_list.append(index)
        else:                                
            temp_list = [index]
            best_titlevalue_dict[this_data[best_attr]] = temp_list
 
    print(best_titlevalue_dict)
    children_list = []
    for key, index_list in best_titlevalue_dict.items():
        a_child = TreeNode(parent=current_node, data_index=index_list, attr_value=key, rest_attribute=rest_title.copy())
        children_list.append(a_child)
    current_node.children = children_list
 
   
    for child in current_node.children:  
        finish_node(child, X, label, split_type)

def id3_tree(X, title, label, split_type):
    n = len(X)
    root_data = []
    for i in range(0, n):
        root_data.append(i)
        
    root_node = TreeNode(data_index=root_data, rest_attribute=title.copy())
    finish_node(root_node, X, label, split_type)
    
    return root_node
 
    
def print_tree(root=TreeNode()):
    """
    打印输出一颗树
    :param root: 根节点
    :return:
    """
    node_list = [root]
    while(len(node_list)>0):
        current_node = node_list[0]
        print('--------------------------------------------')
        print(current_node.to_string())
        print('--------------------------------------------')
        children_list = current_node.children
        if not (children_list is None):
            for child in children_list:
                node_list.append(child)
        node_list.remove(current_node)
        
        
        
data = [] 
label = []  
for sample in wm_data:
    a_dict = {}
    dim = len(sample) - 1
    for i in range(0, dim):
        a_dict[Attributes[i]] = sample[i]
    data.append(a_dict)
    label.append(sample[dim])
    
decision_tree = id3_tree(data, Attributes, label, 3)
print_tree(decision_tree)

{'清晰': [0, 1, 2, 3, 4, 5, 7, 9, 14], '稍糊': [6, 8, 12, 13, 16], '模糊': [10, 11, 15]}
{'硬滑': [0, 1, 2, 3, 4, 7], '软粘': [5, 9, 14]}
{0.403: [5], 0.243: [9], 0.36: [14]}
{'软粘': [6], '硬滑': [8, 12, 13, 16]}
--------------------------------------------
current index : 1;
data : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
selected attribute is : 纹理;
children : [2, 3, 4]
--------------------------------------------
--------------------------------------------
current index : 2;
parent index : 1;
纹理 : 清晰;
data : [0, 1, 2, 3, 4, 5, 7, 9, 14];
selected attribute is : 触感;
children : [5, 6]
--------------------------------------------
--------------------------------------------
current index : 3;
parent index : 1;
纹理 : 稍糊;
data : [6, 8, 12, 13, 16];
selected attribute is : 触感;
children : [10, 11]
--------------------------------------------
--------------------------------------------
current index : 4;
parent index : 1;
纹理 : 模糊;
data : [10, 11, 15];
label : 否
-----------------------

In [23]:
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


class TreeNode(object):
    
    def __init__(self, model=None, C=None, left=None, right=None):
        self.model = model
        self.C = C
        self.left = left
        self.right = right

def trainLinear(linear, x, y):
    linear.fit(x, y)
    return linear

def binaryTrainSet(linear, x, y):
    x0 = []
    x1 = []
    y0 = []
    y1 = []
    p = linear.predict(x)
    for i in range(p.shape[0]):
        if p[i] <= 0:
            x0.append(x[i])
            y0.append(y[i])
        else:
            x1.append(x[i])
            y1.append(y[i])
    return np.array(x0), np.array(x1), np.array(y0), np.array(y1)

def score(linear, x, y):
    #计算线性模型linear的精度
    right = 0
    p = linear.predict(x)
    for i in range(p.shape[0]):
        if p[i]<=0 and y[i]==-1 or p[i]>0 and y[i]==1:
            right += 1
    return right / x.shape[0]
    
def treeGenerate(root, x, y, precision):
    root.model = LinearRegression()
    root.model = trainLinear(root.model, x, y)
    x0, x1, y0, y1 = binaryTrainSet(root.model, x, y)
    
    if len(x0)==0 or score(root.model, x0, y0)>= precision:
        root.left = TreeNode(C=-1)
    else:
        root.left = TreeNode()
        treeGenerate(root.left, x0, y0, precision)
    
    if len(x1)==0 or score(root.model, x1, y1) >= precision:
        root.right = TreeNode(C=1)
    else:
        root.right = TreeNode()
        treeGenerate(root.right, x1, y1, precision)

def predict(root, xs):
    if root.C is not None:
        return root.C
    else:
        if root.model.predict(np.expand_dims(xs, axis=0)) <= 0:
            return predict(root.left, xs)
        else:
            return predict(root.right, xs)

def evaluate(root, x, y):
    right = 0
    for i in range(x.shape[0]):
        if predict(root, x[i]) == y[i]:
            right += 1
    return right / x.shape[0]


wm_data = pd.read_csv(r"C:\Users\lenovo\Desktop\3.0.csv")
x = wm_data.drop(['编号', '色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '好瓜'], axis=1).values
wm_data['好瓜'] = wm_data['好瓜'].map({'是': 1, '否': 0})
y = wm_data["好瓜"].values

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=60090)
y_train[y_train == 0] = -1
y_test[y_test == 0] = -1
root = TreeNode()
treeGenerate(root, X_train, y_train, 0.96)

In [24]:
scoreTrain = evaluate(root, X_train, y_train)
scoreTest = evaluate(root, X_test, y_test)
print('训练集精度为:', round(scoreTrain,4))
print('测试集精度为:', round(scoreTest, 4))

训练集精度为: 1.0
测试集精度为: 0.8333
