In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.utils import shuffle

# 载入数据
dataset = np.loadtxt('../machine learning note/watermelon_3a.csv', delimiter=',')
X = dataset[:,1:3]
y = dataset[:,3]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)

# 绘制散点图
plt.scatter(X[y==0][:,0],X[y==0][:,1],color='red',s=100,label='Good')
plt.scatter(X[y==1][:,0],X[y==1][:,1],color='blue',s=100,label='Bad')
plt.xlabel('Density')
plt.ylabel('Sugar')
plt.legend()
plt.show()
        
# Sigmoid函数
def sigmoid(X,beta):
    return 1.0 / (1+np.exp(-np.dot(X,beta)))

# 分布率
def likelihood_sub(X,y,beta):
    return -y * np.dot(beta,X.T) + np.log(1 + np.exp(np.dot(beta,X.T)))

# 似然函数
def likelihood(X,y,beta):
    num = len(y)
    sum = 0
    for i in range(num):
        sum += likelihood_sub(X[i],y[i],beta)
    return sum

# 梯度下降每一次参数更新，对应公式3.30
def gradient_step(alpha,X,y,beta):
    prediction  = sigmoid(X,beta)
    error = prediction - y
    gradientStep = alpha * np.dot(X.T,error)    
    beta -= gradientStep
    return beta

# 批量梯度下降
def gradient_descent(X,y,beta,alpha,num_iterations):   
    for i in range(num_iterations):
        beta = gradient_step(alpha,X,y,beta)
        loss = likelihood(X,y,beta)
        # print('Iteration:',i,'Loss:',loss)
    return beta,loss

# 随机梯度下降（每次只沿着一个方向下降）
def stochastic_gradient_descent(X, y, beta, alpha, num_epochs):
    sample_size = X.shape[0]
    for i in range(num_epochs):
        X, y = shuffle(X, y)  # 打乱数据顺序
        for j in range(sample_size):
            beta = gradient_step(alpha, X[j], y[j], beta)
        loss = likelihood(X, y, beta)
    return beta, loss

# 预测函数
def predict(X,beta):
    sample_size = X.shape[0]
    y = np.zeros(sample_size)
    for i in range(sample_size):
        if sigmoid(X[i],beta) > 0.5:
            y[i] = 1
        else:
            y[i] = 0
    return y

# 计算准确率
def accuracy(y_true,y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

# 训练模型
feature_size = X_train.shape[1]
beta = np.zeros(feature_size + 1)
alpha = 0.1
num_iterations = 500
X_train = np.c_[X_train,np.ones((X_train.shape[0],1))]
X_test = np.c_[X_test,np.ones((X_test.shape[0],1))]
beta,loss = gradient_descent(X_train,y_train,beta,alpha,num_iterations)
print('beta:',beta)
predictions = predict(X_test,beta)
accuracy_score = accuracy(y_test,predictions)
print('accuracy_score:',accuracy_score)




    


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

class LogisticRegression:
    def __init__(self,fit_intercept=True,method='batch',
                 learning_rate=0.1,max_iter=1000,random_state=None):
        """
        逻辑回归分类器
        
        参数:
        - fit_intercept: 是否添加偏置项 (默认True)
        - method: 优化方法 ['batch'(批量梯度下降), 'stochastic'(随机梯度下降)] (默认'batch')
        - learning_rate: 学习率 (默认0.1)
        - max_iter: 最大迭代次数 (默认500)
        - random_state: 随机种子 (默认None)
        """
        self.fit_intercept = fit_intercept
        self.method = method
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.random_state = random_state
        self.beta = None
        self.loss_history = []

    def add_intercept(self,X):
        """设置偏置项(X,1)"""
        if self.fit_intercept:
            return np.c_[X,np.ones(X.shape[0])]
        else:
            return X
        
    def sigmoid(self,z):
        """sigmoid函数"""
        return 1.0 / (1.0 + np.exp(-z))
    
    def loss(self,X,y):
        """计算似然损失(3.27)"""
        z = X @ self.beta
        loss_terms = -y * z + np.log(1 + np.exp(-z))
        return np.sum(loss_terms)

    def batch_gradient_step(self,X,y):
        """批量梯度下降更新参数(3.30)""" 
        p = self.sigmoid(X @ self.beta)
        gradent = X.T @ (p - y)
        self.beta -= self.learning_rate * gradent

    def stochastic_gradient_step(self,X,y):
        """随机梯度下降更新参数(一次随机更新一个方向)"""
        for i in range(X.shape[0]):
             p = self.sigmoid(X[i] @ self.beta)
             gradent = X[i] * (p - y[i])
             self.beta -= self.learning_rate * gradent
        
    def fit(self,X,y):
        """
        训练模型
        
        参数:
        - X: 特征矩阵 (n_samples, n_features)
        - y: 标签向量 (n_samples,)
        """

        # 数据预处理(m个数据，n-1个特征)
        X = self.add_intercept(X)
        m,n = X.shape
        self.beta = np.zeros(n)

        # 设置随机种子
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # 选择优化方法
        if self.method == 'batch':
            optimizer = self.batch_gradient_step
        elif self.method =='stochastic':
            optimizer = self.stochastic_gradient_step
        else:
            raise ValueError("method must be 'batch' or'stochastic'")

        # 训练模型
        for epoch in range(self.max_iter):
            # 随机梯度下降每次迭代后随机打乱数据
            if self.method == 'stochastic':
                X,y = shuffle(X,y)

            # 执行一次梯度下降更新
            optimizer(X,y)

            # 记录损失
            self.loss_history.append(self.loss(X,y))

        return self
    
    def predict_proba(self,X):
        """返回预测概率"""
        X = self.add_intercept(X)
        return self.sigmoid(X @ self.beta)
    
    def predict(self,X):
        """返回预测类别"""
        proba = self.predict_proba(X)
        return (proba >= 0.5).astype(int)

    def score(self,X,y):
        """计算准确率"""
        y_pred = self.predict(X)
        return np.mean(y_pred == y)
    
    def plot_loss_curve(self):
        """绘制损失变化图"""
        plt.plot(self.loss_history)
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.show()

    def plot_decision_boundary(self,X,y):
        """绘制决策边界"""
        if X.shape[1] != 2:
            raise ValueError("只支持二维特征可视化!")
        # 提取模型参数
        w1, w2, b = self.beta[0], self.beta[1], self.beta[2]
        
        # 生成网格点坐标
        x_min, x_max = X[:,0].min()-0.1, X[:,0].max()+0.1
        y_min, y_max = X[:,1].min()-0.1, X[:,1].max()+0.1
        xx = np.linspace(x_min, x_max, 100)
        
        # 计算决策边界直线方程：w1*x1 + w2*x2 + b = 0 → x2 = (-w1*x1 -b)/w2
        decision_line = (-w1 * xx - b) / w2
        
        # 创建画布
        plt.figure(figsize=(8,6))
        
        # 绘制原始数据点
        plt.scatter(X[y==0][:,0], X[y==0][:,1], color='red', 
                    edgecolor='k', s=100, label='Bad')
        plt.scatter(X[y==1][:,0], X[y==1][:,1], color='blue', 
                    edgecolor='k', s=100, label='Good')
        
        # 绘制决策边界
        plt.plot(xx, decision_line, 'k--', lw=2, 
                label=f'Decision Boundary\n{w1:.2f}x1 + {w2:.2f}x2 + {b:.2f} = 0')
        
        # 美化显示
        plt.xlabel("Density")
        plt.ylabel("Sugar")
        plt.title("Decision Boundary")
        plt.legend(bbox_to_anchor=(1, 0.5), loc='center left')
        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)
        plt.grid(alpha=0.3)
        plt.show()
        

if __name__ == '__main__':
    # 加载数据
    dataset = np.loadtxt('../machine learning note/watermelon_3a.csv', delimiter=',')
    X = dataset[:, 1:3]
    y = dataset[:, 3]

    # 划分数据集(0.3的测试集)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # 初始化模型(批量梯度下降)
    model = LogisticRegression(
        method='batch',       
        learning_rate=0.1,
        max_iter=1000,
        random_state=42
    )

    # 训练模型
    model.fit(X_train, y_train)

    # 评估模型
    print("模型参数:", model.beta)
    print("测试集准确率:", model.score(X_test, y_test))

    # 损失可视化
    model.plot_loss_curve()

    # 决策边界可视化
    model.plot_decision_boundary(X_train, y_train)

            
            









In [None]:
import numpy as np
import matplotlib.pyplot as plt

dataset = np.loadtxt('../machine learning note/watermelon_3a.csv',delimiter=',')
X = dataset[:,1:3]
y = dataset[:,3]


# 计算均值
mean1 = np.mean(X[y == 1],axis = 0)
mean0 = np.mean(X[y == 0],axis = 0)

print(mean1,mean0)

In [None]:
# 计算类内散度矩阵Sw(公式3.33)
X0 = X[y == 0] - mean0
X1 = X[y == 1] - mean1
Sw = X0.T @ X0 + X1.T @ X1

# 计算LDA(公式3.39)
w = np.linalg.inv(Sw) @ (mean0 - mean1)

# 绘制数据集
plt.scatter(X[y == 1][:,0],X[y == 1][:,1],c='r',marker='o',label='正例')
plt.scatter(X[y == 0][:,0],X[y == 0][:,1],c='b',marker='o',label='反例')
plt.xlabel('dencity')
plt.ylabel('suger')
plt.title('LDA')

# 计算每个数据点在投影方向上的投影点(降维)
X_projected = X @ w

# 创建一个用于投影的二维数组
w_prime_2d = np.array([w[0], w[1]])
X_projected_points = X_projected[:, np.newaxis] * w_prime_2d

# 绘制投影点
plt.scatter(X_projected_points[y == 1][:, 0], X_projected_points[y == 1][:, 1], c='b', s=50, alpha=0.5, label='正例投影')
plt.scatter(X_projected_points[y == 0][:, 0], X_projected_points[y == 0][:, 1], c='r', s=50, alpha=0.5, label='反例投影')

# 绘制从数据点到投影点的线段
for i in range(len(X)):
    plt.plot([X[i][0], X_projected_points[i][0]], [X[i][1], X_projected_points[i][1]], color='gray', linestyle='--', alpha=0.5)

   



In [None]:
a = {'纹理': {'模糊': '否', '清晰': {'密度': {'<=0.3815': '否', '>0.3815': '是'}}, '稍糊': {'触感': {'软粘': '是', '硬滑': '否'}}}}
b = a['纹理']
print("b:", b)
c = b['稍糊']
print("c:", c)


b: {'模糊': '否', '清晰': {'密度': {'<=0.3815': '否', '>0.3815': '是'}}, '稍糊': {'触感': {'软粘': '是', '硬滑': '否'}}}
c: {'触感': {'软粘': '是', '硬滑': '否'}}


In [22]:
a = '>100'
b = a.strip('<=').strip('>')

print(b)

100


In [None]:
import matplotlib.pyplot as plt
from math import log
import pandas as pd
import numpy as np
import operator

#创建决策树
def createTree(dataset, features):
    '''
    @brief: create a decision tree by using the ID3 algorithm
    @param dataset: the dataset to be used for training
    @param features: the features to be used for training
    @return: the decision tree
    '''
    # 取出所有样本的标签
    classList = [example[-1] for example in dataset]
    # 如果所有样本的标签相同，则返回该标签
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # 如果特征集为空，则返回出现次数最多的标签
    if len(dataset[0]) == 1:
        return majorityCnt(classList)
    # 选择最优特征进行数据集划分
    bestfeatureIndex, bestValue = chooseBestFeatureToSplit(dataset)
    bestFeatLabel = features[bestfeatureIndex]
    
    # 创建节点
    myTree = {bestFeatLabel: {}}
    # 使用副本避免修改原始列表
    subfeatures = features.copy()  
    # 连续特征
    if type(bestValue).__name__ == 'float':
        myTree[bestFeatLabel]['<=' + str(bestValue)] = createTree(splitDataSetByValue(dataset, bestfeatureIndex, bestValue, True), subfeatures)
        myTree[bestFeatLabel]['>' + str(bestValue)] = createTree(splitDataSetByValue(dataset, bestfeatureIndex, bestValue, False), subfeatures)
    # 离散特征
    else:
        # 去除当前特征
        del subfeatures[bestfeatureIndex]  # 在副本中删除当前特征
        # 取出当前特征的取值
        featValue = [example[bestfeatureIndex] for example in dataset]
        uniqueVals = set(featValue)
        # 递归每一个特征值
        for value in uniqueVals:
            myTree[bestFeatLabel][value] = createTree(splitDataSet(dataset, bestfeatureIndex, value), subfeatures)
    return myTree

# 计算类别中出现次数最多的元素
def majorityCnt(classList):
    # 创建一个字典{类标签:出现次数}
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys(): classCount[vote] = 0
        classCount[vote] += 1
    # 降序排序[(类标签,出现次数),(),()]
    sortedclassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclassCount[0][0]

# 选择最优特征进行数据集划分
def chooseBestFeatureToSplit(dataset):
    numFeatures = len(dataset[0]) - 1
    baseEntropy = calcShannonEnt(dataset)
    bestInfoGain = 0
    bestFeature = -1
    bestValue = 0
    # 遍历所有特征
    for i in range(numFeatures):
        # 取出第i个特征
        featList = [example[i] for example in dataset]
        # 连续特征
        if type(featList[0]).__name__ == 'float':
            # 排序
            sortedfeatList = sorted(featList)
            splitList = []
            # 计算切分点
            for j in range(len(sortedfeatList) - 1):
                splitVal = (sortedfeatList[j] + sortedfeatList[j + 1]) / 2.0
                splitList.append(splitVal)
                
            # 计算信息增益
            for val in set(splitList):
                newEntropy = 0
                subDataSet1 = splitDataSetByValue(dataset, i, val, True)
                subDataSet2 = splitDataSetByValue(dataset, i, val, False)
                prob1 = len(subDataSet1) / float(len(dataset))
                newEntropy += prob1 * calcShannonEnt(subDataSet1)
                prob2 = len(subDataSet2) / float(len(dataset))
                newEntropy += prob2 * calcShannonEnt(subDataSet2)
                infoGain = baseEntropy - newEntropy
                if (infoGain > bestInfoGain):
                    bestInfoGain = infoGain
                    bestFeature = i
                    bestValue = val
        else:
            # 离散特征
            uniqueVals = set(featList)
            newEntropy = 0
            # 遍历所有取值
            for val in uniqueVals:
                subDataSet = splitDataSet(dataset, i, val)
                prob = len(subDataSet) / float(len(dataset))
                newEntropy += prob * calcShannonEnt(subDataSet)
            infoGain = baseEntropy - newEntropy
            if (infoGain > bestInfoGain):
                bestInfoGain = infoGain
                bestFeature = i
                bestValue = None
    return bestFeature, bestValue

# 根据特征值划分数据集
def splitDataSet(dataset, axis, val):
    retDataSet = []
    for featVec in dataset:
        if featVec[axis] == val:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

# 根据特征值和方向划分数据集
def splitDataSetByValue(dataset, axis, val, direction):
    retDataSet = []
    for featVec in dataset:
        if direction:
            if featVec[axis] <= val:
                reducedFeatVec = featVec[:axis]
                reducedFeatVec.extend(featVec[axis + 1:])
                retDataSet.append(reducedFeatVec)
        else:
            if featVec[axis] > val:
                reducedFeatVec = featVec[:axis]
                reducedFeatVec.extend(featVec[axis + 1:])
                retDataSet.append(reducedFeatVec)
    return retDataSet

# 计算数据集信息熵
def calcShannonEnt(dataset):
    numexamples = len(dataset)
    labelCounts = {}
    for featVec in dataset:
        currentlabel = featVec[-1]
        if currentlabel not in labelCounts.keys():
            labelCounts[currentlabel] = 0
        labelCounts[currentlabel] += 1

    shannonEnt = 0
    for key in labelCounts:
        prop = float(labelCounts[key]) / numexamples
        shannonEnt -= prop * log(prop, 2)
    return shannonEnt

def predict(inputTree, features, testVec):
    '''
    @brief: predict the label of a test vector using a decision tree
    @param inputTree: the decision tree to be used for prediction
    @param features: the features to be used for training
    @param testVec: the test vector to be predicted
    @return: the predicted label of the test vector
    '''
    # 提取当前节点
    firstStr = list(inputTree.keys())[0]
    # 提取当前节点下的子节点
    secondDict = inputTree[firstStr]
    # 获取当前节点的特征标签
    featureIndex = features.index(firstStr)

    for key in secondDict.keys():
        # 处理连续特征（如 "<=0.5"）
        if type(key).__name__ == 'str' and ('<=' in key or '>' in key):
            # 移除字符串中的符号，取出阈值
            threshold = float(key.replace('<=', '').replace('>', ''))
            # 当前特征值小于等于阈值，则进入左子树
            if key.startswith('<=') and testVec[featureIndex] <= threshold:
                childTree = secondDict[key]
                # 判断是否为内部节点，若是，则表示不是叶子节点，继续递归
                if isinstance(childTree, dict):
                    return predict(childTree, features, testVec)
                else:
                    return childTree
            elif key.startswith('>') and testVec[featureIndex] > threshold:
                childTree = secondDict[key]
                if isinstance(childTree, dict):
                    return predict(childTree, features, testVec)
                else:
                    return childTree
        # 处理离散特征（如 "硬滑"）
        else:
            if testVec[featureIndex] == key:
                childTree = secondDict[key]
                if isinstance(childTree, dict):
                    return predict(childTree, features, testVec)
                else:
                    return childTree
    # 若未匹配任何分支
    return "未知类别"  

if __name__ == '__main__':
    # 构建数据集
    df = pd.DataFrame(pd.read_csv("../Data/watermelon3.0.csv", encoding="ansi"))
    df.drop(labels=["编号"], axis=1, inplace=True)  # 删除编号这一列，inplace=True表示直接在原对象修改
    # 转化为列表
    dataset = df.values.tolist()
    # 打印原始数据
    # for i in range(len(dataset)):
    #     print(dataset[i])
    # 标签
    labels = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖率']
    # 构建决策树
    myTree = createTree(dataset, labels)
    # 打印决策树
    print(myTree)
    # 测试数据
    testVec = ['青绿','硬挺','清脆','稍糊','平坦','软粘',0.243,0.267]
    # 预测结果
    result = predict(myTree, labels, testVec)
    # print(result)

In [1]:
import math     #导入一系列数学函数
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import *
 
decisionNodeStyle = dict(boxstyle = "sawtooth", fc = "0.8")
leafNodeStyle = dict(boxstyle = "round4", fc = "0.8")
arrowArgs = dict(arrowstyle="<-")
 
# 设置显示中文字体
mpl.rcParams["font.sans-serif"] = ["SimHei"]
# 设置正常显示符号
mpl.rcParams["axes.unicode_minus"] = False
 
#创建数据集
def createDataLH():
    data = np.array([['青年','否','否','一般']])
    data = np.append(data, [['青年', '否', '否', '好']
                            ,['青年', '是', '否', '好'] 
                            , ['青年', '是', '是', '一般']
                            , ['青年', '否', '否', '一般']
                            , ['中年', '否', '否', '一般']
                            , ['中年', '否', '否', '好']
                            , ['中年', '是', '是', '好']
                            , ['中年', '否', '是', '非常好']
                            , ['中年', '否', '是', '非常好']
                            , ['老年', '否', '是', '非常好']
                            , ['老年', '否', '是', '好']
                            , ['老年', '是', '否', '好']
                            , ['老年', '是', '否', '非常好']
                            , ['老年', '否', '否', '一般']
                           ], axis = 0)
    label = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'])
    name = np.array(['年龄', '有工作', '有房子', '信贷情况'])
    return data,label,name
 
def createDataXG20():
    data = np.array([['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
                    , ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑']
                    , ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
                    , ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑']
                    , ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
                    , ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘']
                    , ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘']
                    , ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑']
                    , ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑']
                    , ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘']
                    , ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑']
                    , ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘']
                    , ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑']
                    , ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑']
                    , ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘']
                    , ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑']
                    , ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑']])
    label = np.array(['是', '是', '是', '是', '是', '是', '是', '是', '否', '否', '否', '否', '否', '否', '否', '否', '否'])
    name = np.array(['色泽', '根蒂', '敲声', '纹理', '脐部', '触感'])
    return data,label,name
 
def createDataMine():
    raw_data = pd.read_excel(r'data.xlsx',header=0)
    data = raw_data.values[:,1:5]
    #使用pandas.cut实现对数据的离散化
    data[:,0] = pd.cut(data[:,0],[0,300,800,1200,1400],labels=False)
    data[:,1] = pd.cut(data[:,1],[0,1000,1300,1600,2000],labels=False)
    data[:,2] = pd.cut(data[:,2],[0,5000,8000,10000,12000],labels=False)
    myData = data[:,0:3]
    myLabel = data[:,-1]
    myData = myData.astype(str)
    myLabel = myLabel.astype(str)
    #print(myData.dtype)
    #print(myLabel.dtype)
    myName = ["住宿费","月平均花费","家庭平均收入"]
    for i in range(myData.shape[0]):
        for j in range(myData.shape[1]):
            if(myData[i][j]=='1'):
                myData[i][j]='低'
            if(myData[i][j]=='2'):
                myData[i][j]='中'
            if(myData[i][j]=='3'):
                myData[i][j]='高'
    for k in range(len(myLabel)):
        if(myLabel[k]=='0'):
            myLabel[k]='否'
        if(myLabel[k]=='1'):
            myLabel[k]='是'
    #print(myData)
    #print(myLabel)
    return myData,myLabel,myName
 
def splitXgData20(xgData, xgLabel):
    xgDataTrain = xgData[[0, 1, 2, 5, 6, 9, 13, 14, 15, 16],:]
    xgDataTest = xgData[[3, 4, 7, 8, 10, 11, 12],:]
    xgLabelTrain = xgLabel[[0, 1, 2, 5, 6, 9, 13, 14, 15, 16]]
    xgLabelTest = xgLabel[[3, 4, 7, 8, 10, 11, 12]]
    return xgDataTrain, xgLabelTrain, xgDataTest, xgLabelTest
 
def splitMyData(myData,myLabel):
    myDataTest = np.empty((0,3),dtype=int)
    myLabelTest = np.empty((0),dtype=int)
    index = 4
    #print(myData.shape)
    for i in range(int(myData.shape[0]*0.3)):
        #print(index)
        #print(myData[index,:])
        myDataTest = np.append(myDataTest,[myData[index,:]],axis=0)
        myData = np.delete(myData,index,axis=0)
        myLabelTest = np.append(myLabelTest,myLabel[index])
        myLabel = np.delete(myLabel,[index])
        index += 2
    #print(myData.shape[0])
    #print(myDataTest)
    #print(myLabelTest)
    return myData,myLabel,myDataTest,myLabelTest
 
 
 
# 定义一个常用函数 用来求numpy array中数值等于某值的元素数量
equalNums = lambda x,y: 0 if x is None else x[x==y].size
 
 
# 定义计算信息熵的函数
def singleEntropy(x):
    """计算一个输入序列的信息熵"""
    # 转换为 numpy 矩阵
    x = np.asarray(x)
    # 取所有不同值
    xValues = set(x)
    # 计算熵值
    entropy = 0
    for xValue in xValues:
        p = equalNums(x, xValue) / x.size 
        entropy -= p * math.log(p, 2)
    return entropy
    
    
# 定义计算条件信息熵的函数
def conditionnalEntropy(feature, y):
    """计算 某特征feature 条件下y的信息熵"""
    # 转换为numpy 
    feature = np.asarray(feature)
    y = np.asarray(y)
    # 取特征的不同值
    featureValues = set(feature)
    # 计算熵值 
    entropy = 0
    for feat in featureValues:
        # 解释：feature == feat 是得到取feature中所有元素值等于feat的元素的索引（类似这样理解）
        #       y[feature == feat] 是取y中 feature元素值等于feat的元素索引的 y的元素的子集
        p = equalNums(feature, feat) / feature.size 
        entropy += p * singleEntropy(y[feature == feat])
    return entropy
    
    
# 定义信息增益
def infoGain(feature, y):
    return singleEntropy(y) - conditionnalEntropy(feature, y)
 
 
# 定义信息增益率
def infoGainRatio(feature, y):
    return 0 if singleEntropy(feature) == 0 else infoGain(feature, y) / singleEntropy(feature)
 
'''
# 使用李航数据测试函数 p62
lhData, lhLabel, lhName = createDataLH()
print("书中H(D)为0.971，函数结果：" + str(round(singleEntropy(lhLabel), 3)))  
print("书中g(D, A1)为0.083，函数结果：" + str(round(infoGain(lhData[:,0] ,lhLabel), 3)))  
print("书中g(D, A2)为0.324，函数结果：" + str(round(infoGain(lhData[:,1] ,lhLabel), 3)))  
print("书中g(D, A3)为0.420，函数结果：" + str(round(infoGain(lhData[:,2] ,lhLabel), 3)))  
print("书中g(D, A4)为0.363，函数结果：" + str(round(infoGain(lhData[:,3] ,lhLabel), 3)))  
# 测试正常，与书中结果一致
# 使用西瓜数据测试函数  p75-p77
xgData, xgLabel, xgName = createDataXG20()
print("书中Ent(D)为0.998，函数结果：" + str(round(singleEntropy(xgLabel), 4)))  
print("书中Gain(D, 色泽)为0.109，函数结果：" + str(round(infoGain(xgData[:,0] ,xgLabel), 4)))  
print("书中Gain(D, 根蒂)为0.143，函数结果：" + str(round(infoGain(xgData[:,1] ,xgLabel), 4)))  
print("书中Gain(D, 敲声)为0.141，函数结果：" + str(round(infoGain(xgData[:,2] ,xgLabel), 4)))  
print("书中Gain(D, 纹理)为0.381，函数结果：" + str(round(infoGain(xgData[:,3] ,xgLabel), 4)))  
print("书中Gain(D, 脐部)为0.289，函数结果：" + str(round(infoGain(xgData[:,4] ,xgLabel), 4)))  
print("书中Gain(D, 触感)为0.006，函数结果：" + str(round(infoGain(xgData[:,5] ,xgLabel), 4)))
'''
 
# 特征选取
def bestFeature(data, labels, method = 'id3'):
    assert method in ['id3', 'c45'], "method 须为id3或c45"
    data = np.asarray(data)
    labels = np.asarray(labels)
    # 根据输入的method选取 评估特征的方法：id3 -> 信息增益; c45 -> 信息增益率
    def calcEnt(feature, labels):
        if method == 'id3':
            return infoGain(feature, labels)
        elif method == 'c45' :
            return infoGainRatio(feature, labels)
    # 特征数量  即 data 的列数量
    featureNum = data.shape[1]
    # 计算最佳特征
    bestEnt = 0 
    bestFeat = -1
    for feature in range(featureNum):
        ent = calcEnt(data[:, feature], labels)
        if ent >= bestEnt:
            bestEnt = ent 
            bestFeat = feature
        # print("feature " + str(feature + 1) + " ent: " + str(ent)+ "\t bestEnt: " + str(bestEnt))
    return bestFeat, bestEnt 
 
 
# 根据特征及特征值分割原数据集  删除data中的feature列，并根据feature列中的值分割 data和label
def splitFeatureData(data, labels, feature):
    """feature 为特征列的索引"""
    # 取特征列
    features = np.asarray(data)[:,feature]
    # 数据集中删除特征列
    data = np.delete(np.asarray(data), feature, axis = 1)
    # 标签
    labels = np.asarray(labels)
    
    uniqFeatures = set(features)
    dataSet = {}
    labelSet = {}
    for feat in uniqFeatures:
        dataSet[feat] = data[features == feat]
        labelSet[feat] = labels[features == feat]
    return dataSet, labelSet
    
    
# 多数投票 
def voteLabel(labels):
    uniqLabels = list(set(labels))
    labels = np.asarray(labels)
 
    finalLabel = 0
    labelNum = []
    for label in uniqLabels:
        # 统计每个标签值得数量
        labelNum.append(equalNums(labels, label))
    # 返回数量最大的标签
    return uniqLabels[labelNum.index(max(labelNum))]
 
 
# 创建决策树
def createTree(data, labels, names, method = 'id3'):
    data = np.asarray(data)
    labels = np.asarray(labels)
    names = np.asarray(names)
    # 如果结果为单一结果
    if len(set(labels)) == 1: 
        return labels[0] 
    # 如果没有待分类特征
    elif data.size == 0: 
        return voteLabel(labels)
    # 其他情况则选取特征 
    bestFeat, bestEnt = bestFeature(data, labels, method = method)
    # 取特征名称
    bestFeatName = names[bestFeat]
    # 从特征名称列表删除已取得特征名称
    names = np.delete(names, [bestFeat])
    # 根据选取的特征名称创建树节点
    decisionTree = {bestFeatName: {}}
    # 根据最优特征进行分割
    dataSet, labelSet = splitFeatureData(data, labels, bestFeat)
    # 对最优特征的每个特征值所分的数据子集进行计算
    for featValue in dataSet.keys():
        decisionTree[bestFeatName][featValue] = createTree(dataSet.get(featValue), labelSet.get(featValue), names, method)
    return decisionTree 
 
 
# 树信息统计 叶子节点数量 和 树深度
def getTreeSize(decisionTree):
    nodeName = list(decisionTree.keys())[0]
    nodeValue = decisionTree[nodeName]
    leafNum = 0
    treeDepth = 0 
    leafDepth = 0
    for val in nodeValue.keys():
        if type(nodeValue[val]) == dict:
            leafNum += getTreeSize(nodeValue[val])[0]
            leafDepth = 1 + getTreeSize(nodeValue[val])[1] 
        else :
            leafNum += 1 
            leafDepth = 1 
        treeDepth = max(treeDepth, leafDepth)
    return leafNum, treeDepth 
 
 
# 使用模型对其他数据分类
def dtClassify(decisionTree, rowData, names):
    names = list(names)
    # 获取特征
    feature = list(decisionTree.keys())[0]
    # 决策树对于该特征的值的判断字段
    featDict = decisionTree[feature]
    # 获取特征的列
    feat = names.index(feature)
    # 获取数据该特征的值
    featVal = rowData[feat]
    # 根据特征值查找结果，如果结果是字典说明是子树，调用本函数递归
    if featVal in featDict.keys():
        if type(featDict[featVal]) == dict:
            classLabel = dtClassify(featDict[featVal], rowData, names)
        else:
            classLabel = featDict[featVal] 
    return classLabel
 
#获取叶节点的数目和树的层数
def getNumLeafs(tree):
    numLeafs = 0
    #获取第一个节点的分类特征
    firstFeat = list(tree.keys())[0]
    #得到firstFeat特征下的决策树（以字典方式表示）
    secondDict = tree[firstFeat]
    #遍历firstFeat下的每个节点
    for key in secondDict.keys():
        #如果节点类型为字典，说明该节点下仍然是一棵树，此时递归调用getNumLeafs
        if type(secondDict[key]).__name__== 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        #否则该节点为叶节点
        else:
            numLeafs += 1
    return numLeafs
 
#获取决策树深度
def getTreeDepth(tree):
    maxDepth = 0
    #获取第一个节点分类特征
    firstFeat = list(tree.keys())[0]
    #得到firstFeat特征下的决策树（以字典方式表示）
    secondDict = tree[firstFeat]
    #遍历firstFeat下的每个节点，返回子树中的最大深度
    for key in secondDict.keys():
        #如果节点类型为字典，说明该节点下仍然是一棵树，此时递归调用getTreeDepth，获取该子树深度
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
 
 
    #画出决策树
def createPlot(tree):
    # 定义一块画布，背景为白色
    fig = plt.figure(1, facecolor='white')
    # 清空画布
    fig.clf()
    # 不显示x、y轴刻度
    xyticks = dict(xticks=[], yticks=[])
    # frameon：是否绘制坐标轴矩形
    createPlot.pTree = plt.subplot(111, frameon=False, **xyticks)
    # 计算决策树叶子节点个数
    plotTree.totalW = float(getNumLeafs(tree))
    # 计算决策树深度
    plotTree.totalD = float(getTreeDepth(tree))
    # 最近绘制的叶子节点的x坐标
    plotTree.xOff = -0.5 / plotTree.totalW
    # 当前绘制的深度：y坐标
    plotTree.yOff = 1.0
    # （0.5,1.0）为根节点坐标
    plotTree(tree, (0.5, 1.0), '')
    plt.show()
 
 
 
 
# nodeText:要显示的文本；centerPt：文本中心点，即箭头所在的点；parentPt：指向文本的点；nodeType:节点属性
# ha='center'，va='center':水平、垂直方向中心对齐；bbox：方框属性
# arrowprops：箭头属性
# xycoords，textcoords选择坐标系；axes fraction-->0,0是轴域左下角，1,1是右上角
def plotNode(nodeText, centerPt, parentPt, nodeType):
    createPlot.pTree.annotate(nodeText, xy=parentPt, xycoords="axes fraction",
                              xytext=centerPt, textcoords='axes fraction',
                              va='center', ha='center', bbox=nodeType, arrowprops=arrowArgs)
def plotMidText(centerPt, parentPt, midText):
    xMid = (parentPt[0] - centerPt[0]) / 2.0 + centerPt[0]
    yMid = (parentPt[1] - centerPt[1]) / 2.0 + centerPt[1]
    createPlot.pTree.text(xMid, yMid, midText)
 
def plotTree(tree, parentPt, nodeTxt):
    #计算叶子节点个数
    numLeafs = getNumLeafs(tree)
    #获取第一个节点特征
    firstFeat = list(tree.keys())[0]
    #计算当前节点的x坐标
    centerPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)
    #绘制当前节点
    plotMidText(centerPt,parentPt,nodeTxt)
    plotNode(firstFeat,centerPt,parentPt,decisionNodeStyle)
    secondDict = tree[firstFeat]
    #计算绘制深度
    plotTree.yOff -= 1.0/plotTree.totalD
    for key in secondDict.keys():
        #如果当前节点的子节点不是叶子节点，则递归
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key],centerPt,str(key))
        #如果当前节点的子节点是叶子节点，则绘制该叶节点
        else:
            #plotTree.xOff在绘制叶节点坐标的时候才会发生改变
            plotTree.xOff += 1.0/plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff,plotTree.yOff),centerPt,leafNodeStyle)
            plotMidText((plotTree.xOff,plotTree.yOff),centerPt,str(key))
    plotTree.yOff += 1.0/plotTree.totalD
 
 
 
# 使用李航数据测试函数 p62
lhData, lhLabel, lhName = createDataLH()
lhTree = createTree(lhData, lhLabel, lhName, method = 'id3')
#print(lhTree)
#createPlot(lhTree)
 
 
 
# 使用西瓜数据测试函数  p75-p77
xgData, xgLabel, xgName = createDataXG20()
xgTree = createTree(xgData, xgLabel, xgName, method = 'id3')
#print(xgTree)
#createPlot(xgTree)
 
 
 
#使用自己的数据集测试函数
myData,myLabel,myName = createDataMine()
myTree = createTree(myData,myLabel,myName,method='id3')
#print(myTree)
#createPlot(myTree)
 
 
# 创建预剪枝决策树
def createTreePrePruning(dataTrain, labelTrain, dataTest, labelTest, names, method = 'id3'):
    """
    预剪枝 需要使用测试数据对每次的划分进行评估
         策略说明：原本如果某节点划分前后的测试结果没有提升，根据奥卡姆剃刀原则将不进行划分（即执行剪枝），但考虑到这种策略容易造成欠拟合，
                   且不能排除后续划分有进一步提升的可能，因此，没有提升仍保留划分，即不剪枝
         另外：周志华的书上评估的是某一个节点划分前后对该层所有数据综合评估，如评估对脐部 凹陷下色泽是否划分，
               书上取的色泽划分前的精度是71.4%(5/7)，划分后的精度是57.1%(4/7)，都是脐部下三个特征（凹陷，稍凹，平坦）所有的数据的精度，计算也不易
               而我觉得实际计算时，只对当前节点下的数据划分前后进行评估即可，如脐部凹陷时有三个测试样本，
               三个样本色泽划分前的精度是2/3=66.7%，色泽划分后的精度是1/3=33.3%，因此判断不划分
    """
    trainData = np.asarray(dataTrain)
    labelTrain = np.asarray(labelTrain)
    testData = np.asarray(dataTest)
    labelTest = np.asarray(labelTest)
    names = np.asarray(names)
    # 如果结果为单一结果
    if len(set(labelTrain)) == 1: 
        return labelTrain[0] 
    # 如果没有待分类特征
    elif trainData.size == 0: 
        return voteLabel(labelTrain)
    # 其他情况则选取特征 
    bestFeat, bestEnt = bestFeature(dataTrain, labelTrain, method = method)
    # 取特征名称
    bestFeatName = names[bestFeat]
    # 从特征名称列表删除已取得特征名称
    names = np.delete(names, [bestFeat])
    # 根据最优特征进行分割
    dataTrainSet, labelTrainSet = splitFeatureData(dataTrain, labelTrain, bestFeat)
 
    # 预剪枝评估
    # 划分前的分类标签
    labelTrainLabelPre = voteLabel(labelTrain)
    labelTrainRatioPre = equalNums(labelTrain, labelTrainLabelPre) / labelTrain.size
    # 划分后的精度计算 
    if dataTest is not None: 
        dataTestSet, labelTestSet = splitFeatureData(dataTest, labelTest, bestFeat)
        # 划分前的测试标签正确比例
        labelTestRatioPre = equalNums(labelTest, labelTrainLabelPre) / labelTest.size
        # 划分后 每个特征值的分类标签正确的数量
        labelTrainEqNumPost = 0
        for val in labelTrainSet.keys():
            labelTrainEqNumPost += equalNums(labelTestSet.get(val), voteLabel(labelTrainSet.get(val))) + 0.0
        # 划分后 正确的比例
        labelTestRatioPost = labelTrainEqNumPost / labelTest.size 
    
    # 如果没有评估数据 但划分前的精度等于最小值0.5 则继续划分
    if dataTest is None and labelTrainRatioPre == 0.5:
        decisionTree = {bestFeatName: {}}
        for featValue in dataTrainSet.keys():
            decisionTree[bestFeatName][featValue] = createTreePrePruning(dataTrainSet.get(featValue), labelTrainSet.get(featValue)
                                      , None, None, names, method)
    elif dataTest is None:
        return labelTrainLabelPre 
    # 如果划分后的精度相比划分前的精度下降, 则直接作为叶子节点返回
    elif labelTestRatioPost < labelTestRatioPre:
        return labelTrainLabelPre
    else :
        # 根据选取的特征名称创建树节点
        decisionTree = {bestFeatName: {}}
        # 对最优特征的每个特征值所分的数据子集进行计算
        for featValue in dataTrainSet.keys():
            decisionTree[bestFeatName][featValue] = createTreePrePruning(dataTrainSet.get(featValue), labelTrainSet.get(featValue)
                                      , dataTestSet.get(featValue), labelTestSet.get(featValue)
                                      , names, method)
    return decisionTree 
 
 
# 创建决策树 带预划分标签
def createTreeWithLabel(data, labels, names, method = 'id3'):
    data = np.asarray(data)
    labels = np.asarray(labels)
    names = np.asarray(names)
    # 如果不划分的标签为
    votedLabel = voteLabel(labels)
    # 如果结果为单一结果
    if len(set(labels)) == 1: 
        return votedLabel 
    # 如果没有待分类特征
    elif data.size == 0: 
        return votedLabel
    # 其他情况则选取特征 
    bestFeat, bestEnt = bestFeature(data, labels, method = method)
    # 取特征名称
    bestFeatName = names[bestFeat]
    # 从特征名称列表删除已取得特征名称
    names = np.delete(names, [bestFeat])
    # 根据选取的特征名称创建树节点 划分前的标签votedPreDivisionLabel=_vpdl
    decisionTree = {bestFeatName: {"_vpdl": votedLabel}}
    # 根据最优特征进行分割
    dataSet, labelSet = splitFeatureData(data, labels, bestFeat)
    # 对最优特征的每个特征值所分的数据子集进行计算
    for featValue in dataSet.keys():
        decisionTree[bestFeatName][featValue] = createTreeWithLabel(dataSet.get(featValue), labelSet.get(featValue), names, method)
    return decisionTree 
 
 
# 将带预划分标签的tree转化为常规的tree
# 函数中进行的copy操作，原因见有道笔记 【YL20190621】关于Python中字典存储修改的思考
def convertTree(labeledTree):
    labeledTreeNew = labeledTree.copy()
    nodeName = list(labeledTree.keys())[0]
    labeledTreeNew[nodeName] = labeledTree[nodeName].copy()
    for val in list(labeledTree[nodeName].keys()):
        if val == "_vpdl": 
            labeledTreeNew[nodeName].pop(val)
        elif type(labeledTree[nodeName][val]) == dict:
            labeledTreeNew[nodeName][val] = convertTree(labeledTree[nodeName][val])
    return labeledTreeNew
 
 
# 后剪枝 训练完成后决策节点进行替换评估  这里可以直接对xgTreeTrain进行操作
def treePostPruning(labeledTree, dataTest, labelTest, names):
    newTree = labeledTree.copy()
    dataTest = np.asarray(dataTest)
    labelTest = np.asarray(labelTest)
    names = np.asarray(names)
    # 取决策节点的名称 即特征的名称
    featName = list(labeledTree.keys())[0]
    #print("\n当前节点：" + featName)
    # 取特征的列
    featCol = np.argwhere(names==featName)[0][0]
    names = np.delete(names, [featCol])
    #print("当前节点划分的数据维度：" + str(names))
    #print("当前节点划分的数据：" )
    #print(dataTest)
    #print(labelTest)
    # 该特征下所有值的字典
    newTree[featName] = labeledTree[featName].copy()
    featValueDict = newTree[featName]
    featPreLabel = featValueDict.pop("_vpdl")
    #print("当前节点预划分标签：" + featPreLabel)
    # 是否为子树的标记
    subTreeFlag = 0
    # 分割测试数据 如果有数据 则进行测试或递归调用  np的array我不知道怎么判断是否None, 用is None是错的
    dataFlag = 1 if sum(dataTest.shape) > 0 else 0
    if dataFlag == 1:
        # print("当前节点有划分数据！")
        dataTestSet, labelTestSet = splitFeatureData(dataTest, labelTest, featCol)
    for featValue in featValueDict.keys():
        # print("当前节点属性 {0} 的子节点：{1}".format(featValue ,str(featValueDict[featValue])))
        if dataFlag == 1 and type(featValueDict[featValue]) == dict:
            subTreeFlag = 1 
            # 如果是子树则递归
            newTree[featName][featValue] = treePostPruning(featValueDict[featValue], dataTestSet.get(featValue), labelTestSet.get(featValue), names)
            # 如果递归后为叶子 则后续进行评估
            if type(featValueDict[featValue]) != dict:
                subTreeFlag = 0 
            
        # 如果没有数据  则转换子树
        if dataFlag == 0 and type(featValueDict[featValue]) == dict: 
            subTreeFlag = 1 
            # print("当前节点无划分数据！直接转换树："+str(featValueDict[featValue]))
            newTree[featName][featValue] = convertTree(featValueDict[featValue])
            # print("转换结果：" + str(convertTree(featValueDict[featValue])))
    # 如果全为叶子节点， 评估需要划分前的标签，这里思考两种方法，
    #     一是，不改变原来的训练函数，评估时使用训练数据对划分前的节点标签重新打标
    #     二是，改进训练函数，在训练的同时为每个节点增加划分前的标签，这样可以保证评估时只使用测试数据，避免再次使用大量的训练数据
    #     这里考虑第二种方法 写新的函数 createTreeWithLabel，当然也可以修改createTree来添加参数实现
    if subTreeFlag == 0:
        ratioPreDivision = equalNums(labelTest, featPreLabel) / labelTest.size
        equalNum = 0
        for val in labelTestSet.keys():
            equalNum += equalNums(labelTestSet[val], featValueDict[val])
        ratioAfterDivision = equalNum / labelTest.size 
        # print("当前节点预划分标签的准确率：" + str(ratioPreDivision))
        # print("当前节点划分后的准确率：" + str(ratioAfterDivision))
        # 如果划分后的测试数据准确率低于划分前的，则划分无效，进行剪枝，即使节点等于预划分标签
        # 注意这里取的是小于，如果有需要 也可以取 小于等于
        if ratioAfterDivision < ratioPreDivision:
            newTree = featPreLabel 
    return newTree
 
 
 
 
# 将西瓜数据2.0分割为测试集和训练集
xgDataTrain, xgLabelTrain, xgDataTest, xgLabelTest = splitXgData20(xgData, xgLabel)
# 生成不剪枝的树
xgTreeTrain = createTree(xgDataTrain, xgLabelTrain, xgName, method = 'id3')
# 生成预剪枝的树
xgTreePrePruning = createTreePrePruning(xgDataTrain, xgLabelTrain, xgDataTest, xgLabelTest, xgName, method = 'id3')
# 画剪枝前的树
#print("剪枝前的树")
#createPlot(xgTreeTrain)
# 画剪枝后的树
#print("剪枝后的树")
#createPlot(xgTreePrePruning)
 
 
#将自己的数据分割为测试集和训练集
myDataTrain, myLabelTrain, myDataTest, myLabelTest = splitMyData(myData,myLabel)
# 生成不剪枝的树
myTreeTrain = createTree(myDataTrain, myLabelTrain, myName, method='id3')
# 生成预剪枝的树
myTreePrePruning = createTreePrePruning(myDataTrain, myLabelTrain, myDataTest, myLabelTest, myName, method='id3')
# 画剪枝前的树
#print("剪枝前的树")
#createPlot(myTreeTrain)
# 画剪枝后的树
#print("剪枝后的树")
#createPlot(myTreePrePruning)
 
 
# 书中的树结构 p81 p83
 
xgTreeBeforePostPruning = {"脐部": {"_vpdl": "是"
                                   , '凹陷': {'色泽':{"_vpdl": "是", '青绿': '是', '乌黑': '是', '浅白': '否'}}
                                   , '稍凹': {'根蒂':{"_vpdl": "是"
                                                  , '稍蜷': {'色泽': {"_vpdl": "是"
                                                                  , '青绿': '是'
                                                                  , '乌黑': {'纹理': {"_vpdl": "是"
                                                                               , '稍糊': '是', '清晰': '否', '模糊': '是'}}
                                                                  , '浅白': '是'}}
                                                  , '蜷缩': '否'
                                                  , '硬挺': '是'}}
                                   , '平坦': '否'}}
 
#xgTreeBeforePostPruning = createTreeWithLabel(xgDataTrain, xgLabelTrain, xgName, method='id3')
#print(xgTreeBeforePostPruning)
xgTreePostPruning = treePostPruning(xgTreeBeforePostPruning, xgDataTest, xgLabelTest, xgName)
createPlot(convertTree(xgTreeBeforePostPruning))
createPlot(xgTreePostPruning)
 
 
myTreeBeforePostPruning = createTreeWithLabel(myDataTrain, myLabelTrain, myName, method='id3')
#print(myTreeBeforePostPruning)
myTreePostPruning = treePostPruning(myTreeBeforePostPruning, myDataTest, myLabelTest, myName)
createPlot(convertTree(myTreeBeforePostPruning))
createPlot(myTreePostPruning)

FileNotFoundError: [Errno 2] No such file or directory: 'data.xlsx'

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
import operator

# 特征字典，后面用到了好多次，干脆当全局变量了
featureDic = {
    '色泽': ['浅白', '青绿', '乌黑'],
    '根蒂': ['硬挺', '蜷缩', '稍蜷'],
    '敲声': ['沉闷', '浊响', '清脆'],
    '纹理': ['清晰', '模糊', '稍糊'],
    '脐部': ['凹陷', '平坦', '稍凹'],
    '触感': ['硬滑', '软粘']}

# ***********************画图***********************
# **********************start***********************
# 详情参见机器学习实战决策树那一章

# 定义文本框和箭头格式
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
mpl.rcParams['font.sans-serif'] = ['SimHei']  # 没有这句话汉字都是口口
# mpl.rcParams['axes.unicode_minus'] = False  # 解决保存图像是负号'-'显示为方块的问题


def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, fontsize=20)


def plotNode(nodeTxt, centerPt, parentPt, nodeType):  # 绘制带箭头的注解
    createPlot.ax1.annotate(nodeTxt,
                            xy=parentPt,
                            xycoords="axes fraction",
                            xytext=centerPt,
                            textcoords="axes fraction",
                            va="center",
                            ha="center",
                            bbox=nodeType,
                            arrowprops=arrow_args,
                            fontsize=20)


def getNumLeafs(myTree):  # 获取叶节点的数目
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):  # 获取树的层数
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth: maxDepth = thisDepth
    return maxDepth


def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD


def createPlot(inTree):
    fig = plt.figure(1, figsize=(600, 30), facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
# ***********************画图***********************
# ***********************end************************


def getDataSet():
    """
    get watermelon data set 3.0 alpha.
    :return: 训练集合剪枝集以及特征列表。
    """
    # 也可以直接从
    dataSet = [
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'],
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜'],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜']
    ]

    features = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']

    # #得到特征值字典，本来用这个生成的特征字典，还是直接当全局变量方便
    # featureDic = {}
    # for i in range(len(features)):
    #     featureList = [example[i] for example in dataSet]
    #     uniqueFeature = list(set(featureList))
    #     featureDic[features[i]] = uniqueFeature

    # 每种特征的属性个数
    numList = []  # [3, 3, 3, 3, 3, 2]
    for i in range(len(features)):
        numList.append(len(featureDic[features[i]]))

    # # 编码，把文字替换成数字。用1、2、3表示同种特征的不同类型
    # newDataSet = []
    # for dataVec in dataSet:  # 第一每一个数据
    #     dataNum = dataVec[-1]  # 保存数据中类别部分
    #     newData = []
    #     for i in range(len(dataVec) - 1):  # 值为字符的每一列
    #         for j in range(numList[i]):  # 对应列的特征的每一类
    #             if dataVec[i] == featureDic[features[i]][j]:
    #                 newData.append(j + 1)
    #     newData.append(dataNum)  # 编码好的部分和原来的数值部分合并
    #     newDataSet.append(newData)

    newDataSet = np.array(dataSet)
    # 得到训练数据集
    trainIndex = [0, 1, 2, 5, 6, 9, 13, 14, 15, 16]
    trainDataSet = newDataSet[trainIndex]
    # 得到剪枝数据集
    pruneIndex = [3 ,4, 7, 8, 10, 11, 12]
    pruneDataSet = newDataSet[pruneIndex]

    return np.array(dataSet), trainDataSet, pruneDataSet, features


def calGini(dataArr):
    """
    calculate information entropy.
    :param dataArr:
    :param classArr:
    :return: Gini
    """
    numEntries = dataArr.shape[0]
    classArr = dataArr[:, -1]
    uniqueClass = list(set(classArr))
    Gini = 1.0
    for c in uniqueClass:
        Gini -= (len(dataArr[dataArr[:, -1] == c]) / float(numEntries)) ** 2
    return Gini


def splitDataSet(dataSet, ax, value):
    """
    按照给点的属性ax和其中一种取值value来划分数据。
    当属性类型为标称数据时，返回一个属性值都为value的数据集。
    input:
        dataSet: 输入数据集，形状为(m,n)表示m个数据，前n-1列个属性，最后一列为类型。
        ax：属性类型
        value: 标称型时为1、2、3等。数值型为形如0.123的数。
    return：
        标称型dataSet返回第ax个属性中值为value组成的集合
    """
    return np.delete(dataSet[dataSet[:, ax] == value], ax, axis=1)


def calSplitGin(dataSet, ax, labels):
    """
    计算给定数据dataSet在属性ax上的基尼指数。
    input：
        dataSet：输入数据集，形状为(m,n)表示m个数据，前n-1列个属性，最后一列为类型。
        labelList：属性列表，如['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
        ax: 选择用来计算信息增益的属性。0表示第一个属性，1表示第二个属性等。
    return：
        Gini:基尼指数
    """
    newGini = 0.0  # 划分完数据后的基尼指数
    # 对每一种属性
    for j in featureDic[ax]:
        axIndex = labels.index(ax)
        subDataSet = splitDataSet(dataSet, axIndex, j)
        prob = len(subDataSet) / float(len(dataSet))
        if prob != 0:  # prob为0意味着dataSet的ax属性中，没有第j+1种值
            newGini += prob * calGini(subDataSet)
    return newGini


def chooseBestSplit(dataSet, labelList):
    """
    得到基尼指数最小的属性作为最有划分属性。
    input:
        dataSet
        labelList
    return:
        bestFeature: 使得到最大增益划分的属性。
    """
    bestGain = 1
    bestFeature = -1
    n = dataSet.shape[1]
    # 对每一个特征
    for i in range(n - 1):
        newGini = calSplitGin(dataSet, labelList[i], labelList)
        if newGini < bestGain:
            bestFeature = i
            bestGain = newGini

    return bestFeature


def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # classCount.items()将字典的key-value对变成元组对，如{'a':1, 'b':2} -> [('a',1),('b',2)]
    # operator.itemgetter(1)按照第二个元素次序进行排序
    # reverse=True表示从大大到小。[('b',2), ('a',1)]
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]   # 返回第0个元组的第0个值


def createTree(dataSet, labels):
    """
    通过信息增益递归创造一颗决策树。
    input:
        labels
        dataSet
    return:
        myTree: 返回一个存有树的字典
    """
    classList = dataSet[:, -1]
    # 如果基尼指数为0，即D中样本全属于同一类别，返回
    if calGini(dataSet) == 0:
        return dataSet[0][-1]
    # 属性值为空，只剩下类标签
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    # 得到增益最大划分的属性、值
    bestFeat = chooseBestSplit(dataSet, labels)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # 创建字典，即树的节点。
    # 生成子树的时候要将已遍历的属性删去。数值型不要删除。
    labelsCopy = labels[:]
    del (labelsCopy[bestFeat])
    uniqueVals = featureDic[bestFeatLabel]  # 最好的特征的类别列表
    for value in uniqueVals:  # 标称型的属性值有几种，就要几个子树。
        # Python中列表作为参数类型时，是按照引用传递的，要保证同一节点的子节点能有相同的参数。
        subLabels = labelsCopy[:]  # subLabels = 注意要用[:]，不然还是引用
        subDataSet = splitDataSet(dataSet, bestFeat, value)
        if len(subDataSet) != 0:
            myTree[bestFeatLabel][value] = createTree(subDataSet, subLabels)
        else:
            # 计算D中样本最多的类
            myTree[bestFeatLabel][value] = majorityCnt(classList)

    return myTree


def classify(data, featLabels, Tree):
    """
    通过决策树对一条数据分类
    :param featLabels:
    :param data:
    :param Tree:
    :return: 分类
    """
    firstStr = list(Tree.keys())[0]  # 父节点
    secondDict = Tree[firstStr]  # 父节点下的子树，即子字典
    featIndex = featLabels.index(firstStr)  # 当前属性标识的位置
    classLabel = ""
    for key in secondDict.keys():  # 遍历该属性下的不同类
        if data[featIndex] == key:  # 如果数据中找到了匹配的属性类别
            # 如果不是叶子节点，继续向下遍历
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(data, featLabels, secondDict[key])
            # 如果是叶子节点，返回该叶子节点的类型
            else:
                classLabel = secondDict[key]
    return classLabel


def calAccuracy(dataSet, labels, Tree):
    """
    计算已有决策树的精度
    :param dataSet:
    :param labels: ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
    :param Tree:
    :return: 决策树精度
    """
    cntCorrect = 0
    size = len(dataSet)
    for i in range(size):
        pre = classify(dataSet[i], labels, Tree)
        if pre == dataSet[i][-1]:
            cntCorrect += 1
    return cntCorrect / float(size)


def cntAccNums(dataSet, pruneSet):
    """
    用于剪枝，用dataSet中多数的类作为节点类，计算pruneSet中有多少类是被分类正确的，然后返回正确
    分类的数目。
    :param dataSet: 训练集
    :param pruneSet: 测试集
    :return: 正确分类的数目
    """
    nodeClass = majorityCnt(dataSet[:, -1])
    rightCnt = 0
    for vect in pruneSet:
        if vect[-1] == nodeClass:
            rightCnt += 1
    return rightCnt


def prePruning(dataSet, pruneSet, labels):
    """
    每到一个节点要划分的时候：
    1. 用这个节点上数据投票得出这个节点的类，即是"好瓜"还是"坏瓜"。
    2. 用这个投票出来的类计算测试集中正确的点数。
    3. 尝试计算一个节点向下划分时测试点的正确数。假如，当前属性为"脐部"，有三种"凹陷",
    "稍凹","平坦"，则可将训练集和测试集按照这三种属性值分为三部分，分别计算分类正确的点数并求和。
    4 若尝试划分得到的正确点数少于不划分时得到的正确点数，则返回不划分时节点的类，否则继续划分。
    :param dataSet: 训练数据集
    :param pruneSet: 预剪枝数据集
    :param labels:  属性标签
    :return:
    """
    classList = dataSet[:, -1]

    if calGini(dataSet) == 0:
        return dataSet[0][-1]

    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    # 获取最好特征
    bestFeat = chooseBestSplit(dataSet, labels)
    bestFeatLabel = labels[bestFeat]
    # 计算初始正确率
    baseRightNums = cntAccNums(dataSet, pruneSet)
    # 得到最好划分属性取值
    features = featureDic[bestFeatLabel]
    # 计算尝试划分节点时的正确率
    splitRightNums = 0.0
    for value in features:
        # 每个属性取值得到的子集
        subDataSet = splitDataSet(dataSet, bestFeat, value)
        if len(subDataSet) != 0:
            # 把用来剪枝的子集也按照相应属性值划分下去
            subPruneSet = splitDataSet(pruneSet, bestFeat, value)
            splitRightNums += cntAccNums(subDataSet, subPruneSet)
            print(splitRightNums)
    if baseRightNums < splitRightNums:  # 如果不划分的正确点数少于尝试划分的点数，则继续划分。
        myTree = {bestFeatLabel: {}}
    else:
        return majorityCnt(dataSet[:, -1])  # 否则，返回不划分时投票得到的类

    # 以下代码和不预剪枝的代码大致相同，一点不同在于每次测试集也要参与划分。
    for value in features:
        subLabels = labels[:]
        subDataSet = splitDataSet(dataSet, bestFeat, value)
        subPruneSet = splitDataSet(pruneSet, bestFeat, value)
        if len(subDataSet) != 0:
            myTree[bestFeatLabel][value] = prePruning(subDataSet, subPruneSet, subLabels)
        else:
            # 计算D中样本最多的类
            myTree[bestFeatLabel][value] = majorityCnt(classList)
    return myTree


def postPruning(dataSet, pruneSet, labels):
    """
    后剪枝的思想就是，在决策树每一条分支到达叶子节点时，分别计算剪枝和不剪枝时，位于该节点上的
    测试数据，被正确判定的数量孰大孰小，以此为依据来决定是否剪枝。
    :param dataSet:
    :param pruneSet:
    :param labels:
    :return:
    """
    classList = dataSet[:, -1]
    # 如果基尼指数为0，即D中样本全属于同一类别，返回
    if calGini(dataSet) == 0:
        return dataSet[0][-1]
    # 属性值为空，只剩下类标签
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    # 得到增益最大划分的属性、值
    bestFeat = chooseBestSplit(dataSet, labels)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # 创建字典，即树的节点。
    # 生成子树的时候要将已遍历的属性删去。数值型不要删除。
    labelsCopy = labels[:]
    del (labelsCopy[bestFeat])
    uniqueVals = featureDic[bestFeatLabel]  # 最好的特征的类别列表
    for value in uniqueVals:  # 标称型的属性值有几种，就要几个子树。
        # Python中列表作为参数类型时，是按照引用传递的，要保证同一节点的子节点能有相同的参数。
        subLabels = labelsCopy[:]  # subLabels = 注意要用[:]，不然还是引用
        subPrune = splitDataSet(pruneSet, bestFeat, value)
        subDataSet = splitDataSet(dataSet, bestFeat, value)
        if len(subDataSet) != 0:
            myTree[bestFeatLabel][value] = postPruning(subDataSet, subPrune, subLabels)
        else:
            # 计算D中样本最多的类
            myTree[bestFeatLabel][value] = majorityCnt(classList)

    # 后剪枝，如果到达叶子节点，尝试剪枝。
    # 计算未剪枝时，测试集的正确数
    numNoPrune = 0.0
    for value in uniqueVals:
        subDataSet = splitDataSet(dataSet, bestFeat, value)
        if len(subDataSet) != 0:
            subPrune = splitDataSet(pruneSet, bestFeat, value)
            numNoPrune += cntAccNums(subDataSet, subPrune)
    # 计算剪枝后，测试集正确数
    numPrune = cntAccNums(dataSet, pruneSet)
    # 比较决定是否剪枝, 如果剪枝后该节点上测试集的正确数变多了，则剪枝。
    if numNoPrune < numPrune:
        return majorityCnt(dataSet[:, -1])  # 直接返回节点上训练数据的多数类为节点类。

    return myTree


def main():
    dataSet, trainData, pruneData, labelList = getDataSet()
    # 用训练集训练一颗树并画图
    myTree = createTree(trainData, labelList)
    print(myTree)
    # createPlot(myTree)
    # 画预剪枝树
    preTree = prePruning(trainData, pruneData, labelList)
    # createPlot(preTree)
    # 画后剪枝树
    postPTree = postPruning(trainData, pruneData, labelList)
    print(preTree)
    # createPlot(postPTree)
    # 计算未剪枝的精度
    print(f"full tree's train accuracy = {calAccuracy(trainData, labelList, myTree)},"
          f"test accuracy = {calAccuracy(pruneData, labelList, myTree)}\n")
    # 计算预剪枝精度
    print(f"pre pruning tree's train accuracy = {calAccuracy(trainData, labelList, myTree)},"
          f"test accuracy = {calAccuracy(pruneData, labelList, preTree)}\n")
    # 计算后剪枝精度
    print(f"post pruning tree's train accuracy = {calAccuracy(trainData, labelList, myTree)},"
          f"test accuracy = {calAccuracy(pruneData, labelList, postPTree)}\n")

if __name__ == '__main__':
    main()



{'色泽': {'浅白': '坏瓜', '青绿': {'敲声': {'沉闷': '坏瓜', '浊响': '好瓜', '清脆': '坏瓜'}}, '乌黑': {'根蒂': {'硬挺': '好瓜', '蜷缩': '好瓜', '稍蜷': {'纹理': {'清晰': '坏瓜', '模糊': '好瓜', '稍糊': '好瓜'}}}}}}
base accuracy = 0.42857142857142855
base accuracy = 0.5
base accuracy = 0.5
{'色泽': {'浅白': '坏瓜', '青绿': '好瓜', '乌黑': '好瓜'}}
full tree's train accuracy = 1.0,test accuracy = 0.2857142857142857

pre pruning tree's train accuracy = 1.0,test accuracy = 0.5714285714285714

post pruning tree's train accuracy = 1.0,test accuracy = 0.5714285714285714

