# In [14]:
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
from math import log
import operator
import pickle

def calEnt(dataset):
    """Return the empirical (Shannon) entropy, in bits, of the class labels.

    The class label of each sample is read from its last element
    (``sample[-1]``).  An empty dataset yields ``0.0``.
    """
    total = len(dataset)
    # Tally how many samples carry each class label.
    tally = {}
    for sample in dataset:
        label = sample[-1]
        tally[label] = tally.get(label, 0) + 1
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total  # relative frequency of this label
        entropy -= p * log(p, 2)
    return entropy

def creatDataSet():
    """Build the 15-row toy dataset used by the demo.

    Returns:
        A ``(dataSet, labels)`` pair.  Every row of ``dataSet`` is a list of
        four integer-coded feature values followed by the class label
        (``'yes'`` or ``'no'``).  ``labels`` names the four feature columns
        in order (age, has a job, owns a house, credit rating).
    """
    labels = ['年龄', '是否有工作', '是否有自己的房子', '信贷情况']  # feature names
    rows = (
        (0, 0, 0, 0, 'no'),
        (0, 0, 0, 1, 'no'),
        (0, 1, 0, 1, 'yes'),
        (0, 1, 1, 0, 'yes'),
        (0, 0, 0, 0, 'no'),
        (1, 0, 0, 0, 'no'),
        (1, 0, 0, 1, 'no'),
        (1, 1, 1, 1, 'yes'),
        (1, 0, 1, 2, 'yes'),
        (1, 0, 1, 2, 'yes'),
        (2, 0, 1, 2, 'yes'),
        (2, 0, 1, 1, 'yes'),
        (2, 1, 0, 1, 'yes'),
        (2, 1, 0, 2, 'yes'),
        (2, 0, 0, 0, 'no'),
    )
    # Rows are handed out as lists, matching what the other helpers expect.
    dataSet = [list(row) for row in rows]
    return dataSet, labels

def splitDataSet(dataset, axis, value):
    """Select the rows whose column ``axis`` equals ``value``.

    The matching rows are returned as new lists with column ``axis``
    removed; ``dataset`` itself is left untouched.
    """
    matching = (row for row in dataset if row[axis] == value)
    return [row[:axis] + row[axis + 1:] for row in matching]

def selectBestFeature(dataset):
    """Choose the feature column with the largest information gain.

    Each row of ``dataset`` holds the feature values followed by the class
    label in the last position.

    Returns:
        The index of the feature whose information gain (entropy of the
        whole dataset minus the conditional entropy after splitting on that
        feature) is largest.  Ties go to the lowest index.  ``-1`` is
        returned when the dataset is empty or when no feature has a
        strictly positive gain.
    """
    if not dataset:
        return -1
    numFeatures = len(dataset[0]) - 1  # last column is the class label
    baseEnt = calEnt(dataset)
    dataset_len = float(len(dataset))
    bestInfoGain = 0.0
    bestFeatureIndex = -1
    for i in range(numFeatures):
        uniqueFeat = set(example[i] for example in dataset)
        conEnt = 0.0  # conditional entropy of the labels given feature i
        for value in uniqueFeat:
            subDataSet = splitDataSet(dataset, i, value)
            prob = len(subDataSet) / dataset_len
            conEnt += prob * calEnt(subDataSet)
        # The gain is only meaningful once every value of the feature has
        # been accumulated; comparing a partial sum would overstate it.
        infoGain = baseEnt - conEnt
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeatureIndex = i
    return bestFeatureIndex
    

def mostCnt(classList):
    """Return the label that occurs most often in ``classList``.

    When several labels share the highest count, the one that appears
    first in ``classList`` wins.  An empty list raises ``IndexError``.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # A stable descending sort keeps first-seen labels ahead on ties.
    ranked = sorted(tally, key=tally.get, reverse=True)
    return ranked[0]


def createTree(dataset, labels, featLabels):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataset: Non-empty list of rows; each row holds feature values
            followed by the class label in the last position.
        labels: Feature names, one per feature column of ``dataset``.
            This list is not modified.
        featLabels: Output list; the label of every feature chosen for a
            split is appended to it, in the order the splits are made.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{feature_label: {feature_value: subtree_or_class_label, ...}}``.
    """
    classList = [example[-1] for example in dataset]
    if classList.count(classList[0]) == len(classList):
        # Every sample has the same class: nothing left to split.
        return classList[0]
    if len(dataset[0]) == 1 or len(labels) == 0:
        # No features left: fall back to the majority class.
        return mostCnt(classList)
    bestFeat = selectBestFeature(dataset)
    if bestFeat < 0:
        # No feature separates the classes any further; using the sentinel
        # as an index would split on the class column itself.
        return mostCnt(classList)
    bestFeatLabel = labels[bestFeat]
    featLabels.append(bestFeatLabel)
    # Build a fresh label list for the children instead of deleting from
    # ``labels``: sibling branches must all see the same labels, aligned
    # with the columns that remain after the split.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    for value in set(example[bestFeat] for example in dataset):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataset, bestFeat, value), subLabels, featLabels)
    return myTree

def getNumLeafs(myTree):
    """Count the leaf nodes of a decision tree.

    ``myTree`` has the form ``{feature_label: {value: subtree_or_leaf}}``.
    Any child that is a ``dict`` (including dict subclasses) is treated as
    a subtree; every other child counts as one leaf.
    """
    rootLabel = next(iter(myTree))
    numLeafs = 0
    for child in myTree[rootLabel].values():
        if isinstance(child, dict):
            numLeafs += getNumLeafs(child)
        else:
            numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    """Return the number of decision levels in a decision tree.

    ``myTree`` has the form ``{feature_label: {value: subtree_or_leaf}}``.
    A node whose children are all leaves has depth 1; a node with no
    children has depth 0.  Any child that is a ``dict`` (including dict
    subclasses) is treated as a subtree.
    """
    rootLabel = next(iter(myTree))
    depths = [
        getTreeDepth(child) + 1 if isinstance(child, dict) else 1
        for child in myTree[rootLabel].values()
    ]
    return max(depths, default=0)

def classify(inputTree, featLabels, testVec):
    """Classify one sample by walking the decision tree.

    Args:
        inputTree: Tree of the form
            ``{feature_label: {value: subtree_or_class_label}}``.
        featLabels: Feature labels; the position of a label in this list
            is the position of that feature's value in ``testVec``.
        testVec: Feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that the sample reaches.

    Raises:
        ValueError: If a feature label of the tree is missing from
            ``featLabels``, or if the sample's value for a feature has no
            branch in the tree.
    """
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featValue = testVec[featLabels.index(firstStr)]
    for key, subtree in secondDict.items():
        if featValue == key:
            if isinstance(subtree, dict):
                return classify(subtree, featLabels, testVec)
            return subtree
    # Without this, an unseen feature value fell through to an unbound
    # local variable instead of a meaningful error.
    raise ValueError(
        'no branch for value {!r} of feature {!r}'.format(featValue, firstStr))

def storeTree(inputTree, filename):
    """Pickle ``inputTree`` into the file at ``filename``.

    The file is opened in ``'wb'`` mode, so existing content is replaced.
    """
    with open(filename, 'wb') as sink:
        pickler = pickle.Pickler(sink)
        pickler.dump(inputTree)
        
def grabTree(filename):
    """Load and return the pickled object stored in the file ``filename``.

    The file is closed before returning, whether or not loading succeeds.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)


if __name__ == '__main__':
    # Demo: train a tree on the bundled dataset and classify one sample.
    dataset, labels = creatDataSet()
    featLabels = []  # createTree appends the label of each feature it splits on
    myTree = createTree(dataset, labels, featLabels)
    # classify() finds each feature's value by the position of its label in
    # featLabels, so testVec lists values in that order.
    testVec = [0, 0]
    result = classify(myTree, featLabels, testVec)
    print('放贷' if result == 'yes' else '不放贷')


# Output: 不放贷
