In [1]:
import csv
import numpy as np

def readCsv(filepath):
    D = []
    with open(filepath, encoding='utf-8') as f:
        reader = csv.reader(f)
        for row in reader:
            D.append(row)
        f.close()
    return D

def extractFeatures(D):
    '''从数据集中提取每个特征（包括分类）的不同取值'''
    features = dict()
    for i in range(0, len(D[0])):
        tmp = []
        for j in range(0, len(D)):
            if D[j][i] not in tmp:
                tmp.append(D[j][i])
        if i != len(D[0]) - 1:
            features['A' + str(i+1)] = tmp
        else:
            features['C'] = tmp
    return features

def calFeaturePa(featureId, featureVal, D):
    '''统计每个特征不同取值的个数（如果除以总个数的话，就是分布了）'''
    pa = dict()
    for name in featureVal:
        num = 0
        for i in range(0, len(D)):
            if D[i][featureId] == name:
                num += 1
        pa[name] = num
    return pa

def entropy(pa):
    '''计算概率分布的熵， pa为概率分布'''
    sum = 0.0
    for p in pa:
        if p != 0:
            sum += p * np.log2(p)
        else:
            sum += 0
    return -1 * sum

def conditionEntropy(featureName, featureId, Features, D):
    '''计算条件熵'''
    Y = Features['C']
    pa_feature = calFeaturePa(featureId=featureId, featureVal=Features[featureName], D=D)
    sum = 0.0
    for x_i in Features[featureName]:
        H_Y_xi = []
        for y_i in Y:
            tmp = 0
            for i in range(0, len(D)):
                if D[i][featureId] == x_i and D[i][len(D[0])-1] == y_i:
                    tmp += 1
            H_Y_xi.append(tmp / pa_feature[x_i])
        sum += (pa_feature[x_i] / len(D)) * entropy(H_Y_xi)
    return sum

def gainInformation(D, featureName, featureId, Features, entropY):
    """计算信息增益"""
    ConditionEntropy = conditionEntropy(featureName=featureName, featureId=featureId, Features=Features, D=D)
    gi = entropY - ConditionEntropy
    return gi

def gainInformationRatio(D, featureName, featureId, Features, entropY):
    """计算信息增益比"""
    ConditionEntropy = conditionEntropy(featureName=featureName, featureId=featureId, Features=Features, D=D)
    gir = (entropY - ConditionEntropy) / ConditionEntropy
    return gir

def selectBestFeatures(D, Features):
    """从当前数据集D和特征集A中根据信息增益选择最优特征"""
    criterion = dict()
    featureId = 0
    entropY = entropy([val / len(D) for val in calFeaturePa(featureId=len(D[0])-1, featureVal=Features['C'], D=D).values()])
    bestName = ''
    bestVal = 0
    for name in Features.keys():
        if name != 'C':
            criterion[name] = gainInformation(D, name, featureId, Features, entropY)
            featureId += 1
            if bestVal < criterion[name]:
                bestVal = criterion[name]
                bestName = name
    return bestName, bestVal

def divDataSet(D, Feature, bestName):
    featureId = 0
    for key in Feature.keys():
        if key == bestName:
            break
        featureId += 1
    # div features
    divD = dict()
    for key in Feature[bestName]:
        tmp = []
        for i in range(0, len(D)):
            if D[i][featureId] == key:
                D[i].pop(featureId)
                tmp.append(D[i])
        divD[key] = tmp
    Feature.pop(bestName)
    return divD, Feature

def sampleClass(D):
    clc = dict()
    cid = len(D[0]) - 1
    for i in range(0, len(D)):
        if D[i][cid] not in clc:
            clc[D[i][cid]] = 1
        else:
            clc[D[i][cid]] += 1
    return clc

def sameClass(D):
    clc = sampleClass(D)
    if len(clc.keys()) == 1:
        return True
    return False

def label(D):
    clc = sampleClass(D)
    maxn = ''
    maxv = 0
    for key, value in clc.items():
        if maxv < value:
            maxv = value
            maxn = key
    return maxn, maxv

def showTree(parent, tree):
    if tree == None:
        return
    print(parent, ' -- ', tree.relation, '  --', tree.value)
    for i in range(0, len(tree.children)):
        showTree(tree.value, tree.children[i])

def MakeGraph(parent, tree, g, idx):
    if tree == None:
        return
    if idx > 1:
        if 'A' not in tree.value:
            g.node(name=parent+" "+tree.value)
            g.edge(parent, parent+" "+tree.value, tree.relation)
        else:
            g.node(name=tree.value)
            g.edge(parent, tree.value, tree.relation)
    for i in range(0, len(tree.children)):
        MakeGraph(tree.value, tree.children[i], g, idx + 1)

In [2]:
class Decision:
    def __init__(self, value, relation):
        self.value = value
        self.children = []
        self.relation = relation

In [3]:
def ID3Tree(D, Feature, relation):
    if sameClass(D) or len(Feature) == 1:
        name, value = label(D)
        return Decision(value=name, relation=relation)

    bestName, bestValue = selectBestFeatures(D, Feature)
    node = Decision(bestName, relation)
    divd, divf = divDataSet(D, Feature, bestName)
    Keys = [key for key in divd.keys()]
    for i in range(0, len(Keys)):
        node.children.append(ID3Tree(divd[Keys[i]], divf, node.value))
    return node


In [4]:
D = readCsv('G:/LiangHao/Master/Learning/MachineLearning/test.csv')
A = extractFeatures(D)
root = ID3Tree(D, A, None)

In [5]:
showTree(None, root)

None  --  None   -- A3
A3  --  A3   -- A2
A2  --  A2   -- no
A2  --  A2   -- yes
A3  --  A3   -- yes


In [7]:
from graphviz import Digraph
# 创建对象
g = Digraph('decisionTree')
MakeGraph('start', root, g, 1)
g.view()

'decisionTree.gv.pdf'

![](http://qiniu.lianghao.work/markdown/20220505095040.png)