In [1]:
import math
import numpy as np
import pandas as pd
import operator


In [2]:
def testData():
    dataset = {
            "F1": [1,1,1,0,0],
            "F2":[1,1,0,0,1],
            "Y":["yes","yes","no","no","no"]
            }

    dataset = pd.DataFrame(dataset)
    return dataset

In [3]:
dataset = testData()
dataset

Unnamed: 0,F1,F2,Y
0,1,1,yes
1,1,1,yes
2,1,0,no
3,0,0,no
4,0,1,no


In [4]:
def calculate_Ent(dataset):
    """
    求当前数据集Y的信息熵
    永远求的是Y|(条件)的熵
    """
    totalNum = len(dataset)
    labelCounts = {}
    for line in dataset:
        # 记录不同的类型（Y的种类）
        currentLabel = line[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
            
        labelCounts[currentLabel] += 1
        entropy = 0.0
        for key in labelCounts:
            prob = float(labelCounts[key])/totalNum
            entropy -= prob * math.log(prob,2)
    
    return entropy

In [5]:
def splitdata(dataset, index, value):
    """
    求含有当前特征(index(str))取值(value)的数据集子集
    为Y|X_index == value的分布情况
    输入的dataset为pd.DataFrame类型

    """
    totalNum = len(dataset)
    line_label = []
    # dataset.drop(data)
    for i in range(totalNum):
        if dataset.iloc[i,:][index] == value:
            line_label.append(i)
    
    splitdataset = dataset.drop(index = line_label)
    splitdataset = splitdataset.drop(columns = index)
    
    return splitdataset

In [6]:
splitdata(dataset,"F1",1)

Unnamed: 0,F2,Y
3,0,no
4,1,no


In [7]:
def choosebestfeature(dataset):
    numOfFeature = len(dataset.columns)-1
    baseEnt = calculate_Ent(dataset)
    maxInformationGain = 0.0
    bestFeatureIndex = -1
    
    for i in range(numOfFeature):
        index = dataset.columns[i] # index记录的是特征名称(str)
        featurelist = [] # 记录当前feature的所有取值
        for j in range(len(dataset.index)):
            line = dataset.iloc[j,:].values
            if line[i] not in featurelist:
                featurelist.append(line[i])
        
            newEnt = 0.0
            
        for feature in featurelist:
            # Σ(P(X_index == feature) * H(Y|X_index == feature))
            # P(X_index == feature) : prob
            # H(Y|X_index == feature)): calculateEntropy(splitdataset)
            
            splitdataset = splitdata(dataset, index, feature)    
            prob = len(splitdataset)/len(dataset)
            newEnt += calculate_Ent(splitdataset) * prob
        
        informationGain = baseEnt-newEnt
        if informationGain > maxInformationGain:
            maxInformationGain = informationGain
            bestFeatureIndex=index
    
    return bestFeatureIndex

In [8]:
choosebestfeature(dataset)

'F1'

In [11]:
def voteresult(classlist):
    classcount = {}
    for value in classlist:
        if value not in classcount:
            classCounts[value] = 0
        classCounts[value] += 1
        
    sortClassCounts=sorted(classCounts.iteritems(),
                           key=operator.itemgetter(1),
                           reverse=True)
    return sortClassCounts[0][0]        
        

In [27]:
def CreateDecisionTree(dataset):
    classlist = [i for i in dataset["Y"]]
    # 如果D中样本全属于同一类别(只有一个Y)
    if len(set(classlist)) ==  1: # set(***)会去除重复的量，
        return classlist[0]
    # 如果,所有属性都用过了，即只剩有Y
    if len(dataset.loc[0,:].values) == 1:
        return voteresult(classlist)
    # 选取最优划分特征
    bestFeatureIndex=choosebestfeature(dataset)
    # 设立特征树
    DecisionTree={bestFeatureIndex:{}}
    
    Featurelist = []
    #求当前特征下，所有可能值的集合
    for i in range(len(dataset)):
        Featurelist.append(dataset.iloc[i,:][bestFeatureIndex])
    uniqueFeatureList = set(Featurelist)
    
    for value in uniqueFeatureList:
        splitdataset=splitdata(dataset,bestFeatureIndex,value)
        DecisionTree[bestFeatureIndex][value] = CreateDecisionTree(splitdataset)
    
    return DecisionTree

In [28]:
CreateDecisionTree(dataset)

{'F1': {0: {'F2': {0: 'yes', 1: 'no'}}, 1: 'no'}}

In [18]:
dataset.iloc[1,:]["F2"]

1