In [2]:
import numpy as np
import pandas as pd
from collections import Counter

In [3]:
# Load the raw CSV into a DataFrame and display it for a quick visual check.
# NOTE(review): the saved output shows an 'Unnamed: 0' column and 10 rows, while
# the `dataset` shown further down has 14 rows and no such column — the saved
# outputs look stale; confirm by re-running on a fresh kernel.
data = pd.read_csv('example_data.csv')
data

Unnamed: 0,humility,outlook,temp,windy,play
0,high,sunny,hot,no,no
1,high,sunny,hot,yes,no
2,high,overcast,hot,no,yes
3,high,rainy,mild,no,yes
4,normal,rainy,cool,no,yes
5,normal,rainy,cool,yes,no
6,normal,overcast,cool,yes,yes
7,high,sunny,mild,no,no
8,normal,sunny,cool,no,yes
9,normal,rainy,mild,no,yes


In [4]:
def createDataset(path='example_data.csv'):
    """
    Load the training table from a CSV file.

    :param path: CSV file to read. Defaults to 'example_data.csv', the file
        this notebook originally hard-coded, so existing zero-argument calls
        behave exactly as before.
    :return: (dataSet, features) where dataSet is a 2-D list with one inner
        list per CSV row and the LAST element of each row is the class label
        (the tree-building functions read the label via row[-1]); features is
        the list of column names, label column included.
    """
    df = pd.read_csv(path)
    dataSet = df.values.tolist()
    features = df.columns.tolist()
    return dataSet, features

In [5]:
# Build the list-of-rows dataset and the column-name list used by the tree code below.
dataset, features = createDataset()

In [48]:
# Inspect the rows returned by createDataset() (last element of each row is the label).
dataset

[['high', 'sunny', 'hot', 'no', 'no'],
 ['high', 'sunny', 'hot', 'yes', 'no'],
 ['high', 'overcast', 'hot', 'no', 'yes'],
 ['high', 'rainy', 'mild', 'no', 'yes'],
 ['normal', 'rainy', 'cool', 'no', 'yes'],
 ['normal', 'rainy', 'cool', 'yes', 'no'],
 ['normal', 'overcast', 'cool', 'yes', 'yes'],
 ['high', 'sunny', 'mild', 'no', 'no'],
 ['normal', 'sunny', 'cool', 'no', 'yes'],
 ['normal', 'rainy', 'mild', 'no', 'yes'],
 ['normal', 'sunny', 'mild', 'yes', 'yes'],
 ['high', 'overcast', 'mild', 'yes', 'yes'],
 ['normal', 'overcast', 'hot', 'no', 'yes'],
 ['high', 'rainy', 'mild', 'yes', 'no']]

In [49]:
# Inspect the column names returned by createDataset().
features

['humility', 'outlook', 'temp', 'windy', 'play']

In [50]:
# Sorted distinct values over the whole (flattened) table — feature values and labels mixed together.
np.unique(dataset)

array(['cool', 'high', 'hot', 'mild', 'no', 'normal', 'overcast', 'rainy',
       'sunny', 'yes'], dtype='<U8')

In [6]:
def calEntropy(dataset):
    """
    Shannon entropy (base 2) of the class labels in ``dataset``.

    :param dataset: 2-D list; the last element of every row is its label.
    :return: -sum(p * log2(p)) over the label frequencies; 0.0 when
        ``dataset`` is empty.
    """
    total_rows = len(dataset)
    # Tally how often each label occurs (Counter keeps first-seen order).
    label_counts = Counter(row[-1] for row in dataset)
    entropy = 0.0
    for count in label_counts.values():
        fraction = count / total_rows
        entropy -= fraction * np.log2(fraction)
    return entropy

In [52]:
# Entropy of the label column over the full dataset (the baseline that chooseBestFeature compares against).
calEntropy(dataset)

0.9402859586706311

In [7]:
def splitDataset(dataset, col, value):
    """
    Select the rows whose entry in column ``col`` equals ``value`` and
    return them with that column removed.

    :param dataset: 2-D list of rows.
    :param col: index of the column to test and drop.
    :param value: value the column must equal for a row to be kept.
    :return: new list of new row lists; ``dataset`` itself is not modified.
    """
    return [
        row[:col] + row[col + 1:]
        for row in dataset
        if row[col] == value
    ]

In [54]:
# Rows whose column 0 equals 'high', with column 0 removed.
splitDataset(dataset, 0, 'high')

[['sunny', 'hot', 'no', 'no'],
 ['sunny', 'hot', 'yes', 'no'],
 ['overcast', 'hot', 'no', 'yes'],
 ['rainy', 'mild', 'no', 'yes'],
 ['sunny', 'mild', 'no', 'no'],
 ['overcast', 'mild', 'yes', 'yes'],
 ['rainy', 'mild', 'yes', 'no']]

In [8]:
def chooseBestFeature(dataset):
    """
    Pick the feature column with the highest information gain ratio (C4.5).

    For feature column i with distinct values v and p_v = |D_v| / |D|:

        gain      = H(D) - sum_v p_v * H(D_v)
        splitInfo = -sum_v p_v * log2(p_v)
        gainRatio = gain / splitInfo

    :param dataset: non-empty 2-D list; the last element of each row is the
        label, every other position is a feature column.
    :return: index of the column with the largest strictly positive gain
        ratio, or -1 when no column qualifies (no feature columns, or every
        column is single-valued / has zero gain).
    :raises IndexError: if ``dataset`` is empty (``dataset[0]`` is read).
    """
    numFeatures = len(dataset[0]) - 1   # number of feature columns
    baseEntropy = calEntropy(dataset)   # entropy before any split
    totalRows = len(dataset)
    bestGainRatio, bestFeature = 0, -1
    for i in range(numFeatures):
        # Distinct values of column i in first-seen order. Unlike iterating a
        # set of strings, this order is the same on every run, so the
        # floating-point sums below are reproducible.
        featureValueUniques = list(dict.fromkeys(featureVec[i] for featureVec in dataset))
        newEntropy = 0.0
        splitInfo = 0.0
        for featureValue in featureValueUniques:
            split_result = splitDataset(dataset, i, featureValue)
            # Fraction of rows that take this value in column i
            prob = len(split_result) / totalRows
            # Weighted entropy of the partitions induced by column i
            newEntropy += prob * calEntropy(split_result)
            # Split information must be ACCUMULATED over all values; plain
            # assignment here would keep only the last value's term.
            splitInfo -= prob * np.log2(prob)
        if splitInfo == 0:
            # Column has a single distinct value: it cannot split the data
            # and the ratio would be 0/0, so skip it.
            continue
        # Information gain ratio of column i
        gainRatio = (baseEntropy - newEntropy) / splitInfo
        # Greedy choice: keep the column with the largest gain ratio so far
        if gainRatio > bestGainRatio:
            bestGainRatio, bestFeature = gainRatio, i
    return bestFeature

In [9]:
def createDecisionTree(dataset, features):
    """
    Recursively build a decision tree as nested dicts.

    :param dataset: non-empty 2-D list; last element of each row is the label.
    :param features: column names aligned with the columns of ``dataset``.
        This sequence is NOT modified, so the same list can be reused across
        repeated calls (e.g. when the notebook cell is re-run).
    :return: a label value for a leaf, otherwise
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    # 1. All rows carry the same label -> leaf
    labels = [featureVec[-1] for featureVec in dataset]
    if len(set(labels)) == 1:
        return labels[0]
    # 2. No feature columns left (only the label column) -> majority vote
    if len(dataset[0]) == 1:
        return majority(labels)
    # 3. Otherwise choose the best feature to split on
    bestFeature = chooseBestFeature(dataset)    # column index
    if bestFeature == -1:
        # No feature is informative. Indexing features[-1] would pick the
        # label column's name as a split node, so fall back to majority vote.
        return majority(labels)
    bestFeatureValue = features[bestFeature]    # column name
    # Root of this subtree is the chosen feature
    myTree = {bestFeatureValue: {}}
    # Feature names for the children: everything except the chosen column.
    # Built as a new sequence instead of `del features[...]` so the caller's
    # list is left untouched.
    subFeatures = features[:bestFeature] + features[bestFeature + 1:]
    # Distinct values of the chosen feature, in first-seen order so that the
    # branch order of the resulting dict is the same on every run
    featureValues = [featureVec[bestFeature] for featureVec in dataset]
    featureUniqueValues = list(dict.fromkeys(featureValues))
    # One branch per value of the chosen feature
    for featureVal in featureUniqueValues:
        myTree[bestFeatureValue][featureVal] = createDecisionTree(
            splitDataset(dataset, bestFeature, featureVal), subFeatures)

    return myTree

In [10]:
def majority(labels):
    """
    Return the most frequent label in ``labels``.

    Ties go to the label that appears first. Raises IndexError when
    ``labels`` is empty.
    """
    ranked = Counter(labels).most_common()
    top_label, _ = ranked[0]
    return top_label




In [11]:
# Pass a copy of `features`: the tree builder may delete entries from the list
# it receives, which would corrupt the notebook-level `features` and make this
# cell give different results when re-run.
createDecisionTree(dataset, features[:])

{'outlook': {'sunny': {'humility': {'normal': 'yes', 'high': 'no'}},
  'overcast': 'yes',
  'rainy': {'windy': {'no': 'yes', 'yes': 'no'}}}}