In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import math
from math import log
import pprint
import seaborn as sns
import warnings; warnings.filterwarnings(action='once')

# Shared font sizes used by the matplotlib defaults below.
large, med, small = 22, 16, 12

# Global matplotlib defaults for every figure in this notebook.
# 'axes.titlesize' used to be listed twice (first `large`, then `med`); in a
# dict literal the later entry wins, so only the effective value is kept.
params = {
    'legend.fontsize': med,
    'figure.figsize': (16, 10),
    'axes.labelsize': med,
    'axes.titlesize': med,
    'xtick.labelsize': med,
    'ytick.labelsize': med,
    'figure.titlesize': large,
}
plt.rcParams.update(params)

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so try the new name first and fall
# back to the legacy one for older installations.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    try:
        plt.style.use(style_name)
    except OSError:
        continue
    break
else:
    warnings.warn('No seaborn whitegrid matplotlib style is available; '
                  'keeping the current style.')
sns.set_style("white")
%matplotlib inline


In [3]:
# Textbook exercise 5.1
def create_data():
    """Return the 15-row sample table together with its column names.

    Returns:
        tuple: ``(datasets, labels)``. ``datasets`` is a list of rows, each a
        list of five strings; ``labels`` is the list of the five column names
        in the same order as the row fields (the last one is the class column).
        A fresh pair of lists is built on every call.
    """
    column_names = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']
    rows = [
        # first age group
        ['青年', '否', '否', '一般', '否'],
        ['青年', '否', '否', '好', '否'],
        ['青年', '是', '否', '好', '是'],
        ['青年', '是', '是', '一般', '是'],
        ['青年', '否', '否', '一般', '否'],
        # second age group
        ['中年', '否', '否', '一般', '否'],
        ['中年', '否', '否', '好', '否'],
        ['中年', '是', '是', '好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        # third age group
        ['老年', '否', '是', '非常好', '是'],
        ['老年', '否', '是', '好', '是'],
        ['老年', '是', '否', '好', '是'],
        ['老年', '是', '否', '非常好', '是'],
        ['老年', '否', '否', '一般', '否'],
    ]
    # Hand back the data rows and the name of each column.
    return rows, column_names

In [4]:
# Materialise the sample table and wrap it in a DataFrame whose columns are
# the names returned alongside the rows; then preview the first rows.
datasets, labels = create_data()
df = pd.DataFrame(datasets, columns=labels)
print(df.head())

   年龄 有工作 有自己的房子 信贷情况 类别
0  青年   否      否   一般  否
1  青年   否      否    好  否
2  青年   是      否    好  是
3  青年   是      是   一般  是
4  青年   否      否   一般  否


In [5]:
# Encode the categorical training data as integer codes.
# Every code is an int: the credit column used to be mapped to a mix of
# strings and ints ('1', 2, '3'), which left that column with mixed types.
feature_codes = {
    '年龄': {'青年': 1, '中年': 2, '老年': 3},
    '有工作': {'是': 1, '否': 0},
    '有自己的房子': {'是': 1, '否': 0},
    '信贷情况': {'一般': 1, '好': 2, '非常好': 3},
    '类别': {'是': 1, '否': 0},
}
# Nested dict form: {column: {old_value: new_value}}.
df = df.replace(feature_codes)
print(df.head())

   年龄  有工作  有自己的房子 信贷情况  类别
0   1    0       0    1   0
1   1    0       0    2   0
2   1    1       0    2   1
3   1    1       1    1   1
4   1    0       0    1   0


In [13]:
# Empirical entropy of the class labels (whole-dataset level)
def calc_entropy(datasets):
    """Compute the empirical entropy of the class labels in ``datasets``.

    The class label of a row is its last element. The natural logarithm is
    used, so the value is expressed in nats.

    Args:
        datasets: Iterable, sized collection of rows (e.g. a list of lists or
            a 2-D array); only ``row[-1]`` of each row is read.

    Returns:
        ``sum(-p_k * ln(p_k))`` over the relative label frequencies ``p_k``;
        ``0`` for an empty collection. The value is also printed.
    """
    m = len(datasets)
    label_count = Counter(row[-1] for row in datasets)
    # Negate each term instead of the finished sum: a pure dataset then
    # yields 0.0 rather than the confusing -0.0.
    entropy = sum(-(count / m) * log(count / m) for count in label_count.values())
    print('calc_entropy:', entropy)
    return entropy
# Entropy of the class column over the whole sample table; as the last
# expression of the cell, the returned value is also displayed.
calc_entropy(datasets)

calc_entropy: 0.6730116670092565


0.6730116670092565

In [14]:
# Empirical conditional entropy
def condition_entropy(datasets, axis=0):
    """Compute the empirical conditional entropy of the labels given one column.

    Rows are grouped by the value found at position ``axis``; the entropy of
    each group (computed by ``calc_entropy``) is weighted by the group's share
    of all rows and the weighted terms are summed.

    Args:
        datasets: Indexable, sized collection of rows.
        axis: Position within a row of the value to group by (default 0).

    Returns:
        The weighted sum described above. The value is also printed.
    """
    total = len(datasets)
    groups = {}
    for row_idx in range(total):
        row = datasets[row_idx]
        # Collect every row under the value it carries at `axis`.
        groups.setdefault(row[axis], []).append(row)
    weighted_terms = [len(group) / total * calc_entropy(group) for group in groups.values()]
    result = sum(weighted_terms)
    print('condition_entropy:', result)
    return result
# Conditional entropy of the class column given the first column
# (axis defaults to 0); the returned value is also displayed by the cell.
condition_entropy(datasets)

calc_entropy: 0.6730116670092565
calc_entropy: 0.6730116670092565
calc_entropy: 0.5004024235381879
condition_entropy: 0.6154752525189002


0.6154752525189002

In [15]:
 # 信息增益
def info_gain(entropy, condition_ent):
    """Return the information gain: ``entropy`` minus ``condition_ent``."""
    gain = entropy - condition_ent
    return gain

def compute_bestfeature(df):
    """Print the information gain of every feature and return the best one.

    Every column of ``df`` except the last is treated as a feature; the last
    column is the class label read by the entropy helpers.

    Args:
        df: DataFrame whose last column holds the class labels.

    Returns:
        The name of the feature column with the largest information gain
        (the first such column on ties). Previously the winner was only
        printed and the function returned ``None``.

    Raises:
        ValueError: If ``df`` has no feature column besides the label column.
    """
    features = df.columns
    data = df.values
    if len(features) < 2:
        # Fail with a clear message instead of max()'s "empty sequence" error.
        raise ValueError('df needs at least one feature column plus the label column')
    entropy = calc_entropy(data)
    feature_gains = {}
    for i, name in enumerate(features[:-1]):
        condition_ent = condition_entropy(data, axis=i)
        feature_gain = info_gain(entropy, condition_ent)
        feature_gains[name] = feature_gain
        print('特征({}) - info_gain - {:.3f}'.format(name, feature_gain))
    best_feature = max(feature_gains.items(), key=lambda item: item[1])
    print('特征({})的信息增益最大，选择为根节点特征'.format(best_feature[0]))
    return best_feature[0]
    
# Rank the feature columns of df by information gain (last column = class).
compute_bestfeature(df)

calc_entropy: 0.6730116670092565
calc_entropy: 0.6730116670092565
calc_entropy: 0.6730116670092565
calc_entropy: 0.5004024235381879
condition_entropy: 0.6154752525189002
特征(年龄) - info_gain - 0.058
calc_entropy: 0.6730116670092565
calc_entropy: -0.0
condition_entropy: 0.4486744446728377
特征(有工作) - info_gain - 0.224
calc_entropy: 0.6365141682948128
calc_entropy: -0.0
condition_entropy: 0.38190850097688767
特征(有自己的房子) - info_gain - 0.291
calc_entropy: 0.5004024235381879
calc_entropy: 0.6365141682948128
calc_entropy: -0.0
condition_entropy: 0.4214064751639877
特征(信贷情况) - info_gain - 0.252
特征(有自己的房子)的信息增益最大，选择为根节点特征


In [16]:
# Build a decision tree with the ID3 algorithm (Example 5.3)
class TreeNode:
    """A node of the decision tree.

    Attributes:
        root: Flag supplied by the caller (defaults to True); this class only
            stores it. Presumably marks non-leaf nodes -- TODO confirm.
        label: Value stored for the node (presumably the predicted class of a
            leaf -- TODO confirm).
        feature_name: Name associated with ``feature``; stored only.
        feature: Value stored for the node (presumably the index of the
            splitting feature -- TODO confirm).
        tree: Dict mapping a value to the child ``TreeNode`` added for it.
        result: Dict summary used by ``__repr__``. ``label`` and ``feature``
            are captured at construction time; ``tree`` is the live dict, so
            children added later show up.
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        self.tree = {}
        # The key used to be spelled 'label:' (stray colon), which leaked
        # into the printed representation; it is now plain 'label'.
        self.result = {'label': self.label, 'feature': self.feature, 'tree': self.tree}

    # Printable string representation of the object
    def __repr__(self):
        return '{}'.format(self.result)

    def addNode(self, val, node):
        """Attach ``node`` as the child stored under key ``val``."""
        self.tree[val] = node

class DTree:
    """Decision-tree container with its entropy helpers.

    The helpers are static methods, so they can be called on the class
    (``DTree.calc_entropy(rows)``) as well as on an instance. They were
    previously plain functions in the class body without ``self``, which made
    every call through an instance fail with a ``TypeError``.
    """

    def __init__(self, epsilon=0.1):
        # Stored only; nothing in this class reads it yet (presumably an
        # information-gain threshold -- TODO confirm).
        self.epsilon = epsilon
        self._tree = {}

    # Empirical entropy of the class labels (whole-dataset level)
    @staticmethod
    def calc_entropy(datasets):
        """Return the entropy (natural log, i.e. nats) of the last-column labels.

        Args:
            datasets: Iterable, sized collection of rows; only ``row[-1]`` is read.

        Returns:
            ``sum(-p_k * ln(p_k))`` over the relative label frequencies;
            ``0`` for an empty collection.
        """
        m = len(datasets)
        label_count = Counter(row[-1] for row in datasets)
        # Negating each term (not the finished sum) gives 0.0 instead of -0.0
        # for a dataset with a single label.
        return sum(-(count / m) * log(count / m) for count in label_count.values())

    # Empirical conditional entropy
    @staticmethod
    def condition_entropy(datasets, axis=0):
        """Return the label entropy conditioned on the value at position ``axis``.

        Rows are grouped by ``row[axis]``; each group's entropy is weighted by
        the group's share of all rows.
        """
        m = len(datasets)
        feature_sets = {}
        for row in datasets:
            feature_sets.setdefault(row[axis], []).append(row)
        # Use the class's own helper rather than a module-level function of
        # the same name, so the class does not depend on notebook globals.
        return sum(len(p) / m * DTree.calc_entropy(p) for p in feature_sets.values())
    
    

SyntaxError: unexpected EOF while parsing (<ipython-input-16-b882f89adc1b>, line 4)