In [1]:
# coding:utf-8

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets

import common

%matplotlib inline
# Apply the 'white' style and the CJK-capable font list in ONE call.
# Calling set_style('white') after set_style({...font...}) rebuilds the whole
# style dict (including 'font.sans-serif') and silently discards the font list,
# so the font override must be passed as the rc argument of the same call.
sns.set_style('white', {'font.sans-serif': ['simhei', 'Arial']})

## 决策树实现

In [2]:
# Entropy: H(D)
def calc_entropy(D):
    """Return the entropy H(D), in bits (log base 2), of the values in ``D``.

    Each distinct value's probability is its count, as reported by
    ``pandas.Series.value_counts``, divided by ``len(D)``.
    """
    class_counts = pd.Series(D).value_counts().values
    probs = class_counts / len(D)
    return (-probs * np.log2(probs)).sum()

# Conditional entropy: H(D|A)
def calc_cond_entropy(D, A):
    """Return the conditional entropy H(D|A).

    D: class labels of the training samples.
    A: values of one feature for the same samples.

    The samples are grouped by the value of ``A``; the result is the sum over
    groups of (group size / len(D)) * entropy of the labels in that group.
    """
    frame = pd.DataFrame({'D': D, 'A': A})
    by_value = frame.groupby('A')
    # H(D_i): label entropy inside each feature-value group.
    subset_entropy = by_value['D'].apply(calc_entropy)
    # p(D_i): share of the samples that falls into each group.
    subset_weight = by_value['A'].count() / len(D)
    weighted = subset_entropy * subset_weight
    return weighted.sum()
    
# Information gain: g(D, A)
def calc_info_gain(D, A):
    """Return the information gain g(D, A) = H(D) - H(D|A)."""
    prior_entropy = calc_entropy(D)
    posterior_entropy = calc_cond_entropy(D, A)
    return prior_entropy - posterior_entropy

# Information gain ratio: g_R(D, A)
def calc_info_gain_ratio(D, A):
    """Return the information gain ratio g_R(D, A) = g(D, A) / H_A(D).

    D: class labels of the training samples.
    A: values of one feature for the same samples.

    H_A(D) is the entropy of the feature values themselves. When the feature
    takes a single value in this sample set, H_A(D) is 0 and the ratio would
    be 0/0 (nan) or x/0 (inf); such a feature cannot split the data, so 0.0
    is returned instead.
    """
    # H_A(D): entropy of D with respect to the values of A.
    H_A_D = calc_entropy(A)
    if H_A_D == 0:
        # Constant feature: no split is possible, avoid dividing by zero.
        return 0.0
    return calc_info_gain(D, A) / H_A_D
    
class DecisionTree(object):
    """Decision tree for categorical features, built with ID3 or C4.5 scoring.

    The fitted tree is a nested dict of the form
    ``{'feature_<column label>': {feature value: subtree_or_class_label}}``.
    """

    def __init__(self, model_type='ID3'):
        """Remember the split criterion: 'ID3' (info gain) or 'C4.5' (gain ratio)."""
        self.model_type = model_type
        # Filled in by fit(); None until the model has been fitted.
        self.tree = None

    def _get_best_feature_index(self, features, labels, split_func):
        """Return the column label of the feature with the highest split score.

        ``split_func(labels, feature_column)`` is evaluated once per column.
        """
        scores = features.apply(lambda A: split_func(labels, A), axis=0)
        # idxmax() yields the column *label*. argmax() yields an integer
        # position on current pandas, which stops matching the label as soon
        # as a column has been removed higher up in the tree.
        return scores.idxmax()

    def _majority_class(self, labels):
        """Return the most frequent class label in ``labels``."""
        # idxmax() returns the label itself rather than its position.
        return pd.Series(labels).value_counts().idxmax()

    def _get_child_tree(self, data_df, best_feature, split_func):
        """Build one subtree per distinct value of ``best_feature``.

        Returns a dict mapping each feature value to the subtree (or leaf
        label) built from the matching rows, with ``best_feature`` removed.
        """
        remaining = data_df.columns[data_df.columns != best_feature]
        return {
            value: self._create_tree(subset[remaining], split_func)
            for value, subset in data_df.groupby(best_feature)
        }

    def _create_tree(self, data_df, split_func):
        """Recursively build the tree for ``data_df``.

        ``data_df`` holds the feature columns plus a label column named 'y'.
        Returns a class label for a leaf, otherwise a nested dict.
        """
        labels = data_df['y']
        classes = labels.unique()
        # Only one class left: stop splitting.
        if len(classes) == 1:
            return classes[0]

        features = data_df.drop(columns='y')
        # No feature left (only 'y' remains): fall back to the majority class.
        if features.shape[1] == 0:
            return self._majority_class(labels)

        # Pick the most informative feature for this subset.
        best_feature = self._get_best_feature_index(features, labels, split_func)

        # Split on best_feature and recurse into every branch.
        child_tree = self._get_child_tree(data_df, best_feature, split_func)

        return {'feature_' + str(best_feature): child_tree}

    def fit(self, X, y):
        """Build the tree from features ``X`` and labels ``y``.

        Both inputs are converted with ``np.array``; feature columns are named
        by their integer position. Returns ``self``.

        Raises:
            ValueError: if ``model_type`` is neither 'ID3' nor 'C4.5'.
        """
        self.X = np.array(X)
        self.y = np.array(y)
        data_df = pd.DataFrame(self.X).join(pd.DataFrame(self.y, columns=['y']))
        if self.model_type == 'ID3':
            self.tree = self._create_tree(data_df, calc_info_gain)
        elif self.model_type == 'C4.5':
            self.tree = self._create_tree(data_df, calc_info_gain_ratio)
        else:
            # Format the message string itself; applying % to the exception
            # object raises TypeError instead of the intended error.
            raise ValueError('model_type %s error.' % self.model_type)
        return self

    def get_tree(self):
        """Return the fitted tree (None if fit() has not been called)."""
        return self.tree
            
# DecisionTree().fit(X_train,y_train)

* 测试

In [3]:
# Test data from "Machine Learning in Action".
# Columns: no surfacing, flippers -> fish?
dataset = np.array([
    [1, 1, 'yes'],
    [1, 1, 'yes'],
    [1, 0, 'no'],
    [0, 1, 'no'],
    [0, 1, 'no'],
])
# The last column is the label; everything before it is a feature.
X_train, y_train = dataset[:, :-1], dataset[:, -1]

model = DecisionTree()
model.fit(X_train, y_train)
model.get_tree()

{'feature_0': {'0': 'no', '1': {'feature_1': {'0': 'no', '1': 'yes'}}}}

In [4]:
# Test data from "Statistical Learning Methods".
# Columns: age, has job, owns house, credit rating -> loan approved?
dataset = np.array([
    [1, 0, 0, 1, 0],
    [1, 0, 0, 2, 0],
    [1, 1, 0, 2, 1],
    [1, 1, 1, 1, 1],
    [1, 0, 0, 1, 0],
    [2, 0, 0, 1, 0],
    [2, 0, 0, 2, 0],
    [2, 1, 1, 2, 1],
    [2, 0, 1, 3, 1],
    [2, 0, 1, 3, 1],
    [3, 0, 1, 3, 1],
    [3, 0, 1, 2, 1],
    [3, 1, 0, 2, 1],
    [3, 1, 0, 3, 1],
    [3, 0, 0, 1, 0],
])
# The last column is the label; everything before it is a feature.
X_train, y_train = dataset[:, :-1], dataset[:, -1]

model = DecisionTree()
model.fit(X_train, y_train)
model.get_tree()

{'feature_2': {0: {'feature_1': {0: 0, 1: 1}}, 1: 1}}

* plot tree

In [6]:
def createPlot(inTree):
    """Clear matplotlib figure 1, draw ``inTree`` on a tick-less axes and show it.

    The axes is stored on ``createPlot.ax1`` and the layout values on
    ``plotTree`` attributes before ``plotTree`` is called.
    NOTE(review): plotTree, getNumLeafs and getTreeDepth are defined outside
    this excerpt; presumably plotTree reads these attributes while drawing.
    """
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    # Hide the ticks on both axes.
    no_ticks = {'xticks': [], 'yticks': []}
    createPlot.ax1 = plt.subplot(111, frameon=False, **no_ticks)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

    
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Annotate ``createPlot.ax1`` with ``nodeTxt``.

    The text box (styled by ``nodeType``) is centred at ``centerPt`` and the
    annotation points at ``parentPt``; both are in axes-fraction coordinates.
    The arrow style comes from the global ``arrow_args``, which is defined
    outside this excerpt.
    """
    createPlot.ax1.annotate(
        nodeTxt,
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )


TypeError: createPlot() missing 1 required positional argument: 'inTree'