# 决策树
- 首先实现用于分类任务的决策树，需要对数据进行预处理：
    1. 对label进行编码，由sklearn.LabelEncoder实现，在预测时将结果反编码
    2. 对离散属性编码，
    3. 对连续属性进行分段处理转化为category类型，然后编码
    4. 使用嵌套字典形式存储树结构
    5. 通过信息熵计算信息增益

## ID3算法

In [11]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine, load_breast_cancer

In [76]:
class TreeNode(object):
    def get_Information_entropy(self, labels_list, n_samples):
        """
        :labels_list, ndarray, 标签列表、数组
        :n_samples, int, 总的类别数
        计算信息熵
        """
        _, label_counts = np.unique(labels_list, return_counts=True)
        p = label_counts * 1.0 / n_samples
        return -np.sum(p * np.log2(p))

    def __init__(self, data_x, data_y, segmentation_attr, attr_is_dispersed):
        '''
        :data_x: ndarray, 标签化的数据集x
        :data_y: ndarray, 标签化的数据集y
        :segmentation_attr, list, 当前可用的分割属性下标列表，例如[1,5,7],表示对于当前节点只有1、5、7可以
        被用作属性分割
        :attr_is_dispersed, ndarray, 属性是否为离散的，例如[1,0,1,0,1],则表示下标0、2、4的属性是离散的，
        '''
        self.data_x = data_x
        self.data_y = data_y
        self.segmentation_attr = segmentation_attr
        self.next_nodes = {}  # 存储子节点
        self.is_leaf = None
        self.seg_attr_index = -1  # 在该节点选择的分割属性
        self.seg_attr_value = -1  # 分割属性值，如果是离散的，则返回属性唯一值序列，否则返回只包含一
        # 个分割点属性值的序列

        n_samples, n_features = data_x.shape
        # 当前节点数据数不大于10、无可用分割属性、数据标签y全部一致的情况下，认定为子节点
        uniques_y = np.unique(self.data_y, return_counts=False)
        if n_samples <= 1 or len(self.segmentation_attr) == 0 or len(uniques_y) == 1:
            self.is_leaf = True
        else:  # 非叶子节点
            gain = self.get_Information_entropy(self.data_y, n_samples)  # 熵
            self.is_leaf = False
            # 根据可用分割属性segmentation_attr，以及属性离散/连续记录表attr_is_dispersed来确定最优
            # 分割属性,默认采用信息增益的方式，ID3算法
            temp_gain = -1  # 保存最小的条件熵
            temp_attr_index = -1  # 保存按哪个属性分割可以得到最小的条件熵

            # 保存该属性分割点的值，如果是离散的，则返回的列表包括属性的唯一值序列，否则仅
            # 返回包含分割点的属性值
            temp_attr_seg = None
            # 如果优属性是连续的，则返回升序属性序列的分割点下标，方便后续计算
            temp_continus_attr_seg_index = -1
            for attr_index in self.segmentation_attr:
                if attr_is_dispersed[attr_index] == 1:  # 离散值
                    uniques_attr, uniques_attr_counts = np.unique(
                        self.data_x[:, attr_index], return_counts=True)
                    temp_information_gains = []
                    for cur_attr_label in uniques_attr:
                        cur_mask = (self.data_x[:, attr_index] == cur_attr_label)
                        temp_information_gains.append(
                            self.get_Information_entropy(self.data_y[cur_mask], np.sum(cur_mask)))
                    cur_gain = np.sum((uniques_attr_counts * 1.0 / n_samples) *
                                      temp_information_gains)
                    if temp_gain < (gain - cur_gain): # id3，信息增益
                        temp_gain = (gain - cur_gain)
                        temp_attr_index = attr_index
                        temp_attr_seg = uniques_attr
                else:  # 连续值,需要寻找一个最优的二分点来分割数据，需要进行n_samples-1次尝试
                    sort_index = np.argsort(self.data_x[:, attr_index])
                    temp_continus_attr_seg = -1
                    temp_continus_gain = np.inf
                    temp_sort_index = -1
                    for i in range(n_samples - 1):
                        temp_continus_seg_value = (data_x[i, attr_index] + \
                                                   data_x[i + 1, attr_index]) / 2.0
                        temp_continuous_left = self.get_Information_entropy(
                            data_y[sort_index[:i + 1]], i + 1)
                        temp_continuous_right = self.get_Information_entropy(
                            data_y[sort_index[i + 1:]], n_samples - i - 1)
                        # 计算连续属性的条件熵
                        cur_continus_gains = (i + 1.0) / n_samples * temp_continuous_left + \
                                             (n_samples - i - 1.0) / n_samples * temp_continuous_right
                        if temp_continus_gain > cur_continus_gains:
                            temp_continus_gain = cur_continus_gains
                            temp_continus_attr_seg = temp_continus_seg_value
                            temp_sort_index = i
                        cur_gain = temp_continus_gain
                        if temp_gain < (gain - cur_gain):  # id3，信息增益
                            temp_gain = (gain - cur_gain)
                            temp_attr_index = attr_index
                            temp_attr_seg = np.array([temp_continus_attr_seg])
                            temp_continus_attr_seg_index = temp_sort_index
            # 利用最优属性进行划分，并创造该节点的子节点，保存在self.next_nodes结构中

            self.seg_attr_index = temp_attr_index  # 该节点的分割属性（轴）的下标
            self.seg_attr_value = temp_attr_seg

            # 最优属性是离散值
            if attr_is_dispersed[temp_attr_index] == 1:
                self.segmentation_attr.remove(temp_attr_index)  # 从备用分割属性列表中删除最优属性
                for cur_attr_label in temp_attr_seg:
                    cur_mask = (self.data_x[:, self.seg_attr_index] == cur_attr_label)
                    self.next_nodes[cur_attr_label] = TreeNode(self.data_x[cur_mask],
                                                               self.data_y[cur_mask],
                                                               self.segmentation_attr,
                                                               attr_is_dispersed)
            else:  # 最优属性是连续值
                sort_index = np.argsort(self.data_x[:, temp_attr_index])
                # 不大于分割属性的子节点
                self.next_nodes[0] = TreeNode(
                    data_x[sort_index[:temp_continus_attr_seg_index + 1]],
                    data_y[sort_index[:temp_continus_attr_seg_index + 1]],
                    self.segmentation_attr,
                    attr_is_dispersed)  # left
                # 大于分割属性的子节点
                self.next_nodes[1] = TreeNode(
                    data_x[sort_index[temp_continus_attr_seg_index + 1:]],
                    data_y[sort_index[temp_continus_attr_seg_index + 1:]],
                    self.segmentation_attr,
                    attr_is_dispersed)  # right

In [158]:
class DecisionTree(object):
    def __init__(self, train_x, train_y, attributes_classs=None):
        self.train_x = train_x
        self.train_y = train_y

        # 确定属性是连续/离散的,当属性中唯一值数量多于N/2时，认定为连续值
        n_samples, n_features = self.train_x.shape
        if attributes_classs is None:
            attributes_classs = [0] * n_samples
            for i in range(n_features):
                uniques_i = np.unique(self.train_x[:, i])
                if uniques_i * 3 > n_samples:
                    attributes_classs[i] = 0
        self.attributes_classs = attributes_classs
        # 对离散属性、label做encoder处理
        self.xLabelEncoders = []
        for i in range(len(self.attributes_classs)):
            if self.attributes_classs[i] == 1:  # 离散属性
                cur_encoder = LabelEncoder()
                cur_encoder.fit(self.train_x[:, i])
                self.train_x[:, i] = cur_encoder.transform(self.train_x[:, i])
                self.xLabelEncoders.append(cur_encoder)
            else:
                self.xLabelEncoders.append(None)
        self.yLabelEncoders = LabelEncoder()
        self.yLabelEncoders.fit(train_y)
        self.train_y = self.yLabelEncoders.transform(self.train_y)
        self.root = None  # 根节点

    def train(self):
        self.root = TreeNode(self.train_x, self.train_y, range(len(self.attributes_classs)),
                             self.attributes_classs)

    def fit(self):
        self.train()

    def predict(self, test_x):
        # 从根节点开始遍历树形结构，
        if self.root is None:
            raise RuntimeError("value is None, error")
        # 首先将测试数据集的离散属性LAbelencoder,然后进行预测
        for (i, x_label_encoder) in enumerate(self.xLabelEncoders):
            if x_label_encoder is not None:
                test_x[:, i] = x_label_encoder.transform(test_x[:, i])
        n_samples, _ = test_x.shape
        pre = np.zeros(n_samples)
        for i in xrange(n_samples):
            cur_node = self.root
            while cur_node.is_leaf is False:
                cur_attr_index = cur_node.seg_attr_index  # 分割属性下标
                if self.attributes_classs[cur_attr_index] == 1:  # 离散属性
                    if test_x[i][cur_attr_index] in cur_node.next_nodes.keys():
                        cur_node = cur_node.next_nodes[test_x[i][cur_attr_index]]
                        break
                    else:
                        raise Exception('the attr is not in Decision Tree')
                else:  # 连续属性
                    if test_x[i][cur_attr_index] <= cur_node.seg_attr_value:
                        cur_node = cur_node[0]
                    else:
                        cur_node = cur_node[1]
            # 找到了叶子节点，可以展开预测动作了，基本的投票原则
            pre[i] = np.argmax(np.bincount(cur_node.data_y))
        pre = pre.astype(np.int)
        return self.yLabelEncoders.inverse_transform(pre)  # 反向装换

### 遍历决策树
- 作为绘制决策树的基础

In [188]:
# 决策树的遍历，以字典的形式返回
def list_dt(root, attr_is_dispersed):
    if root.is_leaf is True:
        return np.argmax(np.bincount(root.data_y)) # 返回预测标签
    else:
        temp_dic = {}
        if attr_is_dispersed[root.seg_attr_index] == 1:
            for next_node in root.next_nodes.keys():
                temp_dic[str(root.seg_attr_index)+' is '+str(next_node)] = list_dt(root.next_nodes[next_node],
                                                                                 attr_is_dispersed)
        else:
            temp_dic[str(root.seg_attr_index)+' <= ' + str(root.seg_attr_value)] = list_dt(root.next_nodes[0],
                                                                                         attr_is_dispersed)
            temp_dic[str(root.seg_attr_index)+' > ' + str(root.seg_attr_value)] = list_dt(root.next_nodes[1],
                                                                                        attr_is_dispersed)
        return temp_dic

## 训练决策树

### 加载数据集 

In [172]:
def load_data(path = '../data/heart.csv'):
    weather = pd.read_csv(path,sep=',')
    data = weather.values
    data_x = data[:,:-1]
    data_y = data[:, -1]
    return data_x, data_y

In [142]:
data_x, data_y = load_data()

In [173]:
features_mask = [1, 5, 8, 12]

In [174]:
data_x = data_x[:,features_mask]

In [175]:
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.2)

In [159]:
attributes_classs = np.array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1])

In [176]:
dt = DecisionTree(train_x, train_y, attributes_classs[features_mask])

In [177]:
dt.train()

In [178]:
pre_y =  dt.predict(test_x)

In [179]:
print '测试集正确率：%f' % (np.sum(pre_y == test_y) * 1.0 / test_y.shape[0])

测试集正确率：0.754098


In [180]:
pre_train_y = dt.predict(train_x)

In [181]:
print '训练集正确率：%f' % (np.sum(pre_train_y == train_y) * 1.0 / train_y.shape[0])

训练集正确率：0.768595


In [168]:
from sklearn.tree import DecisionTreeClassifier

In [169]:
sk_dt = DecisionTreeClassifier(criterion='entropy')

In [182]:
sk_dt.fit(train_x, train_y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [183]:
sk_pre = sk_dt.predict(test_x)

In [184]:
print np.sum(sk_pre == test_y)*1.0 / test_x.shape[0]

0.72131147541


In [185]:
sk_train_pre = sk_dt.predict(train_x)

In [186]:
print np.sum(sk_train_pre == train_y)*1.0 / train_x.shape[0]

0.789256198347


In [190]:
import graphviz

ImportError: No module named graphviz