# 决策树
- 首先实现用于分类任务的决策树，需要对数据进行预处理：
    1. 对label进行编码，由sklearn.preprocessing.LabelEncoder实现，在预测时将结果反编码
    2. 对离散属性编码，同样由LabelEncoder实现
    3. 对连续属性进行分段处理转化为category类型，然后编码
    4. 使用TreeNode节点存储树结构，每个节点的子节点保存在字典next_nodes中
    5. 通过信息熵计算信息增益

In [2]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [None]:
class TreeNode(object):
    """One node of an ID3-style classification tree.

    Constructing a node recursively builds the whole subtree below it.

    Attributes available on every node:
        data_x, data_y: the samples (and their labels) that reached this node.
        segmentation_attr: attribute indices still usable for splitting.
        next_nodes: dict of child nodes. For a discrete split the keys are
            the attribute values; for a continuous split key 0 is the child
            with ``value <= seg_value`` and key 1 is the other child.
        is_leaf: True when the node has no children.
        label: most frequent label in ``data_y`` (None if there is no data).
        seg_attr_index: index of the attribute used to split, -1 on a leaf.
        seg_value: threshold of a continuous split, otherwise None.
    """

    # A node holding this many samples or fewer is never split further.
    min_samples_split = 10

    def get_Information_entropy(self, labels_list, n_samples):
        """Return the Shannon entropy (in bits) of a label collection.

        :labels_list: ndarray, the labels whose entropy is computed
        :n_samples: int, number of samples in ``labels_list``
        """
        _, label_counts = np.unique(labels_list, return_counts=True)
        p = label_counts * 1.0 / n_samples
        return -np.sum(p * np.log2(p))

    def __init__(self, data_x, data_y, segmentation_attr, attr_is_dispersed):
        '''
        :data_x: ndarray of shape (n_samples, n_features), encoded features
        :data_y: ndarray of shape (n_samples,), encoded labels
        :segmentation_attr: iterable of attribute indices that may be used to
            split this node, e.g. [1, 5, 7]
        :attr_is_dispersed: sequence indexed by attribute; 1 marks a discrete
            attribute, anything else a continuous one. E.g. [1, 0, 1, 0, 1]
            means attributes 0, 2 and 4 are discrete.
        '''
        self.data_x = data_x
        self.data_y = data_y
        # list() copies the input and also accepts range objects, which have
        # no .copy() method.
        self.segmentation_attr = list(segmentation_attr)
        self.next_nodes = {}
        self.is_leaf = True
        self.seg_attr_index = -1
        self.seg_value = None

        n_samples = data_x.shape[0]
        uniques_y, y_counts = np.unique(self.data_y, return_counts=True)
        self.label = uniques_y[np.argmax(y_counts)] if len(uniques_y) else None

        # Leaf: too few samples, nothing left to split on, or a pure node.
        if (n_samples <= self.min_samples_split
                or not self.segmentation_attr
                or len(uniques_y) == 1):
            return

        # Pick the attribute with the largest information gain (ID3).
        base_entropy = self.get_Information_entropy(self.data_y, n_samples)
        best_gain = -np.inf
        best_attr = -1
        best_seg = None
        for attr_index in self.segmentation_attr:
            if attr_is_dispersed[attr_index] == 1:
                candidate = self._discrete_split(attr_index)
            else:
                candidate = self._continuous_split(attr_index)
            if candidate is None:  # attribute is constant on this node
                continue
            cond_entropy, seg = candidate
            gain = base_entropy - cond_entropy
            if gain > best_gain:  # strict: the first attribute wins ties
                best_gain, best_attr, best_seg = gain, attr_index, seg

        if best_seg is None:
            # No attribute can separate the samples; the node stays a leaf.
            return

        self.is_leaf = False
        self.seg_attr_index = best_attr
        column = self.data_x[:, best_attr]
        if attr_is_dispersed[best_attr] == 1:
            # A discrete attribute is fully consumed by the split.
            self.segmentation_attr.remove(best_attr)
            for value in best_seg:
                mask = column == value
                self.next_nodes[value] = TreeNode(self.data_x[mask],
                                                  self.data_y[mask],
                                                  self.segmentation_attr,
                                                  attr_is_dispersed)
        else:
            # A continuous attribute stays available for deeper splits.
            self.seg_value = best_seg
            left = column.astype(float) <= best_seg
            self.next_nodes[0] = TreeNode(self.data_x[left],
                                          self.data_y[left],
                                          self.segmentation_attr,
                                          attr_is_dispersed)
            self.next_nodes[1] = TreeNode(self.data_x[~left],
                                          self.data_y[~left],
                                          self.segmentation_attr,
                                          attr_is_dispersed)

    def _discrete_split(self, attr_index):
        """Return (conditional entropy, unique values) for a discrete
        attribute, or None when it takes a single value on this node."""
        column = self.data_x[:, attr_index]
        n_samples = len(column)
        values, counts = np.unique(column, return_counts=True)
        if len(values) < 2:
            return None
        cond_entropy = 0.0
        for value, count in zip(values, counts):
            subset = self.data_y[column == value]
            # Each branch is weighted by its share of the samples.
            cond_entropy += (count * 1.0 / n_samples
                             * self.get_Information_entropy(subset, count))
        return cond_entropy, values

    def _continuous_split(self, attr_index):
        """Return (conditional entropy, threshold) of the best binary split
        of a continuous attribute, or None when all its values are equal."""
        column = self.data_x[:, attr_index].astype(float)
        n_samples = len(column)
        order = np.argsort(column)
        sorted_x = column[order]
        sorted_y = self.data_y[order]

        best_cond = np.inf
        best_pos = -1
        for i in range(n_samples - 1):
            # No threshold can separate two equal values.
            if sorted_x[i] == sorted_x[i + 1]:
                continue
            left = self.get_Information_entropy(sorted_y[:i + 1], i + 1)
            right = self.get_Information_entropy(sorted_y[i + 1:],
                                                 n_samples - i - 1)
            cond = ((i + 1.0) * left
                    + (n_samples - i - 1.0) * right) / n_samples
            if cond < best_cond:
                best_cond = cond
                best_pos = i
        if best_pos < 0:
            return None

        lower, upper = sorted_x[best_pos], sorted_x[best_pos + 1]
        threshold = (lower + upper) / 2.0
        # For adjacent floats the midpoint can round up to `upper`; fall back
        # to `lower` so that `value <= threshold` still separates the two.
        if not threshold < upper:
            threshold = lower
        return best_cond, threshold

In [None]:
class DecisionTree(object):
    """Decision-tree classifier built from ``TreeNode`` objects.

    Attribute types follow the convention 1 = discrete, 0 = continuous.
    Discrete attributes and the labels are encoded with ``LabelEncoder``
    before training; predictions are decoded back to the original labels.
    """

    def __init__(self, train_x, train_y, attributes_classs=None,
                 criterion='entropy'):
        """
        :train_x: array-like of shape (n_samples, n_features)
        :train_y: array-like of shape (n_samples,), the class labels
        :attributes_classs: optional sequence with one entry per feature,
            1 for a discrete attribute and 0 for a continuous one. When
            omitted, an attribute is treated as continuous if it has more
            than n_samples / 3 distinct values.
        :criterion: stored on the instance; this class itself never reads
            it when building or querying the tree.
        """
        raw_x = np.asarray(train_x)
        n_samples, n_features = raw_x.shape
        if attributes_classs is None:
            attributes_classs = [1] * n_features
            for i in range(n_features):
                if len(np.unique(raw_x[:, i])) * 3 > n_samples:
                    attributes_classs[i] = 0
        self.attributes_classs = attributes_classs
        self.criterion = criterion

        # Encode into a fresh float matrix so the caller's array is left
        # untouched and every column is numeric.
        encoded_x = np.empty(raw_x.shape, dtype=float)
        self.xLabelEncoders = []
        for i in range(n_features):
            if self.attributes_classs[i] == 1:  # discrete attribute
                cur_encoder = LabelEncoder()
                encoded_x[:, i] = cur_encoder.fit_transform(raw_x[:, i])
                self.xLabelEncoders.append(cur_encoder)
            else:  # continuous attribute: must be numeric
                encoded_x[:, i] = raw_x[:, i].astype(float)
                self.xLabelEncoders.append(None)
        self.train_x = encoded_x

        self.yLabelEncoders = LabelEncoder()
        self.train_y = self.yLabelEncoders.fit_transform(train_y)
        self.root = None  # root node, created by train()

    def train(self):
        """Build the tree from the encoded training data."""
        n_features = self.train_x.shape[1]
        self.root = TreeNode(self.train_x, self.train_y,
                             list(range(n_features)),
                             self.attributes_classs)

    def fit(self):
        """Alias of train()."""
        self.train()

    @staticmethod
    def _majority_label(node):
        """Return the most frequent encoded label among a node's samples."""
        labels, counts = np.unique(node.data_y, return_counts=True)
        return labels[np.argmax(counts)]

    @staticmethod
    def _split_threshold(node):
        """Return the threshold of a continuous split, derived from the
        samples held by the two children (key 0 = lower, key 1 = upper)."""
        attr = node.seg_attr_index
        lower = node.next_nodes[0].data_x[:, attr].astype(float).max()
        upper = node.next_nodes[1].data_x[:, attr].astype(float).min()
        threshold = (lower + upper) / 2.0
        # Guard against the midpoint rounding up to `upper`.
        if not threshold < upper:
            threshold = lower
        return threshold

    def predict(self, test_x):
        """Predict the original (decoded) label of every row of ``test_x``.

        :test_x: array-like of shape (n_samples, n_features) holding raw,
            un-encoded feature values
        :return: ndarray of shape (n_samples,) with the predicted labels
        :raises ValueError: if the tree has not been trained yet

        A discrete value that has no matching branch (for instance one never
        seen during training) stops the descent; the majority label of the
        node reached so far is predicted.
        """
        if self.root is None:
            raise ValueError("the tree has not been trained; call fit() first")
        test_x = np.asarray(test_x)
        n_samples, _ = test_x.shape

        # Raw discrete value -> integer code, one lookup table per attribute.
        lookups = [
            None if encoder is None else
            {label: code for code, label in enumerate(encoder.classes_)}
            for encoder in self.xLabelEncoders
        ]
        thresholds = {}   # id(node) -> threshold, computed once per node
        node_labels = {}  # id(node) -> majority label, computed once per node

        pre = np.zeros(n_samples, dtype=int)
        for i in range(n_samples):
            cur_node = self.root
            while not cur_node.is_leaf:
                attr = cur_node.seg_attr_index
                value = test_x[i][attr]
                if self.attributes_classs[attr] == 1:  # discrete attribute
                    child = cur_node.next_nodes.get(lookups[attr].get(value))
                else:  # continuous attribute
                    key = id(cur_node)
                    if key not in thresholds:
                        thresholds[key] = self._split_threshold(cur_node)
                    side = 0 if float(value) <= thresholds[key] else 1
                    child = cur_node.next_nodes.get(side)
                if child is None:
                    break
                cur_node = child
            key = id(cur_node)
            if key not in node_labels:
                node_labels[key] = self._majority_label(cur_node)
            pre[i] = node_labels[key]
        return self.yLabelEncoders.inverse_transform(pre)