# CART回归树
- 树型回归是一种区域回归（相对于线性回归）
- cart算法被用来生成回归树时采用平方误差最小作为属性划分依据
- 算法停止条件是训练数据小于某种程度、平方误差小于某种程度、属性数据完全一致等情况

In [1]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
class TreeNode(object):
    # 根据标签序列计算gini指数
    def get_square_error(self, labels_list):
        """
        :labels_list, ndarray, 标签列表、数组
        :n_samples, int, 总的类别数
        计算信息熵
        """
        labels_mean = np.mean(labels_list)
        return np.sum(np.square(labels_list - labels_mean))

    def __init__(self, data_x, data_y, attr_is_dispersed, square_error_threshold):
        '''
        :data_x: ndarray, 标签化的数据集x
        :data_y: ndarray, 标签化的数据集y
        :attr_is_dispersed, ndarray, 属性是否为离散的，例如[1,0,1,0,1],则表示下标0、2、4的属性是离散的，
        '''
        self.data_x = data_x
        self.data_y = data_y
        self.next_nodes = {}  # 存储子节点
        self.is_leaf = None
        self.seg_attr_index = -1  # 在该节点选择的分割属性
        self.seg_attr_value = -1  # 分割属性值，返回属性分割值

        n_samples, n_features = data_x.shape
        # 当前节点数据数不大于1、数据标签y全部一致的情况下，认定为子节点
        square_error = self.get_square_error(self.data_y)
        if n_samples <= 1 or square_error < square_error_threshold:
            self.is_leaf = True
        else:  # 非叶子节点
            #             gain = self.get_Information_entropy(self.data_y, n_samples)  # 熵
            self.is_leaf = False
            # 根据可用分割属性segmentation_attr，以及属性离散/连续记录表attr_is_dispersed来确定最优
            temp_se = np.inf  # 保存最小的平方误差
            temp_attr_index = -1  # 保存按哪个属性分割可以得到最小的平方误差

            # 返回包含分割点的属性值
            temp_attr_seg = None
            # 如果优属性是连续的，则返回升序属性序列的分割点下标，方便后续计算
            temp_continus_attr_seg_index = -1
            for attr_index in range(len(attr_is_dispersed)):
                if attr_is_dispersed[attr_index] == 1:  # 离散值
                    uniques_attr, uniques_attr_counts = np.unique(
                        self.data_x[:, attr_index], return_counts=True)
                    if len(uniques_attr) == 1:
                        continue
                    for cur_attr_label in uniques_attr:
                        cur_mask = (self.data_x[:, attr_index] == cur_attr_label)
                        cur_not_mask = (self.data_x[:, attr_index] != cur_attr_label)
                        cur_se = self.get_square_error(self.data_y[cur_mask]) + \
                                   self.get_square_error(self.data_y[cur_not_mask])
                        if temp_se > cur_se:
                            temp_se = cur_se
                            temp_attr_index = attr_index
                            temp_attr_seg = np.array([cur_attr_label])

                else:  # 连续值,需要寻找一个最优的二分点来分割数据，需要进行n_samples-1次尝试
                    sort_index = np.argsort(self.data_x[:, attr_index])

                    for i in range(n_samples - 1):
                        temp_continus_seg_value = (data_x[sort_index[i], attr_index] + \
                                                   data_x[sort_index[i + 1], attr_index]) / 2.0
                        cur_se = self.get_square_error(data_y[sort_index[:i + 1]]) + \
                                   self.get_square_error(data_y[sort_index[i + 1:]])
                        if temp_se > cur_se:
                            temp_se = cur_se
                            temp_attr_index = attr_index
                            temp_attr_seg = np.array([temp_continus_seg_value])
                            temp_continus_attr_seg_index = i
            if temp_se == np.inf:
                self.is_leaf = True
                return
            # 利用最优属性进行划分，并创造该节点的子节点，保存在self.next_nodes结构中
            self.seg_attr_index = temp_attr_index  # 该节点的分割属性（轴）的下标
            self.seg_attr_value = temp_attr_seg[0]

            # 最优属性是离散值
            if attr_is_dispersed[self.seg_attr_index] == 1:
                cur_mask = (self.data_x[:, self.seg_attr_index] == self.seg_attr_value)
                cur_not_mask = (self.data_x[:, self.seg_attr_index] != self.seg_attr_value)
                self.next_nodes[0] = TreeNode(self.data_x[cur_mask],
                                              self.data_y[cur_mask],
                                              attr_is_dispersed,
                                             square_error_threshold)

                self.next_nodes[1] = TreeNode(self.data_x[cur_not_mask],
                                              self.data_y[cur_not_mask],
                                              attr_is_dispersed,
                                             square_error_threshold)
            else:  # 最优属性是连续值
                sort_index = np.argsort(self.data_x[:, temp_attr_index])
                # 不大于分割属性的子节点
                self.next_nodes[0] = TreeNode(
                    data_x[sort_index[:temp_continus_attr_seg_index + 1]],
                    data_y[sort_index[:temp_continus_attr_seg_index + 1]],
                    attr_is_dispersed,
                    square_error_threshold)  # left
                # 大于分割属性的子节点
                self.next_nodes[1] = TreeNode(
                    data_x[sort_index[temp_continus_attr_seg_index + 1:]],
                    data_y[sort_index[temp_continus_attr_seg_index + 1:]],
                    attr_is_dispersed,
                    square_error_threshold)  # right

In [25]:
class DecisionTree(object):
    def __init__(self, train_x, train_y, attributes_classs=None):
        self.train_x = train_x
        self.train_y = train_y

        # 确定属性是连续/离散的,当属性中唯一值数量多于N/2时，认定为连续值
        n_samples, n_features = self.train_x.shape
        if attributes_classs is None:
            attributes_classs = [0] * n_samples
            for i in range(n_features):
                uniques_i = np.unique(self.train_x[:, i])
                if uniques_i * 3 > n_samples:
                    attributes_classs[i] = 0
        self.attributes_classs = attributes_classs
        # 对离散属性、label做encoder处理
        self.xLabelEncoders = []
        for i in range(len(self.attributes_classs)):
            if self.attributes_classs[i] == 1:  # 离散属性
                cur_encoder = LabelEncoder()
                cur_encoder.fit(self.train_x[:, i])
                self.train_x[:, i] = cur_encoder.transform(self.train_x[:, i])
                self.xLabelEncoders.append(cur_encoder)
            else:
                self.xLabelEncoders.append(None)
        self.root = None  # 根节点

    def train(self, square_error_threshold=0.8):
        self.root = TreeNode(self.train_x, self.train_y, self.attributes_classs, 
                             square_error_threshold)

    def fit(self):
        self.train()

    def predict(self, test_x):
        # 从根节点开始遍历树形结构，
        if self.root is None:
            raise RuntimeError("value is None, error")
        # 首先将测试数据集的离散属性LAbelencoder,然后进行预测
        for (i, x_label_encoder) in enumerate(self.xLabelEncoders):
            if x_label_encoder is not None:
                test_x[:, i] = x_label_encoder.transform(test_x[:, i],)
        n_samples, _ = test_x.shape
        pre = np.zeros(n_samples)
        for i in xrange(n_samples):
            cur_node = self.root
            while cur_node.is_leaf is False:
                cur_attr_index = cur_node.seg_attr_index  # 分割属性下标
                if self.attributes_classs[cur_attr_index] == 1:  # 离散属性
                    if test_x[i][cur_attr_index] == cur_node.seg_attr_value:
                        cur_node = cur_node.next_nodes[0]
                    else:
                        cur_node = cur_node.next_nodes[1]
                else:  # 连续属性
                    if test_x[i][cur_attr_index] <= cur_node.seg_attr_value:
                        cur_node = cur_node.next_nodes[0]
                    else:
                        cur_node = cur_node.next_nodes[1]
            # 找到了叶子节点，可以展开预测动作了，取平均值
            pre[i] = np.mean(cur_node.data_y)
        return pre

In [11]:
def load_data(path='../data/StudentsPerformance.csv'):
    sp = pd.read_csv(path, sep=',')
    data = sp.values
    data_x = data[:,:-1]
    data_y = data[:,-1]
    return data_x, data_y

In [12]:
data_x, data_y = load_data()

In [47]:
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.2)

In [48]:
train_x[0]

array(['male', 'group C', 'some college', 'standard', 'none', 59, 41], dtype=object)

In [50]:
attributes_classs = [1, 1, 1, 1, 1, 0, 0]

In [51]:
dt = DecisionTree(train_x, train_y, attributes_classs)

In [52]:
dt.train(square_error_threshold=1)

In [53]:
pre_y = dt.predict(test_x)

In [54]:
print np.sqrt(np.mean(np.square(pre_y - test_y)))

5.21691958918


In [55]:
from sklearn.tree import DecisionTreeRegressor

In [56]:
dt_regre = DecisionTreeRegressor(criterion='mse')

In [57]:
dt_regre.fit(train_x, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [58]:
sklearn_pre_y = dt_regre.predict(test_x)

In [59]:
print np.sqrt(np.mean(np.square(sklearn_pre_y - test_y)))

5.58077055611
