## CART

Classification and Regression Tree, aka, C&RTree

- Classification: similar to ID3 / C4.5, but
    + continuous x
    + binary tree
    + gini index
- Regression Tree: minimize the squared error (least squares)


## 最小二乘回归树

Select feature $j$ and splitting point $s$ such that the summed squared error is minimized, i.e.

$$
j, s = \text{argmin}_{j,s} \Big[ \min_{c_1} \sum_{R_1: x_j \le s} (y_i - c_1)^2 + \min_{c_2} \sum_{R_2: x_j > s} (y_i - c_2)^2 \Big]
$$

==> 两个区域的（样本数加权的）方差之和最小



In [36]:
import pandas as pd
import numpy as np

    
class Node(object):
    """A single node of the regression tree.

    Attributes:
        idx: identifier given by the creator of the node.
        father: parent node, or ``None``.
        is_left: whether this node is the left (``True``) or the right
            (``False``) child of its parent.
        left, right: child nodes; both ``None`` on a leaf.
        feature, value: split stored on the node; an internal node is
            labelled ``feature <= value``.
        y_hat, n: values shown on a leaf label; start out as ``None``.
        path, reign: reserved bookkeeping fields. ``reign`` is intended to
            hold a tuple ``(feature, '<=' or '>', value)``.
    """

    def __init__(self, idx, father, is_left):
        # Identity and position in the tree.
        self.idx, self.father = idx, father
        self.is_left = is_left

        # Children and bookkeeping; a new node starts out as a leaf.
        self.left, self.right = None, None
        self.path = []  # list of nodes
        self.reign = None

        # Split description (meaningful on internal nodes).
        self.feature, self.value = '', None

        # Values displayed on leaves.
        self.y_hat, self.n = None, None

    def __repr__(self):
        template = '[Node {}] {}={}'
        return template.format(self.idx, self.feature, self.value)

    @property
    def is_leaf(self):
        """``True`` when the node has neither a left nor a right child."""
        return not (self.left or self.right)

    @property
    def label(self):
        """Display text: the split question, or the leaf's ``y_hat`` and ``n``."""
        if not self.is_leaf:
            return '{} <= {} ?'.format(self.feature, self.value)
        return 'y_hat = {}\nn = {}'.format(self.y_hat, self.n)
    
    
class CART(object):
    """Least-squares regression tree.

    The training data is a ``pandas.DataFrame`` whose target column is named
    ``'y'``; every other column is treated as a numeric feature.

    Each internal node stores a split ``feature <= value``. The split is the
    candidate that minimises the summed squared error of the two resulting
    groups. After a split the chosen feature is removed from the data passed
    to the children, so the depth of the tree is bounded by the number of
    features.

    Attributes:
        root: root ``Node`` of the tree.
        data: the DataFrame passed to the last ``fit`` call (``None`` before).
        splitnum: model parameter; number of quantiles of each feature that
            are tried as candidate split points.
    """

    def __init__(self):
        self.root = Node(0, None, None)
        self.data = None

        self.splitnum = 10  # model parameter: number of quantile cut points

        self._idx = 1  # next node id to hand out (0 is the root)

    # plot --------------------------------------------------------

    def plot(self, fname='default', fmt='png'):
        """Build a graphviz ``Digraph`` of the tree and return it.

        Nodes are visited in post-order, so every child has already been
        declared when the edge pointing to it is added. Leaves are drawn as
        grey boxes; internal nodes are ellipses filled with a colour that
        depends on the split feature.

        Args:
            fname: value assigned to the graph's ``filename``.
            fmt: value assigned to the graph's ``format``.

        Returns:
            The ``graphviz.Digraph`` object (nothing is rendered here).
        """

        import graphviz
        g = graphviz.Digraph()

        g.filename = fname
        g.format = fmt

        colors = ['#8dd3c7',
                  '#ffffb3',
                  '#bebada',
                  '#fb8072',
                  '#80b1d3',
                  '#fdb462',
                  '#b3de69',
                  '#fccde5',
                  '#d9d9d9',
                  '#bc80bd',
                  '#ccebc5',
                  '#ffed6f']
        # The palette is reused cyclically, so any number of columns is
        # supported (with more than 12 columns some colours repeat).
        columns = self.data.columns if self.data is not None else []
        feature_color_map = {fe: colors[i % len(colors)]
                             for i, fe in enumerate(columns)}

        for node in self.postorder():
            g.node(str(node.idx), label=node.label,
                   shape='box' if node.is_leaf else 'ellipse',
                   style='filled',
                   color=feature_color_map[node.feature] if node.feature else 'grey'
                   )

            if node.left:
                g.edge(str(node.idx), str(node.left.idx), label='Yes')
            if node.right:
                g.edge(str(node.idx), str(node.right.idx), label='No')
        return g

    def postorder(self):
        """Iterate over all nodes of the tree in post-order."""
        return self._postorder(self.root)

    def _postorder(self, node):
        """Recursive post-order traversal: left subtree, right subtree, node."""

        if node.left:
            yield from self._postorder(node.left)
        if node.right:
            yield from self._postorder(node.right)
        yield node

    # train --------------------------------------------------------

    def fit(self, df):
        """Grow the tree on ``df``.

        Args:
            df: DataFrame with a target column ``'y'`` and feature columns.
        """
        self.data = df
        # Start from a fresh tree so that fitting again neither keeps
        # children of a previous fit nor continues the old id numbering.
        self.root = Node(0, None, None)
        self._idx = 1
        self._construct_cart(df, self.root)

    def _construct_cart(self, df, node):
        """Recursively grow the subtree rooted at ``node`` from the rows in ``df``."""

        node.y_hat = np.mean(df.y.values)
        node.n = len(df)

        # Stop when only the 'y' column is left or at most one sample remains.
        if len(df.columns) == 1 or len(df) <= 1:
            return

        feature, value = self._select_feature_value(df)
        if feature is None:
            # No candidate separates the samples: keep the node as a leaf
            # instead of creating an empty child.
            return

        node.feature = feature
        node.value = value

        # TODO: design the `path` / `reign` bookkeeping of Node
        # (reign could become a function).

        node.left = Node(idx=self._idx, father=node, is_left=True)
        self._idx += 1

        node.right = Node(idx=self._idx, father=node, is_left=False)
        self._idx += 1

        self._construct_cart(df[df[feature] <= value].drop(feature, axis=1), node.left)
        self._construct_cart(df[df[feature] > value].drop(feature, axis=1), node.right)

    def _select_feature_value(self, df):
        """Choose the split feature and split point.

        For every feature column, ``self.splitnum`` quantiles
        (0, 1/splitnum, ..., (splitnum-1)/splitnum) are tried as split point
        ``s``; the score of a candidate is the sum of the SSE of the rows
        with ``x <= s`` and the SSE of the rows with ``x > s``.

        Features are scanned in column order and only a strictly smaller
        score replaces the current best, so ties are resolved the same way
        on every run. Candidates that leave one side empty are ignored.

        Args:
            df: DataFrame with a ``'y'`` column and at least one feature.

        Returns:
            ``(feature, value)`` of the best split, or ``(None, None)`` when
            no candidate puts at least one row on each side.

        Raises:
            TypeError: if ``df`` is not a ``pandas.DataFrame``.
        """

        if not isinstance(df, pd.DataFrame):
            raise TypeError('df must be a pandas.DataFrame')

        best_sse = None
        best_feature, best_value = None, None

        y = df['y']
        quantiles = [k / self.splitnum for k in range(self.splitnum)]

        for fe in df.columns:
            if fe == 'y':
                continue

            x = df[fe]
            # Equal quantile values describe the same split; evaluate it once.
            cut_points = x.quantile(quantiles).drop_duplicates()
            for s in cut_points:
                in_left = x <= s
                in_right = x > s
                if not in_left.any() or not in_right.any():
                    continue

                sse = CART.SSE(y[in_left]) + CART.SSE(y[in_right])

                if best_sse is None or sse < best_sse:
                    best_sse = sse
                    best_feature, best_value = fe, s

        return best_feature, best_value

    @staticmethod
    def SSE(y, y_hat=None):
        """Sum of squared errors of ``y`` around ``y_hat``.

        When ``y_hat`` is ``None`` the mean of ``y`` is used.
        """
        if y_hat is None:
            y_hat = np.mean(y)
        return np.sum((y - y_hat) ** 2)

In [37]:
from sklearn.datasets import load_iris

# Build the training frame: the four iris measurements as features and the
# class index as the target column 'y' expected by CART.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(y=iris.target)

# Fit the regression tree on the whole data set.
m = CART()
m.fit(df)

# Describe the fitted tree as a graphviz graph (file name and format are set here).
g = m.plot(fname='iris-CART-plot', fmt='pdf')

import graphviz

# Render the graph and open the result in the default viewer.
g.view()

'iris-CART-plot.pdf'

## TODO

- CART pruning