In [1]:
class Node(object):
    """决策树节点"""
    
    def __init__(self, idx):
        self.idx = idx               # 节点编号，决策树中每个节点的编号是唯一的
        self.level = 0               # 节点层级
        self.father = None           # 父节点
        self.father_feature = ''     # 父节点分割feature
        self.father_value = ''       # 父节点分割value
        self.feature = ''    # 分隔节点的feature
        self.values = []     # 长度应与self.children一致
        self.children = []   # 子节点，成员为 pointer to Node
        
        self.is_leaf = False          # 是否为叶子节点
        self.predict_y_prob = None    # y概率
        self.predict_y = ''           # y预测值
        self.entrophy = None          # 节点上的熵
        self.n = None                 # 该节点上的样本数
        
    def __repr__(self):
        if self.is_leaf:
            return '[Leaf Node {}] y_hat={}'.format(self.idx, self.predict_y)
        else:
            return "[Node {}] {} in {}".format(self.idx, self.feature, self.values)
    
    def to_leaf(self):
        """决策树剪枝，将中间节点变成叶子节点"""
        self.is_leaf = True

In [32]:
class DescitionTree(object):
    """决策树"""
    
    def __init__(self):
        self.root = Node(0)
        self.data = None
        self.rule = 'C4.5'
        self.alpha = 0   
        
        # 构造决策树时的临时变量
        self._idx = 0
    
    # 打印 ------------------------------------------------------------
    def plot(self):
        """打印决策树：前序遍历 + 层级缩进"""
        
        n_space = 15
        
        for item in self.preorder():
            indent = item.level * n_space * " "
            if item.level == 0:
                txt = "[Root]"
            else:
                txt = ">>> {}={}".format(item.father_feature, item.father_value)
            txt = txt.ljust(n_space, '-')
            print(indent + txt + item.__repr__())
            
    def preorder(self):
        return self._preorder(self.root)
        
    def _preorder(self, node):
        yield node
        for node_ in node.children:
            yield from self._preorder(node_)
    
    # 预测 ------------------------------------------------------------
    def predict(self, s):
        """Prediction. Input should be pd.Series"""
        return self._predict(s, self.root)
    
    def _predict(self, s, node):
        
        if node.is_leaf:
            return node.predict_y
        else:
            v = s[node.feature]
            try:
                i = next(i for i, value in enumerate(node.values) if v == value)
            except StopIteration:
                print('prediction value not in train set: {} = {}'.format(node.feature, v))
                print('will stop at: ', node)
                return None   # todo: inner node's y
            subnode = node.children[i]
            return self._predict(s, subnode)
        
    # 剪枝 ------------------------------------------------------------
    @property
    def loss(self):
        """损失函数：sum(各叶子节点上的经验熵, weight=样本数) + alpah * 叶子节点个数"""
        lst = [node.entrophy * node.n for node in self.preorder() if node.is_leaf]
        return sum(lst) + self.alpha * len(lst)
    
    def post_pruning(self):
        """后剪枝
        
        李航书中没有说明后剪枝的顺序；
        西瓜书也没有明确说明后剪枝的顺序，但举的例子是从深度最大的节点开始；另，用的是验证集。
        
        我的具体实现如下：
        维护一个`leaf_tree`队列，初始队列由leaf-tree节点组成。这是一个带优先级的队列，优先级为节点深度
        逐次从队列中pop节点，判断是否剪枝。if 剪枝，将此节点的父节点添加进队尾
        直到队列为空为止
        """
        pass
        
    # 训练模型 ------------------------------------------------------------        
    def fit(self, df, rule='C4.5'):
        """构造决策树"""
        
        assert rule in ('ID3', 'C4.5')
        self.rule = rule
        self.data = df
        
        self._construct_subtree(df, self.root)
        
    def _construct_subtree(self, df, node):
        """递归构造决策树子树"""
        
        # print("Calling _construct_subtree... node={}".format(node))
        
        feature = self._select_feature(df)
        # print("Selected feature: {}".format(feature))
        
        node.feature = feature
        node.values = df[feature].unique().tolist()
        
        # TODO: stop iteration
        for v in node.values:
            # print("--> feature value is {}".format(v))
            df_v = df[df[feature] == v]
            del df_v[feature]
            
            self._idx += 1
            subnode = Node(self._idx)
            subnode.level = node.level + 1
            subnode.father = node
            subnode.father_feature = feature
            subnode.father_value = v
            node.children.append(subnode)
            
            y = df_v['y']
            subnode.predict_y_prob = y.value_counts() / len(y)
            subnode.predict_y = subnode.predict_y_prob.idxmax()
            subnode.entrophy = DescitionTree.em_h(y)
            subnode.n = len(y)
            
            if df_v.columns.__len__() == 1:
                # print('>>>>> 没有特征了，到达叶子节点')
                subnode.is_leaf = True
                continue
            else:
                self._construct_subtree(df_v, subnode)
        # print(">>>> 子树构造完毕，向上回溯")
            
    def _select_feature(self, df):
        """选择划分特征"""
        
        cond_entrophy_s = df.drop('y', axis=1).apply(DescitionTree.em_cond_h, y=df.y)  # H(y|A)
        feature_entrophy_s = df.drop('y', axis=1).apply(DescitionTree.em_h)   # H(A)
        HD = DescitionTree.em_h(df.y)     # H(y), scalar
        info_gain_s = HD - cond_entrophy_s
        info_gain_ratio_s = (cond_entrophy_s / feature_entrophy_s).fillna(0.0)
        
        if self.rule == 'ID3':
            return info_gain_s.idxmax()
        elif self.rule == 'C4.5':
            return info_gain_ratio_s.idxmax()
        else:
            pass
        
    @staticmethod
    def em_cond_h(x, y):
        """经验条件熵 empirical coditional entrophy, H(Y|X=xi)"""
        _df = pd.DataFrame({'x': x, 'y': y})
        condH_s = _df.groupby('x')['y'].apply(lambda s: DescitionTree.em_h(s)) # H(D_i)
        probX_s = _df.x.value_counts() / len(_df.x)   # probs or weights
        return sum(probX_s * condH_s)     
    
    @staticmethod
    def em_h(y):
        """经验熵 empirical entrophy, H(Y)"""
        y = np.asarray(y)
        probs = np.unique(y, return_counts=True)[1] / len(y)
        return DescitionTree.h(probs)
    
    @staticmethod
    def h(probs):
        """熵"""
        probs = np.asarray(probs)
        assert round(np.sum(probs), 2) == round(1.00, 2)
        assert all(probs >= 0)
        return -np.sum(probs * np.log2(probs))

In [33]:
import pandas as pd
import numpy as np

In [34]:
df = pd.DataFrame({"A": list('aaaabbbc'), 'B': list('eeeeffff'),'C':list('ggghhhii'), 'y': [1,2,3,3,6,6,6,6]})

In [35]:
tree = DescitionTree()

In [36]:
tree.fit(df)

In [37]:
tree.plot()

[Root]---------[Node 0] B in ['e', 'f']
               >>> B=e--------[Node 1] C in ['g', 'h']
                              >>> C=g--------[Node 2] A in ['a']
                                             >>> A=a--------[Leaf Node 3] y_hat=3
                              >>> C=h--------[Node 4] A in ['a']
                                             >>> A=a--------[Leaf Node 5] y_hat=3
               >>> B=f--------[Node 6] A in ['b', 'c']
                              >>> A=b--------[Node 7] C in ['h', 'i']
                                             >>> C=h--------[Leaf Node 8] y_hat=6
                                             >>> C=i--------[Leaf Node 9] y_hat=6
                              >>> A=c--------[Node 10] C in ['i']
                                             >>> C=i--------[Leaf Node 11] y_hat=6


In [38]:
y = tree.predict(pd.Series({'A':'b', 'B':'f', 'C':'h'}))
y

6

In [40]:
tree.loss

4.754887502163468

## iris example

In [41]:
from sklearn.datasets import load_iris

In [42]:
iris = load_iris()

In [43]:
features = ['sLength', 'sWidth', 'pLength', 'pWidth']
df = pd.DataFrame(iris.data, columns=features)
df = df.apply(lambda x: x.astype(int))  # to discrete var
df['y'] = iris.target

df.head()

Unnamed: 0,sLength,sWidth,pLength,pWidth,y
0,5,3,1,0,0
1,4,3,1,0,0
2,4,3,1,0,0
3,4,3,1,0,0
4,5,3,1,0,0


In [44]:
m = DescitionTree()

In [45]:
m.fit(df)

In [46]:
m.plot()

[Root]---------[Node 0] sWidth in [3, 2, 4]
               >>> sWidth=3---[Node 1] sLength in [5, 4, 7, 6]
                              >>> sLength=5--[Node 2] pWidth in [0, 1]
                                             >>> pWidth=0---[Node 3] pLength in [1]
                                                            >>> pLength=1--[Leaf Node 4] y_hat=0
                                             >>> pWidth=1---[Node 5] pLength in [4, 5]
                                                            >>> pLength=4--[Leaf Node 6] y_hat=1
                                                            >>> pLength=5--[Leaf Node 7] y_hat=2
                              >>> sLength=4--[Node 8] pLength in [1]
                                             >>> pLength=1--[Node 9] pWidth in [0]
                                                            >>> pWidth=0---[Leaf Node 10] y_hat=0
                              >>> sLength=7--[Node 11] pWidth in [1, 2]
                                      

In [47]:
m.predict(df.loc[42])

0

In [48]:
df['y_hat'] = df.apply(m.predict, axis=1)

In [49]:
df[df.y != df.y_hat]

Unnamed: 0,sLength,sWidth,pLength,pWidth,y,y_hat
77,6,3,5,1,1,2
83,6,2,5,1,1,2
123,6,2,4,1,2,1
126,6,2,4,1,2,1
127,6,3,4,1,2,1
138,6,3,4,1,2,1


In [50]:
m.loss

22.820326594927444

In [492]:
b = 99
a = [1,2,3, b]

In [494]:
del b

In [495]:
a

[1, 2, 3, 99]