In [2]:
class Node:
    """A single node of the decision tree.

    Every node carries a unique index, its depth in the tree, a link to its
    parent together with the (feature, value) pair of the parent's split that
    leads to this node, and the description of its own split.
    """

    def __init__(self, idx):
        # Identity and position in the tree.
        self.idx = idx       # node number, unique within one tree
        self.level = 0       # depth of the node

        # How this node was reached from its parent.
        self.father = None
        self.father_feature = ''
        self.father_value = ''

        # This node's own split: `values` is meant to stay parallel to
        # `children` (one child Node per split value).
        self.feature = ''
        self.values = []
        self.children = []

    def __repr__(self):
        template = "<Node {}: {}>"
        return template.format(self.idx, self.feature)

In [121]:
class DescitionTree(object):
    """Decision tree over categorical features, grown by recursive splitting.

    Training data is a pandas DataFrame whose label column is named ``y``;
    every other column is treated as a categorical feature. Each internal
    node splits on one feature and gets one child per distinct value of that
    feature. Leaves are nodes without children; no prediction is stored on
    them.
    """

    def __init__(self):
        self.root = Node(0)
        # Next unused node index; 0 is taken by the root.
        self._next_idx = 1

    def plot(self):
        """Print the tree: pre-order traversal, indented by node level."""
        for item in self.preorder():
            # One indentation step per level ("\s" is not a Python escape and
            # would print a literal backslash followed by "s").
            indent = item.level * "    "
            txt = "{} == {}".format(item.father_feature, item.father_value).ljust(10, '-')
            print(indent, txt, repr(item))

    def preorder(self):
        """Return an iterator over all nodes in pre-order (parent first)."""
        return self._preorder(self.root)

    def _preorder(self, node):
        """Yield ``node`` and then, recursively, each of its subtrees."""
        yield node
        for child in node.children:
            yield from self._preorder(child)

    def fit(self, df):
        """Build the tree from ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the label column ``y``; all other columns are used
            as categorical features.

        Raises
        ------
        KeyError
            If ``df`` has no ``y`` column.
        """
        # Start from a fresh root so that calling fit() twice does not append
        # a second set of children to the old tree.
        self.root = Node(0)
        self._next_idx = 1
        self._construct_subtree(df, self.root)

    def _construct_subtree(self, df, node):
        """Recursively grow the subtree rooted at ``node`` from the rows in ``df``.

        Recursion stops (leaving ``node`` as a leaf) when no feature columns
        remain or when the labels in ``df`` are not mixed (at most one
        distinct value, which also covers an empty ``df``).
        """
        remaining_features = df.columns.drop('y')
        if len(remaining_features) == 0 or df['y'].nunique() <= 1:
            return

        feature = self._select_feature(df)
        node.feature = feature
        node.values = df[feature].unique().tolist()

        for v in node.values:
            # drop() returns a new frame, so the caller's data is never mutated.
            df_v = df[df[feature] == v].drop(feature, axis=1)

            subnode = Node(self._next_idx)
            self._next_idx += 1
            subnode.level = node.level + 1
            subnode.father = node
            subnode.father_feature = feature
            subnode.father_value = v

            node.children.append(subnode)
            self._construct_subtree(df_v, subnode)

    def _select_feature(self, df, rule='ID3'):
        """Pick the feature to split on.

        Parameters
        ----------
        df : pandas.DataFrame
            Rows of the current node; must contain ``y`` and at least one
            feature column.
        rule : {'ID3', 'C4.5'}
            'ID3' maximises the information gain ``H(y) - H(y|A)``;
            'C4.5' maximises the gain ratio ``(H(y) - H(y|A)) / H(A)``.

        Returns
        -------
        The column label of the selected feature.
        """
        assert rule in ('ID3', 'C4.5')

        features = df.drop('y', axis=1)
        cond_entropy = features.apply(DescitionTree.em_cond_h, y=df.y)  # H(y|A), per feature
        label_entropy = DescitionTree.em_h(df.y)                        # H(y), scalar
        info_gain = label_entropy - cond_entropy

        if rule == 'ID3':
            return info_gain.idxmax()

        # C4.5: normalise the gain by the entropy of the feature itself.
        # A single-valued feature has H(A) == 0 (and zero gain); mask the zero
        # so the division yields NaN instead of 0/0, then score it as 0.
        feature_entropy = features.apply(DescitionTree.em_h)            # H(A), per feature
        gain_ratio = info_gain / feature_entropy.where(feature_entropy > 0)
        return gain_ratio.fillna(0.0).idxmax()

    @staticmethod
    def em_cond_h(x, y):
        """Empirical conditional entropy H(Y|X).

        Computed as the sum over the distinct values ``xi`` of ``x`` of
        ``P(X=xi) * H(Y | X=xi)``. ``x`` and ``y`` are combined into one
        DataFrame, so they must be equally long (or share an index).
        """
        _df = pd.DataFrame({'x': x, 'y': y})
        by_x = _df.groupby('x')['y']
        entropy_per_value = by_x.apply(DescitionTree.em_h)  # H(Y | X=xi)
        weight_per_value = by_x.size() / len(_df)           # P(X=xi)
        # Both Series come from the same groupby, so their indexes line up.
        return (weight_per_value * entropy_per_value).sum()

    @staticmethod
    def em_h(y):
        """Empirical entropy H(Y) of the observed values ``y``, in bits.

        Returns 0.0 for empty input.
        """
        y = np.asarray(y)
        if y.size == 0:
            return 0.0
        counts = np.unique(y, return_counts=True)[1]
        return DescitionTree.h(counts / counts.sum())

    @staticmethod
    def h(probs):
        """Shannon entropy, in bits, of the distribution ``probs``.

        ``probs`` must be non-negative and sum to 1 (checked up to floating
        point tolerance, via ``assert``). Zero probabilities contribute
        nothing, following the convention ``0 * log2(0) == 0``.
        """
        probs = np.asarray(probs, dtype=float)
        # Exact equality with 1.0 fails for many valid vectors because of
        # rounding, so compare with a tolerance.
        assert np.isclose(np.sum(probs), 1.0) and np.all(probs >= 0)
        nonzero = probs[probs > 0]   # log2(0) is -inf and 0 * -inf is NaN
        # Adding 0.0 turns a possible -0.0 into 0.0.
        return -np.sum(nonzero * np.log2(nonzero)) + 0.0

In [133]:
# Toy dataset for exercising the entropy helpers: feature columns A and B, label column y.
df = pd.DataFrame({"A": list('aabbccdd'), 'B': list('eeeeffff'), 'y': [1,2,3,3,6,6,6,6]})

In [142]:
# Empirical conditional entropy H(y | feature) for every feature column.
# DataFrame.apply hands each column to em_cond_h as x, so the result is a
# Series indexed by feature name.
foo = df.drop('y', axis=1).apply(DescitionTree.em_cond_h, y=df.y)
foo

A    0.25
B    0.75
dtype: float64

'B'

In [144]:
# Scalar minus Series is applied element-wise and keeps the index.
# The 10.0 is presumably a stand-in for the scalar H(y) used in _select_feature.
10.0 - foo

A    9.75
B    9.25
dtype: float64

In [139]:
# Empirical entropy of each feature column itself, H_A(D);
# the result is a Series indexed by feature name.
df.drop('y', axis=1).apply(DescitionTree.em_h)

A    2.0
B    1.0
dtype: float64

In [135]:
# Sanity check: entropy in bits (h uses log base 2) of a four-outcome distribution.
DescitionTree.h([0.1, 0.3, 0.3, 0.3])

1.895461844238322

In [110]:
?pd.DataFrame.apply

Signature: pd.DataFrame.apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, args=(), **kwds)
Docstring:
Applies function along input axis of DataFrame.

Objects passed to functions are Series objects having index
either the DataFrame's index (axis=0) or the columns (axis=1).
Return type depends on whether passed function aggregates, or the
reduce argument if the DataFrame is empty.

Parameters
----------
func : function
    Function to apply to each column/row
axis : {0 or 'index', 1 or 'columns'}, default 0
    * 0 or 'index': apply function to each column
    * 1 or 'columns': apply function to eac

In [102]:
# errors='ignore' makes Index.drop skip labels that are absent instead of raising;
# the recorded output shows no 'C' column existed, so the index comes back unchanged.
df.columns.drop('C', errors='ignore')

Index(['A', 'B'], dtype='object')

In [104]:
# Small frame for trying per-group entropy: A is the grouping key, B holds 8 distinct values.
# NOTE(review): this rebinds `df`, replacing whatever frame an earlier cell stored under that name.
df = pd.DataFrame({"A": list('aaabbbcc'), 'B': range(8)})
df

Unnamed: 0,A,B
0,a,0
1,a,1
2,a,2
3,b,3
4,b,4
5,b,5
6,c,6
7,c,7


In [95]:
# Entropy of B within each group of A: the same groupby/apply pattern em_cond_h uses.
df.groupby('A')['B'].apply(lambda s: DescitionTree.em_h(s))

A
a    1.584963
b    1.584963
c    1.000000
Name: B, dtype: float64

In [81]:
# Relative frequency of each distinct value: the expression em_cond_h uses for its weights.
# NOTE(review): assigning a 4-element list requires `df` to have exactly 4 rows when this
# cell runs, and it mutates `df` in place. Confirm which frame was live at the time.
df['a'] = list('aabc')
df.a.value_counts() / len(df.a)

a    0.50
b    0.25
c    0.25
Name: a, dtype: float64

In [66]:
# NOTE(review): the recorded output below looks like the relative frequencies of 'a', 'b', 'c',
# which this assignment alone cannot produce; the expression that computed it appears to be missing.
a = np.asarray(list('aabbccccaa'))

array([0.4, 0.2, 0.4])

In [49]:
# Check the "probabilities sum to 1" condition from DescitionTree.h on one example.
# NOTE(review): exact float equality happens to hold for this pair, but rounding can
# break it for other valid probability vectors.
x = [0.1, 0.9]
x = np.array(x)

np.sum(x) == 1

True

In [47]:
# Non-negativity condition from DescitionTree.h: the builtin all() iterates over the
# boolean array produced by the element-wise comparison.
all(x >= 0)

True

In [52]:
# A heavily skewed two-outcome distribution has entropy close to 0 bits.
DescitionTree.h([0.01, 0.99])

0.08079313589591118

In [23]:
import numpy as np

# argmax returns the position of the largest element (index 4 here), not the value itself.
np.argmax([1,4,2,3,5])

4

In [22]:
# Column names without the label column.
# NOTE(review): set('y') builds a set of the string's characters; it equals {'y'} only
# because the name is one character long. A longer name such as 'label' would become
# {'l', 'a', 'b', 'e'} and the subtraction would silently remove the wrong columns.
set(df.columns) - set('y')

{'a', 'b'}

In [9]:
import pandas as pd

In [12]:
# Small two-column numeric frame for quick experiments.
df = pd.DataFrame({"a":[1,2,3,3], "b": [7,8,8,9]})
df

Unnamed: 0,a,b
0,1,7
1,2,8
2,3,8
3,3,9


In [17]:
# Distinct values of a column as a plain Python list, in order of first appearance;
# _construct_subtree uses this form for Node.values.
df['a'].unique().tolist()

[1, 2, 3]