## 决策树

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import math
from math import log
import pprint

### 1.获取数据

In [3]:
def create_data():
    """Return the example loan-application table and its column names.

    Returns:
        A ``(rows, column_names)`` tuple. ``rows`` is a list of 15 lists of
        strings (four feature values followed by the class value) and
        ``column_names`` is the list of the five column names.
    """
    # One sample per line; columns are separated by whitespace.
    raw_table = """
        青年 否 否 一般 否
        青年 否 否 好 否
        青年 是 否 好 是
        青年 是 是 一般 是
        青年 否 否 一般 否
        中年 否 否 一般 否
        中年 否 否 好 否
        中年 是 是 好 是
        中年 否 是 非常好 是
        中年 否 是 非常好 是
        老年 否 是 非常好 是
        老年 否 是 好 是
        老年 是 否 好 是
        老年 是 否 非常好 是
        老年 否 否 一般 否
    """
    rows = [line.split() for line in raw_table.strip().splitlines()]
    column_names = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']
    # Hand back the samples together with the name of every column.
    return rows, column_names

In [4]:
# Build the example rows and their column names; `label` is later read as a
# global by info_gain_train when it prints feature names.
data, label = create_data()
# Bare expression: echoes the raw rows as the cell output.
data

[['青年', '否', '否', '一般', '否'],
 ['青年', '否', '否', '好', '否'],
 ['青年', '是', '否', '好', '是'],
 ['青年', '是', '是', '一般', '是'],
 ['青年', '否', '否', '一般', '否'],
 ['中年', '否', '否', '一般', '否'],
 ['中年', '否', '否', '好', '否'],
 ['中年', '是', '是', '好', '是'],
 ['中年', '否', '是', '非常好', '是'],
 ['中年', '否', '是', '非常好', '是'],
 ['老年', '否', '是', '非常好', '是'],
 ['老年', '否', '是', '好', '是'],
 ['老年', '是', '否', '好', '是'],
 ['老年', '是', '否', '非常好', '是'],
 ['老年', '否', '否', '一般', '否']]

### 2.数据展示

In [5]:
# Number the rows 1..N. N is derived from the data rather than hard-coded, so
# the cell keeps working if rows are added to or removed from the dataset.
df = pd.DataFrame(data, index=np.arange(1, len(data) + 1), columns=label)

In [6]:
# Bare expression: renders the DataFrame as the cell output.
df

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
1,青年,否,否,一般,否
2,青年,否,否,好,否
3,青年,是,否,好,是
4,青年,是,是,一般,是
5,青年,否,否,一般,否
6,中年,否,否,一般,否
7,中年,否,否,好,否
8,中年,是,是,好,是
9,中年,否,是,非常好,是
10,中年,否,是,非常好,是


In [7]:
# Per-column summary of the object-dtype columns (count / unique / top / freq)
df.describe(include=["object"])

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
count,15,15,15,15,15
unique,3,2,2,3,2
top,老年,否,否,好,是
freq,5,10,9,6,9


In [10]:
# Save the data. The target directory is created first because to_csv does not
# create missing parent directories and would fail on a fresh checkout.
import os

os.makedirs("./data", exist_ok=True)
df.to_csv("./data/贷款.csv")

### 3.计算信息增益

In [11]:
#计算每一个属性的条件信息增益

def cal_ent(dataset):
    """Compute the empirical entropy (base 2) of the class labels.

    Args:
        dataset: Indexable sequence of rows; the last element of each row is
            used as that row's class label.

    Returns:
        ``-sum(p_k * log2(p_k))`` over the distinct labels, where ``p_k`` is
        the fraction of rows carrying label ``k``.
    """
    total = len(dataset)

    # Tally how many rows carry each class label.
    tally = {}
    for row_idx in range(total):
        cls = dataset[row_idx][-1]
        tally[cls] = tally.get(cls, 0) + 1

    # One p * log2(p) term per distinct label, in first-seen order.
    terms = []
    for n in tally.values():
        terms.append((n / total) * (log(n / total, 2)))

    return -sum(terms)


def cond_ent(dataset, axis=0):
    """Compute the empirical conditional entropy of the labels given one feature.

    Args:
        dataset: Indexable sequence of rows; the last element of each row is
            the class label (as consumed by ``cal_ent``).
        axis: Position within a row of the feature to condition on.

    Returns:
        The sum over the feature's distinct values of
        ``(rows with that value / total rows) * cal_ent(rows with that value)``.
    """
    total = len(dataset)

    # Partition the rows by the value they take at position `axis`.
    groups = {}
    for row_idx in range(total):
        row = dataset[row_idx]
        groups.setdefault(row[axis], []).append(row)

    # Weight each partition's entropy by its share of the rows.
    weighted = [(len(rows) / total) * cal_ent(rows) for rows in groups.values()]
    return sum(weighted)

def info_gain(ent, cond_ent):
    """Return the information gain: entropy minus conditional entropy."""
    gain = ent - cond_ent
    return gain

def info_gain_train(dataset, feature_names=None):
    """Print the information gain of every feature and name the best one.

    Args:
        dataset: Indexable sequence of rows. The last element of each row is
            the class label; every earlier position is a feature.
        feature_names: Optional sequence of display names indexed by feature
            position. When omitted, the notebook-level ``label`` list is used,
            which is what this function relied on before the parameter
            existed.

    Returns:
        None. The per-feature gains and the selected feature are written to
        stdout.
    """
    # Prefer an explicit argument over the notebook global so the function can
    # be reused with other datasets.
    names = label if feature_names is None else feature_names

    feature_count = len(dataset[0]) - 1
    ent = cal_ent(dataset)  # entropy of the whole dataset

    best_feature = []

    for c in range(feature_count):
        c_info_gain = info_gain(ent, cond_ent(dataset, axis=c))
        best_feature.append((c, c_info_gain))
        print("特征:{}, 信息增益：{}".format(names[c], c_info_gain))

    # max() keeps the first entry on ties, i.e. the lowest feature index.
    best_ = max(best_feature, key=lambda x: x[-1])
    print("特征{}的信息增益最大，选择作为根节点".format(names[best_[0]]))

In [12]:
# Print the information gain of every feature, passing the rows as a NumPy array.
info_gain_train(np.array(data))

特征:年龄, 信息增益：0.08300749985576883
特征:有工作, 信息增益：0.32365019815155627
特征:有自己的房子, 信息增益：0.4199730940219749
特征:信贷情况, 信息增益：0.36298956253708536
特征有自己的房子的信息增益最大，选择作为根节点


### 4. ID3算法（以信息增益作为根节点的划分准则）

In [29]:
# Decision-tree node (multiway: one child per observed value of the split feature)
class Node:
    """A node of the decision tree.

    Args:
        root: ``True`` marks a terminal node: ``predict`` returns ``label``
            without looking at the input. ``False`` marks an internal node
            that dispatches on a feature value.
        label: Class label returned by a terminal node.
        feature_name: Name of the feature this node splits on.
        feature: Position of the split feature within the row handed to this
            node's ``predict``.
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        self.tree = {}  # feature value -> child Node
        # Snapshot used by __repr__. The 'label:' key (with its trailing
        # colon) is kept as-is so the printed tree stays unchanged.
        self.result = {
            'label:': self.label,
            'feature': self.feature,
            'tree': self.tree
        }

    def __repr__(self):
        return '{}'.format(self.result)

    def add_node(self, val, node):
        """Attach ``node`` as the child followed when the split feature equals ``val``."""
        self.tree[val] = node

    def predict(self, features):
        """Return the predicted label for one sample.

        Args:
            features: Sequence of feature values for this node's subtree.

        Returns:
            The label stored on the terminal node that is reached.

        Raises:
            KeyError: If the sample's value for a split feature has no child.
        """
        if self.root is True:
            return self.label
        # DTree.train drops the chosen column before recursing, so each child's
        # `feature` index refers to the row *without* this node's feature.
        # Remove it here as well; passing the full row down would make the
        # child read the wrong position whenever the dropped column precedes
        # the child's split feature.
        remaining = [v for i, v in enumerate(features) if i != self.feature]
        return self.tree[features[self.feature]].predict(remaining)
        
class DTree:
    """Decision tree that splits on the feature with the largest information gain.

    Args:
        epsilon: Threshold on the best information gain; when the best gain is
            below it, the node becomes a terminal node carrying the most
            frequent label.
    """

    def __init__(self, epsilon=0.1):
        self._tree = {}
        self.epsilon = epsilon

    @staticmethod
    def calc_ent(datasets):
        """Empirical entropy (base 2) of the labels; a row's label is its last element."""
        total = len(datasets)
        tally = {}
        for idx in range(total):
            cls = datasets[idx][-1]
            tally[cls] = tally.get(cls, 0) + 1
        return -sum((n / total) * log(n / total, 2) for n in tally.values())

    def cond_ent(self, datasets, axis=0):
        """Empirical conditional entropy of the labels given the feature at ``axis``."""
        total = len(datasets)
        # Partition the rows by the value they take at position `axis`.
        groups = {}
        for idx in range(total):
            row = datasets[idx]
            groups.setdefault(row[axis], []).append(row)
        return sum((len(rows) / total) * self.calc_ent(rows)
                   for rows in groups.values())

    @staticmethod
    def info_gain(ent, cond_ent):
        """Information gain: entropy minus conditional entropy."""
        gain = ent - cond_ent
        return gain

    def info_gain_train(self, datasets):
        """Return ``(feature_index, gain)`` for the feature with the largest gain.

        Every position of a row except the last is treated as a feature. On
        ties the lowest feature index wins.
        """
        n_features = len(datasets[0]) - 1
        base_ent = self.calc_ent(datasets)
        gains = [
            (col, self.info_gain(base_ent, self.cond_ent(datasets, axis=col)))
            for col in range(n_features)
        ]
        return max(gains, key=lambda pair: pair[-1])

    @staticmethod
    def _majority_label(y):
        """Most frequent value of the label Series ``y``."""
        return y.value_counts().sort_values(ascending=False).index[0]

    def train(self, train_data):
        """Recursively build the tree from a pandas DataFrame.

        The last column of ``train_data`` holds the class label and all other
        columns are features. Returns the ``Node`` at the top of the
        (sub)tree built from ``train_data``.
        """
        y_train = train_data.iloc[:, -1]
        features = train_data.columns[:-1]

        # 1. Every sample has the same label: terminal node with that label.
        if len(y_train.value_counts()) == 1:
            return Node(root=True, label=y_train.iloc[0])

        # 2. No feature left to split on: terminal node with the majority label.
        if len(features) == 0:
            return Node(root=True, label=self._majority_label(y_train))

        # 3. Pick the feature with the largest information gain. The index is
        #    relative to the columns of this (possibly reduced) frame.
        max_feature, max_info_gain = self.info_gain_train(np.array(train_data))
        max_feature_name = features[max_feature]

        # 4. Gain below the threshold: terminal node with the majority label.
        if max_info_gain < self.epsilon:
            return Node(root=True, label=self._majority_label(y_train))

        # 5. Internal node: one child per observed value of the chosen feature.
        node_tree = Node(
            root=False, feature_name=max_feature_name, feature=max_feature)

        for value in train_data[max_feature_name].value_counts().index:
            matching = train_data.loc[train_data[max_feature_name] == value]
            subset = matching.drop([max_feature_name], axis=1)
            # 6. Recurse on the subset with the used feature removed.
            node_tree.add_node(value, self.train(subset))

        return node_tree

    def fit(self, train_data):
        """Build the tree from ``train_data``, store it, and return its top node."""
        self._tree = self.train(train_data)
        return self._tree

    def predict(self, X_test):
        """Predict the label of one sample by delegating to the stored tree."""
        return self._tree.predict(X_test)
    
    
# Demo: fit the tree on the example data, print it, and classify one sample.
if __name__ == "__main__":
    datasets, labels = create_data()
    data_df = pd.DataFrame(datasets, columns=labels)
    dt = DTree()
    # fit() returns the top Node, whose repr is the nested dict printed below.
    tree = dt.fit(data_df)
    print("tree:\n",tree)
    print("测试结果：", end="")
    # One sample: a value for each of the four feature columns, in column order.
    print(dt.predict(["老年","否", "否", "一般"]))


tree:
 {'label:': None, 'feature': 2, 'tree': {'否': {'label:': None, 'feature': 1, 'tree': {'否': {'label:': '否', 'feature': None, 'tree': {}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}
测试结果：否


### 5. C4.5算法（以信息增益比作为划分标准）

In [30]:
# Decision-tree node (multiway: one child per observed value of the split feature)
class Node:
    """A node of the decision tree.

    Args:
        root: ``True`` marks a terminal node: ``predict`` returns ``label``
            without looking at the input. ``False`` marks an internal node
            that dispatches on a feature value.
        label: Class label returned by a terminal node.
        feature_name: Name of the feature this node splits on.
        feature: Position of the split feature within the row handed to this
            node's ``predict``.
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        self.tree = {}  # feature value -> child Node
        # Snapshot used by __repr__. The 'label:' key (with its trailing
        # colon) is kept as-is so the printed tree stays unchanged.
        self.result = {
            'label:': self.label,
            'feature': self.feature,
            'tree': self.tree
        }

    def __repr__(self):
        return '{}'.format(self.result)

    def add_node(self, val, node):
        """Attach ``node`` as the child followed when the split feature equals ``val``."""
        self.tree[val] = node

    def predict(self, features):
        """Return the predicted label for one sample.

        Args:
            features: Sequence of feature values for this node's subtree.

        Returns:
            The label stored on the terminal node that is reached.

        Raises:
            KeyError: If the sample's value for a split feature has no child.
        """
        if self.root is True:
            return self.label
        # DTree.train drops the chosen column before recursing, so each child's
        # `feature` index refers to the row *without* this node's feature.
        # Remove it here as well; passing the full row down would make the
        # child read the wrong position whenever the dropped column precedes
        # the child's split feature.
        remaining = [v for i, v in enumerate(features) if i != self.feature]
        return self.tree[features[self.feature]].predict(remaining)

class DTree:
    """Decision tree that splits on the feature with the largest gain ratio.

    Args:
        epsilon: Threshold on the best gain ratio; when the best ratio is
            below it, the node becomes a terminal node carrying the most
            frequent label.
    """

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon
        self._tree = {}

    @staticmethod
    def calc_ent(datasets):
        """Empirical entropy (base 2) of the labels; a row's label is its last element."""
        data_length = len(datasets)
        label_count = {}
        for i in range(data_length):
            label = datasets[i][-1]
            if label not in label_count:
                label_count[label] = 0
            label_count[label] += 1
        ent = -sum([(p / data_length) * log(p / data_length, 2)
                    for p in label_count.values()])
        return ent

    def cond_ent(self, datasets, axis=0):
        """Return ``(conditional entropy, split entropy)`` for the feature at ``axis``.

        The first value is the entropy of the labels given the feature; the
        second is the entropy of the feature's own value distribution, which
        ``info_gain`` uses as the denominator of the gain ratio.
        """
        data_length = len(datasets)
        feature_sets = {}
        for i in range(data_length):
            feature = datasets[i][axis]
            if feature not in feature_sets:
                feature_sets[feature] = []
            feature_sets[feature].append(datasets[i])
        cond_ent = sum([(len(p) / data_length) * self.calc_ent(p)
                        for p in feature_sets.values()])
        sub_ent = -sum([len(p) / data_length * log(len(p) / data_length, 2)
                        for p in feature_sets.values()])
        return cond_ent, sub_ent

    @staticmethod
    def info_gain(ent, cond_ent, sub_ent):
        """Return the gain ratio ``(ent - cond_ent) / sub_ent``.

        When ``sub_ent`` is zero the feature takes a single value in this
        subset and cannot split it, so the ratio is defined as 0.0 instead of
        raising ``ZeroDivisionError``.
        """
        if sub_ent == 0:
            return 0.0
        return (ent - cond_ent) / sub_ent

    def info_gain_train(self, datasets):
        """Return ``(feature_index, gain_ratio)`` for the best feature.

        Every position of a row except the last is treated as a feature. On
        ties the lowest feature index wins.
        """
        count = len(datasets[0]) - 1
        ent = self.calc_ent(datasets)
        best_feature = []
        for c in range(count):
            cond_ent, sub_ent = self.cond_ent(datasets, axis=c)
            c_info_gain = self.info_gain(ent, cond_ent, sub_ent)
            best_feature.append((c, c_info_gain))
        best_ = max(best_feature, key=lambda x: x[-1])
        return best_

    def train(self, train_data):
        """Recursively build the tree from a pandas DataFrame.

        The last column of ``train_data`` holds the class label and all other
        columns are features. Returns the ``Node`` at the top of the
        (sub)tree built from ``train_data``.
        """
        y_train = train_data.iloc[:, -1]
        features = train_data.columns[:-1]

        # 1. Every sample has the same label: terminal node with that label.
        if len(y_train.value_counts()) == 1:
            return Node(root=True, label=y_train.iloc[0])

        # 2. No feature left to split on: terminal node with the majority label.
        if len(features) == 0:
            return Node(root=True,
                        label=y_train.value_counts().sort_values(
                            ascending=False).index[0])

        # 3. Pick the feature with the largest gain ratio. The index is
        #    relative to the columns of this (possibly reduced) frame.
        max_feature, max_info_gain = self.info_gain_train(np.array(train_data))
        max_feature_name = features[max_feature]

        # 4. Ratio below the threshold: terminal node with the majority label.
        if max_info_gain < self.epsilon:
            return Node(root=True,
                        label=y_train.value_counts().sort_values(
                            ascending=False).index[0])

        # 5. Internal node: one child per observed value of the chosen feature.
        node_tree = Node(
            root=False, feature_name=max_feature_name, feature=max_feature)

        feature_list = train_data[max_feature_name].value_counts().index
        for f in feature_list:
            sub_train_df = train_data.loc[train_data[max_feature_name] ==
                                          f].drop([max_feature_name], axis=1)
            # 6. Recurse on the subset with the used feature removed.
            sub_tree = self.train(sub_train_df)
            node_tree.add_node(f, sub_tree)

        return node_tree

    def fit(self, train_data):
        """Build the tree from ``train_data``, store it, and return its top node."""
        self._tree = self.train(train_data)
        return self._tree

    def predict(self, X_test):
        """Predict the label of one sample by delegating to the stored tree."""
        return self._tree.predict(X_test)


# Demo at module level. It used to be indented into the class body, where
# `DTree()` resolved to the previously defined class (this one was not bound
# yet) and the demo variables became class attributes.
if __name__ == "__main__":
    datasets, labels = create_data()
    data_df = pd.DataFrame(datasets, columns=labels)
    dt = DTree()
    tree = dt.fit(data_df)
    print("测试结果：", end="")
    print(dt.predict(["老年","否", "否", "一般"]))


测试结果：否
