实现基于信息增益的未剪枝决策树算法（ID3）

In [11]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import math


In [12]:
class Node:
    """One node of a decision tree.

    A node created with ``root=True`` acts as a leaf: ``predict`` returns its
    ``label`` directly.  A node created with ``root=False`` is a split node:
    ``predict`` looks up ``features[self.feature]`` in ``tree`` and delegates
    to the matching child.

    Args:
        root: ``True`` for a leaf node, ``False`` for a split node.
        label: Class label returned by a leaf node.
        feature_name: Column name of the feature this node splits on.
        feature: Position of the split feature inside the sample that is
            passed to ``predict``.
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        # Maps a feature value to the child Node that handles that value.
        self.tree = {}
        # Dict printed by __repr__; "tree" aliases self.tree, so children
        # added later through add_node show up in the representation.
        self.result = {"label": self.label,
                       "feature": self.feature,
                       "tree": self.tree}

    def __repr__(self):
        return '{}'.format(self.result)

    def add_node(self, val, node):
        """Attach ``node`` as the child reached when the split feature equals ``val``."""
        self.tree[val] = node

    def predict(self, features):
        """Return the label predicted for the sample ``features``.

        Raises:
            KeyError: If the sample's value for a split feature has no child
                in ``tree``.
        """
        if self.root is True:
            return self.label
        return self.tree[features[self.feature]].predict(features)


class DeciTree:
    """ID3-style decision tree grown with information gain, without pruning.

    Args:
        epsilon: Information-gain threshold.  When the best achievable gain
            is below this value the node becomes a leaf labelled with the
            majority class.
    """

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon
        self._tree = {}

    @staticmethod
    def calculate_entropy(datasets):
        """Return the base-2 entropy of the labels in ``datasets``.

        Each row's label is its last element.
        """
        length = len(datasets)
        label_count = {}
        for row in datasets:
            label = row[-1]
            label_count[label] = label_count.get(label, 0) + 1

        return -sum((count / length) * math.log(count / length, 2)
                    for count in label_count.values())

    def cal_cond_entropy(self, datasets, axis=0):
        """Return the entropy of the labels conditioned on column ``axis``."""
        length = len(datasets)
        feature_sets = {}
        for row in datasets:
            feature_sets.setdefault(row[axis], []).append(row)

        return sum((len(rows) / length) * self.calculate_entropy(rows)
                   for rows in feature_sets.values())

    @staticmethod
    def info_gain(entropy, cond_entropy):
        """Return the information gain ``entropy - cond_entropy``."""
        return entropy - cond_entropy

    def info_gain_train(self, datasets):
        """Return ``(column_index, gain)`` for the column with the largest gain.

        Every column except the last (the label) is a candidate; on ties the
        lowest column index wins.
        """
        count = len(datasets[0]) - 1
        entropy = self.calculate_entropy(datasets)
        gains = [
            (axis, self.info_gain(entropy,
                                  self.cal_cond_entropy(datasets, axis=axis)))
            for axis in range(count)
        ]
        return max(gains, key=lambda item: item[-1])

    @staticmethod
    def _majority_label(y_train):
        """Return the most frequent label in the Series ``y_train``."""
        return y_train.value_counts().sort_values(ascending=False).index[0]

    def train(self, train_data, feature_positions=None):
        """Recursively build the tree for ``train_data`` and return its root Node.

        Args:
            train_data: DataFrame whose last column holds the labels and
                whose other columns are the features.
            feature_positions: Mapping from feature column name to its
                position in the original training frame.  Left as ``None``
                by external callers; it is derived from ``train_data`` on the
                outermost call and handed down through the recursion.

        Returns:
            The Node at the root of the (sub)tree.
        """
        y_train = train_data.iloc[:, -1]
        features = train_data.columns[:-1]
        if feature_positions is None:
            # Outermost call: remember where each feature sits in a full
            # sample.  Sub-frames drop already-used columns, so positions
            # taken from them would not match the sample given to predict().
            feature_positions = {
                name: position for position, name in enumerate(features)
            }

        # ID3 step 1: every sample has the same class -> leaf with that class.
        if len(y_train.value_counts()) == 1:
            return Node(root=True, label=y_train.iloc[0])
        # ID3 step 2: no features left -> leaf with the majority class.
        if len(features) == 0:
            return Node(root=True, label=self._majority_label(y_train))
        # ID3 step 3: pick the feature with the largest information gain.
        max_feature, max_info_gain = self.info_gain_train(np.array(train_data))
        max_feature_name = features[max_feature]

        # ID3 step 4: gain below the threshold -> leaf with the majority class.
        if max_info_gain < self.epsilon:
            return Node(root=True, label=self._majority_label(y_train))

        # ID3 step 5: split on the chosen feature and recurse per value.
        node_tree = Node(
            root=False,
            feature_name=max_feature_name,
            feature=feature_positions[max_feature_name])

        feature_list = train_data[max_feature_name].value_counts().index

        for f in feature_list:
            sub_train_df = train_data.loc[train_data[max_feature_name] == f].drop(
                [max_feature_name], axis=1)
            sub_tree = self.train(sub_train_df, feature_positions)
            node_tree.add_node(f, sub_tree)

        return node_tree

    def fit(self, train_data):
        """Build the tree from ``train_data``, store it, and return its root Node."""
        self._tree = self.train(train_data)
        return self._tree

    def predict(self, x_test):
        """Return the label predicted for one sample ``x_test``.

        ``x_test`` lists the feature values in the column order of the frame
        passed to ``fit``.

        Raises:
            AttributeError: If called before ``fit``.
        """
        if not hasattr(self._tree, "predict"):
            raise AttributeError(
                "DeciTree has not been fitted; call fit() first")
        return self._tree.predict(x_test)


In [13]:
# Load the iris dataset and assemble features plus target into one table.
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
target = pd.DataFrame(iris.target)
# The target frame's single column is named 0; join appends it as the last column.
datasets = data.join(target)
datasets

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),0
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [14]:
# Fit the decision tree on the iris table; the last column is used as the label.
dt = DeciTree()
tree = dt.fit(datasets)
# Bare expression: displays the fitted tree (via Node.__repr__) as the cell output.
tree


{'label': None, 'feature': 2, 'tree': {1.5: {'label': 0, 'feature': None, 'tree': {}}, 1.4: {'label': 0, 'feature': None, 'tree': {}}, 5.1: {'label': None, 'feature': 0, 'tree': {5.8: {'label': 2, 'feature': None, 'tree': {}}, 6.3: {'label': 2, 'feature': None, 'tree': {}}, 6.9: {'label': 2, 'feature': None, 'tree': {}}, 5.9: {'label': 2, 'feature': None, 'tree': {}}, 6.5: {'label': 2, 'feature': None, 'tree': {}}, 6.0: {'label': 1, 'feature': None, 'tree': {}}}}, 4.5: {'label': None, 'feature': 0, 'tree': {6.0: {'label': 1, 'feature': None, 'tree': {}}, 4.9: {'label': 2, 'feature': None, 'tree': {}}, 6.2: {'label': 1, 'feature': None, 'tree': {}}, 5.7: {'label': 1, 'feature': None, 'tree': {}}, 5.4: {'label': 1, 'feature': None, 'tree': {}}, 5.6: {'label': 1, 'feature': None, 'tree': {}}, 6.4: {'label': 1, 'feature': None, 'tree': {}}}}, 1.3: {'label': 0, 'feature': None, 'tree': {}}, 1.6: {'label': 0, 'feature': None, 'tree': {}}, 5.6: {'label': 2, 'feature': None, 'tree': {}}, 4.0: 