In [85]:
from sklearn import datasets
import numpy as np
from collections import defaultdict
from tqdm import tqdm

In [86]:
iris = datasets.load_iris()
data_ = iris['data']
labels = iris['target']
data_ = np.concatenate((data_,labels.reshape(-1,1)), axis=1)
np.random.shuffle(data_)

In [87]:
class DecisionNode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb


In [88]:
def get_counts(data):
    counts = defaultdict(int)
    for i in range(data.shape[0]):
        counts[data[i,-1]] += 1
    return counts

In [89]:
def entropy(data):
    from math import log
    log2 = lambda x:log(x)/log(2)
    ent = 0.0
    counts = get_counts(data)
    for cls_num in counts.values():
        p = float(cls_num) / data.shape[0]
        ent -= p*log2(p)
    return ent


In [90]:
def gini(data):
    counts = get_counts(data)
    gi = 0.0
    for cls_num in counts.values():
        p = float(cls_num) / data.shape[0]
        gi += p*(1 - p)
    return gi

In [91]:
def divide_set(data, feature, value):
    split_function = lambda row:row[feature] >= value
    set1 = np.array([data[i,...] for i in range(data.shape[0]) if split_function(data[i,...])])
    set2 = np.array([data[i,...] for i in range(data.shape[0]) if not split_function(data[i,...])])

    return set1, set2

In [92]:
def build_tree(data , score_function = entropy):
    if data.shape[0] == 0:return DecisionNode()
    current_score = score_function(data)

    best_gain = 0.0
    best_split = None
    best_sets = None

    for feature in range(data.shape[1]-1):
        for value in range(data.shape[0]):
            value = data[value,feature]
            set1, set2 = divide_set(data, feature, value)
            p = float(set1.shape[0]) / data.shape[0]
            gain = current_score - p*score_function(set1) - (1-p) * score_function(set2)
            if gain > best_gain and set1.shape[0] > 0 and set2.shape[0] > 0:
                best_gain = gain
                best_sets = set1, set2
                best_split = feature, value

    if best_gain > 0:
        ture_branch = build_tree(best_sets[0])
        false_branch = build_tree(best_sets[1])
        return DecisionNode(col=best_split[0], value=best_split[1],tb=ture_branch,fb=false_branch)
    else:
        return DecisionNode(results=get_counts(data))


In [93]:
def print_tree(tree, indent = ""):
    if tree.results is not None:
        print(str(list(tree.results.keys())[0]))
    else:
        print("input[{}]>={}?".format(tree.col, tree.value))
        print(indent+"T-> ",end="")
        print_tree(tree.tb, indent+"\t")
        print(indent+"F-> ",end="")
        print_tree(tree.fb,indent+"\t")


In [94]:
def evaluate_one(observation, tree):
    if tree.results is not None:
        return tree.results
    if observation[tree.col] >= tree.value:
        return evaluate_one(observation, tree.tb)
    else:
        return evaluate_one(observation, tree.fb)

In [95]:
def evaluate(data, tree):
    acc = 0
    for i in range(data.shape[0]):
        if list(evaluate_one(data[i,...], tree).keys())[0] == data[i, -1]:
            acc += 1
    return float(acc)/data.shape[0]
    

In [133]:
def cross_validation(data, k, prune_=0.1):
    size = data.shape[0]//k
    acc = 0.0
    for i in tqdm(range(k)):
        valid = data[i*size: (i+1)*size,...]
        train = np.concatenate((data[: i*size,...],data[(i+1)*size: ,...]), axis=0)
        tree = build_tree(train)
        prune(tree, prune_)
        acc += evaluate(valid, tree)

    acc /= k
    print("{}-Fold validation result is {}".format(k, acc))
    

In [134]:
def prune(tree, min_gain):
    if tree.tb.results is None:
        prune(tree.tb, min_gain)
    if tree.fb.results is None:
        prune(tree.fb, min_gain)

    if tree.fb.results is not None and tree.tb.results is not None:
        tb, fb = [], []
        for v,c in tree.tb.results.items():
            tb+=[[v]]*c
        for v,c in tree.fb.results.items():
            fb+=[[v]]*c
        tb = np.array(tb)
        fb = np.array(fb)
        #检查熵的减少量
        delta = entropy(np.concatenate((tb,fb), axis=0))-(entropy(tb)+entropy(fb)/2)
        if delta < min_gain:
            # 合并分支
            tree.tb,tree.fb = None,None
            tree.results = get_counts(np.concatenate((tb,fb), axis=0))



In [135]:
cross_validation(data_, 7)

100%|██████████| 7/7 [00:00<00:00,  8.72it/s]


7-Fold validation result is 0.9319727891156464
