In [1]:
import pandas as pd
import numpy as np

$\lambda$

Decision Tree CART:
1. 注意对Tree做预剪枝，需要留意的是最大深度self.max_depth以及最小分裂self.min_splits
2. 将不同样品的feature和label合并成训练集(X, Y)
3. 根据训练集去build tree
    -  Gini Index: measures the probability that a randomly chosen element would be classified incorrectly if it were labeled at random according to the class distribution of the node. It is computed as $Gini = 1 - \sum_{i=1}^{n} (P_i)^2$, where $P_i$ denotes the probability of an element belonging to class $i$. The Gini index lies between 0 and 1, where 0 means the node is pure (all elements belong to a single class). Note: Information Gain is used in the ID3 and C4.5 algorithms, whereas CART uses the Gini index.

In [None]:
class DecisionTree(object):
    """CART-style classification tree that splits on the Gini index.

    The fitted tree is stored in ``self.root`` as nested dicts with the keys
    ``'index'`` (feature column), ``'val'`` (threshold), ``'left'`` and
    ``'right'``. A child is either another such dict or a bare class label
    (a leaf). Rows with ``row[index] <= val`` belong to the left child.

    Pre-pruning is controlled by two parameters:

    * ``max_depth``  -- nodes at this depth get leaves as children.
    * ``min_splits`` -- a group with at most this many rows becomes a leaf.
    """

    def __init__(self, _max_depth, _min_splits):
        """Store the pre-pruning limits.

        :param _max_depth: maximum depth of the tree (the root is depth 1)
        :param _min_splits: groups with ``len(group) <= _min_splits`` are not split further
        """
        self.max_depth = _max_depth
        self.min_splits = _min_splits

    def fit(self, _feature, _label):
        """Merge features and labels into one training matrix and build the tree.

        :param _feature: feature values, one row per sample
        :param _label: class label of each sample
        :raises ValueError: if there are no samples or no feature columns
        """
        self.feature = _feature
        self.label = _label
        # The label becomes the last column; every other method relies on that.
        self.train_data = np.column_stack((self.feature, self.label))
        if self.train_data.shape[0] == 0 or self.train_data.shape[1] < 2:
            raise ValueError("fit() needs at least one sample and one feature column")
        self.build_tree()

    def compute_gini_similarity(self, groups, class_labels):
        """Compute the size-weighted Gini index of a candidate split.

        :param groups: iterable of 2-D arrays (one per child node) whose last column is the label
        :param class_labels: the label values to take into account
        :return: weighted Gini index; 0.0 when every non-empty group is pure
        """
        num_sample = sum(len(group) for group in groups)
        gini_score = 0.0
        for group in groups:
            size = float(len(group))
            if size == 0:
                # An empty group contributes nothing (and would divide by zero).
                continue
            labels = group[:, -1]
            # Sum of squared class proportions: 1.0 for a pure group.
            purity = sum(((labels == label).sum() / size) ** 2 for label in class_labels)
            # Weight each group's impurity by its share of the samples.
            gini_score += (1.0 - purity) * (size / num_sample)
        return gini_score

    def terminal_node(self, _group):
        """Return the most common label of a group, used as the leaf prediction.

        :param _group: non-empty 2-D array whose last column is the label
        :return: the majority label; ties resolve to the smallest label because
                 ``np.unique`` sorts and ``np.argmax`` returns the first maximum
        """
        class_labels, count = np.unique(_group[:, -1], return_counts=True)
        return class_labels[np.argmax(count)]

    def split(self, index, val, data):
        """Partition rows by comparing one feature column with a threshold.

        :param index: feature column to test
        :param val: threshold value
        :param data: 2-D array of rows
        :return: ``(data_left, data_right)`` with ``row[index] <= val`` on the
                 left and ``row[index] > val`` on the right; rows for which both
                 comparisons are false (e.g. NaN) end up in neither group
        """
        data = np.asarray(data)
        column = data[:, index]
        # Boolean masks keep the (0, n_columns) shape for empty groups and avoid
        # growing the result one row at a time.
        return data[column <= val], data[column > val]

    def best_split(self, data):
        """Find the feature/threshold pair with the lowest weighted Gini index.

        :param data: 2-D array whose last column is the label
        :return: dict with keys ``'index'``, ``'val'`` and ``'groups'``; the
                 first two are ``None`` and ``'groups'`` is ``None`` when
                 ``data`` offers no candidate split
        """
        # Only the label column defines the classes.
        class_labels = np.unique(data[:, -1])
        best_index = None
        best_val = None
        best_score = float('inf')
        best_groups = None
        for index in range(data.shape[1] - 1):
            # Each distinct feature value is tried once as a threshold.
            for val in np.unique(data[:, index]):
                groups = self.split(index, val, data)
                gini_score = self.compute_gini_similarity(groups, class_labels)
                if gini_score < best_score:
                    best_score = gini_score
                    best_val = val
                    best_index = index
                    best_groups = groups
        result = {}
        result['index'] = best_index
        result['val'] = best_val
        result['groups'] = best_groups
        return result

    def _grow_child(self, group, depth):
        """Return a leaf label for a small group, otherwise a recursively split subtree."""
        if len(group) <= self.min_splits:
            return self.terminal_node(group)
        child = self.best_split(group)
        self.split_branch(child, depth + 1)
        return child

    def split_branch(self, node, depth):
        """Replace ``node['groups']`` with ``'left'``/``'right'`` children, recursively.

        :param node: dict produced by :meth:`best_split`; modified in place
        :param depth: depth of ``node`` (the root is depth 1)
        """
        left_node, right_node = node['groups']
        del node['groups']

        # The split separated nothing: both children predict the majority label
        # of all rows that reached this node.
        if len(left_node) == 0 or len(right_node) == 0:
            leaf = self.terminal_node(np.vstack((left_node, right_node)))
            node['left'] = leaf
            node['right'] = leaf
            return
        # Pre-pruning: depth limit reached.
        if depth >= self.max_depth:
            node['left'] = self.terminal_node(left_node)
            node['right'] = self.terminal_node(right_node)
            return
        node['left'] = self._grow_child(left_node, depth)
        node['right'] = self._grow_child(right_node, depth)

    def build_tree(self):
        """Build the tree from ``self.train_data`` and return its root node."""
        self.root = self.best_split(self.train_data)
        self.split_branch(self.root, 1)
        return self.root

    def _predict(self, node, row):
        """Walk from ``node`` down to a leaf for one sample and return its label.

        :param node: tree node (dict) or leaf label to start from
        :param row: feature values of one sample
        """
        while isinstance(node, dict):
            # Same rule as split(): values equal to the threshold go left.
            if row[node['index']] <= node['val']:
                node = node['left']
            else:
                node = node['right']
        return node

    # Original (misspelled) method name, kept so existing callers keep working.
    _predit = _predict

    def predict(self, test_data):
        """Predict a label for every row of ``test_data``.

        :param test_data: 2-D array-like of feature rows
        :return: array of predicted labels, also stored in ``self.predicted_label``
        """
        self.predicted_label = np.array(
            [self._predict(self.root, row) for row in np.asarray(test_data)]
        )
        return self.predicted_label

In [7]:
# Toy label list used to check how np.unique and np.argmax pick the majority class.
List_train = [1, 2, 1, 3, 4, 4, 2, 3, 2, 1]
unique_with_counts = np.unique(List_train, return_counts=True)
label = unique_with_counts[0]
count = unique_with_counts[1]
print(label, count)
# argmax returns the first maximum, so a tie resolves to the smallest label.
majority_position = np.argmax(count)
print(label[majority_position])

[1 2 3 4] [3 3 2 2]
1


In [4]:
# Print each distinct label next to its frequency, one pair per line.
for pair in zip(label, count):
    print(*pair)

1 3
2 3
3 2
4 2


In [11]:
A = [[1, 1, 0], [1, 0, 1], [0, 0, 0]]
n_rows = len(A)
# Swap each row in the top half with its mirror row in the bottom half,
# reversing both rows on the way; a middle row (odd row count) is left as is.
for top in range(n_rows // 2):
    bottom = n_rows - 1 - top
    reversed_bottom = A[bottom][::-1]
    reversed_top = A[top][::-1]
    A[top] = reversed_bottom
    A[bottom] = reversed_top
print(A)

[[0, 0, 0], [1, 0, 1], [0, 1, 1]]


In [12]:
# range(0) is empty, so the loop body never runs and nothing is printed.
for idx in range(0):
    print(idx)