In [49]:
import pandas as pd
import numpy as np
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
import pydot
import random

np.random.seed(42)
#random.seed(42)


class Dataset(object):
    def __init__(self, filename, header=None, set_header=True):
        
        self.filename = filename
        
        self.features_to_id = {}
        self.id_to_features = {}
        
        self.data = self.read_csv(self.filename, header, set_header=True)
        self.init(self.data)
    
    def read_csv(self, filename, header=None, set_header=False):
        
        if set_header:
            data_df = pd.read_csv(filename,header=0)
            data_df.columns = header
        
        else:
            data_df = pd.read_csv(filename,header=0)
    
        return data_df
    
    
    def init(self, data_df):
        
        cols_name = list(data_df.columns.values)
        
        for col in cols_name:
            
            self.features_to_id[col] = {}
            self.id_to_features[col] = {}
            
            data = data_df[col]
            
            names = list(set(list(data.values)))
            
            for i, name in enumerate(names):    
                self.features_to_id[col][name] = i
                self.id_to_features[col][i] = name
                
            
        return self.id_to_features, self.features_to_id
    
    def vectors(self):
        def map_feature_id(x):
            for col in self.features_to_id:
                x[col] = self.features_to_id[col][x[col]]
                
            return x

        return self.data.apply(func=map_feature_id, axis=1).values
    
    @staticmethod
    def split(data, test_size=0.4, random_state=42):
    
        test_index = data.sample(frac=test_size).index
        train_index = np.setdiff1d(data.index.values,test_index.values)

        train = data.loc[test_index]
        test = data.loc[train_index]

        return train, test
    
def preprocess(filename, header):
    
        data_df = pd.read_csv(filename,header=0)
        data_df.columns = header
    
        data_unacc = data_df.loc[data_df[header[-1]]=='unacc']
        train_unacc, test_unacc = Dataset.split(data_unacc)

        data_acc = data_df.loc[data_df[header[-1]] == 'acc']
        train_acc, test_acc = Dataset.split(data_acc)

        data_good = data_df.loc[data_df[header[-1]] == 'good']
        train_good, test_good = Dataset.split(data_good)

        data_vgood = data_df.loc[data_df[header[-1]] == 'vgood']
        train_vgood, test_vgood = Dataset.split(data_vgood)

        train = pd.concat([train_acc, train_unacc, train_good, train_vgood], axis=0).sample(frac=1).reset_index(drop=True)
        test = pd.concat([test_acc, test_unacc, test_good, test_vgood], axis=0).sample(frac=1).reset_index(drop=True)

        X_test = test[header[:-1]]
        y_test = test[header[-1]]

        train.to_csv('train.csv', header=header, index=0)
        X_test.to_csv('test_X.csv', header=header[:-1], index=0)
        y_test.to_csv('test_y.csv', header=header[-1], index=0)
        test.to_csv('test.csv', header=header, index=0)
        


def draw(tree, feature_names=[], class_names=[]):
    
    print(tree)
    
    export_graphviz(tree,out_file="tree.dot",class_names=class_names ,feature_names=feature_names, impurity=False,filled=True)
    
    (graph,) = pydot.graph_from_dot_file('tree.dot')
    
    graph.write_png('tree.png')

    
def train_tree(X_train, y_train, feature_names=[], class_names=[]):

    tree = DecisionTreeClassifier(criterion='entropy')
    tree.fit(X_train,y_train)
    
    print('Train score:{:.3f}'.format(tree.score(X_train,y_train)))
    
    draw(tree, feature_names, class_names)
    
    return tree

def test_tree(tree, X_test, y_test):
    
    print('Test score:{:.3f}'.format(tree.score(X_test,y_test)))
    
if __name__ == "__main__":
    
    filename = 'car.data.txt'
    header =['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
    
    #preprocess(filename, header)
    
    # load train set
    dataset = Dataset(filename, header, set_header=True)
    
    print(dataset.id_to_features)
    
    # train
    train = dataset.vectors()
    X = train[:, :-1]
    y = train[:, -1]
    
    tree = train_tree(X, y, feature_names=header[:-1], class_names=['acc', 'unacc', 'good', 'vgood'])
    
    # test
    test = Dataset('test.csv', header, set_header=True).vectors()
    X_test = test[:, :-1]
    y_test = test[:, -1]
    
    
    test_tree(tree, X_test, y_test)

    n_nodes = tree.tree_.node_count
    children_left = tree.tree_.children_left
    children_right = tree.tree_.children_right
    feature = tree.tree_.feature
    threshold = tree.tree_.threshold
    


    # The tree structure can be traversed to compute various properties such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If we have a test node
        if (children_left[node_id] != children_right[node_id]):
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    print("The binary tree structure has %s nodes and has "
          "the following tree structure:"
          % n_nodes)
    for i in range(n_nodes):
        if is_leaves[i]:
            print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
        else:
            print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
                  "node %s."
                  % (node_depth[i] * "\t",
                     i,
                     children_left[i],
                     header[feature[i]],
                     threshold[i],
                     children_right[i],
                     ))
    print()

    # First let's retrieve the decision path of each sample. The decision_path
    # method allows to retrieve the node indicator functions. A non zero element of
    # indicator matrix at the position (i, j) indicates that the sample i goes
    # through the node j.

    node_indicator = tree.decision_path(X_test)

    # Similarly, we can also have the leaves ids reached by each sample.

    leave_id = tree.apply(X_test)

    # Now, it's possible to get the tests that were used to predict a sample or
    # a group of samples. First, let's make it for the sample.

    sample_id = 0
    node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                        node_indicator.indptr[sample_id + 1]]

    print('Rules used to predict sample %s: ' % sample_id)
    for node_id in node_index:
        if leave_id[sample_id] == node_id:
            continue

        if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
            threshold_sign = "<="
        else:
            threshold_sign = ">"

        print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)"
              % (node_id,
                 sample_id,
                 header[feature[node_id]],
                 X_test[sample_id, feature[node_id]],
                 threshold_sign,
                 threshold[node_id]))

    # For a group of samples, we have the following common node.
    sample_ids = [0, 1]
    common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) ==
                    len(sample_ids))

    common_node_id = np.arange(n_nodes)[common_nodes]

    print("\nThe following samples %s share the node %s in the tree"
          % (sample_ids, common_node_id))
    print("It is %s %% of all nodes." % (100 * len(common_node_id) / n_nodes,))






{'buying': {0: 'med', 1: 'vhigh', 2: 'low', 3: 'high'}, 'maint': {0: 'med', 1: 'vhigh', 2: 'low', 3: 'high'}, 'doors': {0: '4', 1: '5more', 2: '2', 3: '3'}, 'persons': {0: '4', 1: '2', 2: 'more'}, 'lug_boot': {0: 'small', 1: 'big', 2: 'med'}, 'safety': {0: 'med', 1: 'low', 2: 'high'}, 'class': {0: 'good', 1: 'acc', 2: 'unacc', 3: 'vgood'}}
Train score:1.000
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Test score:0.540
The binary tree structure has 405 nodes and has the following tree structure:
node=0 test node: go to node 1 if X[:, 5] <= 1.5 else to node 214.
	node=1 test node: go to node 2 if X[:, 5] <= 0.5 else to node 213.
		node=2 test node: go to node 3 if X[:, 4] <= 0.5 else t

KeyError: 5

In [43]:
"""
=========================================
Understanding the decision tree structure
=========================================

The decision tree structure can be analysed to gain further insight on the
relation between the features and the target to predict. In this example, we
show how to retrieve:

- the binary tree structure;
- the depth of each node and whether or not it's a leaf;
- the nodes that were reached by a sample using the ``decision_path`` method;
- the leaf that was reached by a sample using the apply method;
- the rules that were used to predict a sample;
- the decision path shared by a group of samples.

"""
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

tree = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
tree.fit(X_train, y_train)

# The decision estimator has an attribute called tree_  which stores the entire
# tree structure and allows access to low level attributes. The binary tree
# tree_ is represented as a number of parallel arrays. The i-th element of each
# array holds information about the node `i`. Node 0 is the tree's root. NOTE:
# Some of the arrays only apply to either leaves or split nodes, resp. In this
# case the values of nodes of the other type are arbitrary!
#
# Among those arrays, we have:
#   - left_child, id of the left child of the node
#   - right_child, id of the right child of the node
#   - feature, feature used for splitting the node
#   - threshold, threshold value at the node
#

# Using those arrays, we can parse the tree structure:

n_nodes = tree.tree_.node_count
children_left = tree.tree_.children_left
children_right = tree.tree_.children_right
feature = tree.tree_.feature
threshold = tree.tree_.threshold


# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # If we have a test node
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
for i in range(n_nodes):
    if is_leaves[i]:
        print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
    else:
        print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
              "node %s."
              % (node_depth[i] * "\t",
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i],
                 ))
print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = tree.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = tree.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    if leave_id[sample_id] == node_id:
        continue

    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)"
          % (node_id,
             sample_id,
             feature[node_id],
             X_test[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))

# For a group of samples, we have the following common node.
sample_ids = [0, 1]
common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) ==
                len(sample_ids))

common_node_id = np.arange(n_nodes)[common_nodes]

print("\nThe following samples %s share the node %s in the tree"
      % (sample_ids, common_node_id))
print("It is %s %% of all nodes." % (100 * len(common_node_id) / n_nodes,))


The binary tree structure has 5 nodes and has the following tree structure:
node=0 test node: go to node 1 if X[:, 3] <= 0.800000011920929 else to node 2.
	node=1 leaf node.
	node=2 test node: go to node 3 if X[:, 2] <= 4.950000047683716 else to node 4.
		node=3 leaf node.
		node=4 leaf node.

Rules used to predict sample 0: 
decision id node 0 : (X_test[0, 3] (= 2.4) > 0.800000011920929)
decision id node 2 : (X_test[0, 2] (= 5.1) > 4.950000047683716)

The following samples [0, 1] share the node [0 2] in the tree
It is 40.0 % of all nodes.
