# Decision tree classifier from scratch

In [2]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn import datasets
from sklearn.model_selection import train_test_split
import os
import sys

# Directory that contains the local `decision_tree_clf` module. The default is the
# original machine-specific location; set the DECISION_TREE_CLF_DIR environment
# variable to run the notebook on another machine without editing this cell.
MODULE_DIR = os.environ.get(
    'DECISION_TREE_CLF_DIR',
    'C:/Users/grzesiek/Documents/Data Science/PycharmProjects/ML_from_scratch/Decision_tree_clf',
)
# Prepend so this directory is searched first when importing the module.
sys.path.insert(0, MODULE_DIR)

In [3]:
from decision_tree_clf import entropy, accuracy, Node, DecisionTreeCls

In [4]:
# Load the breast-cancer dataset bundled with scikit-learn.
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Hold out 20% of the samples for testing. The fixed seed makes the split, and
# therefore every tree and score computed from it, reproducible on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Fit the from-scratch tree (imported from decision_tree_clf) with max_depth=3
# on the training split, then show its structure using the dataset's feature names.
clf = DecisionTreeCls(max_depth=3)
clf.fit(X_train, y_train)
clf.export_text(data['feature_names'])

Root node: n_samples = 455, 
            splitting_feature = mean concave points, 
            splitting_threshold = 0.05102
|--- Node right: n_samples = 178, 
            splitting_feature = worst perimeter, 
            splitting_threshold = 114.3
|   |--- Leaf node right: 0
|   |--- Node left: n_samples = 42, 
            splitting_feature = worst texture, 
            splitting_threshold = 25.47
|   |   |--- Leaf node right: 0
|   |   |--- Leaf node left: 1
|--- Node left: n_samples = 277, 
            splitting_feature = worst area, 
            splitting_threshold = 760.2
|   |--- Node right: n_samples = 38, 
            splitting_feature = mean texture, 
            splitting_threshold = 18.77
|   |   |--- Leaf node right: 0
|   |   |--- Leaf node left: 1
|   |--- Node left: n_samples = 239, 
            splitting_feature = area error, 
            splitting_threshold = 44.96
|   |   |--- Leaf node right: 1
|   |   |--- Leaf node left: 1


In [6]:
# Predict on the held-out split and score the predictions with the project's
# `accuracy` helper.
# NOTE(review): presumably the fraction of correctly predicted labels; confirm in decision_tree_clf.
y_pred = clf.predict(X_test)
accuracy(y_test, y_pred)

0.9210526315789473

# Decision tree classifier from sklearn

## Parameters

In [1]:
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree, export_graphviz
import graphviz

In [7]:
# Hyper-parameters for sklearn's DecisionTreeClassifier, gathered in one dict so
# they can be tuned in a single place.
parameters = {
    'criterion': 'entropy',  # {"gini", "entropy", "log_loss"}
    'splitter': 'best',  # {"best", "random"}
    'max_depth': 5,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': None,  # int, float or {"sqrt", "log2"}, default=None ("auto" was removed in scikit-learn 1.3)
    # Fixed seed: features are randomly permuted at each split even with
    # splitter="best", so an integer seed is needed for a deterministic fit.
    'random_state': 42,  # int, RandomState instance or None, default=None
    'max_leaf_nodes': None,  # int, default=None
    'min_impurity_decrease': 0.0,  # float, default=0.0
    'class_weight': None,  # dict, list of dict or "balanced", default=None
    'ccp_alpha': 0.0,  # non-negative float, default=0.0
}

# Unpack the dict instead of repeating every key by hand, so the constructor
# call can never drift out of sync with the parameters defined above.
clf = DecisionTreeClassifier(**parameters)
clf = clf.fit(X_train, y_train)

## Attributes

In [8]:
# Fitted attributes: public names on the estimator that end with an underscore
# (names that also start with an underscore are private and are skipped).
atts = {att for att in dir(clf) if att.endswith('_') and not att.startswith('_')}
# A set has no stable iteration order, so sort it for a reproducible printout.
print('Attributes of tree:', sorted(atts))

Attributes of tree: {'feature_importances_', 'n_features_in_', 'tree_', 'classes_', 'n_classes_', 'max_features_', 'n_features_', 'n_outputs_'}


In [15]:
# Feature importances of the fitted tree, one row per dataset feature.
df = pd.DataFrame(
    {'feature_importance': clf.feature_importances_},
    index=data['feature_names'],
)
# Keep only features with importance above zero, largest first, rendered as bars.
(
    df.loc[df['feature_importance'] > 0]
    .sort_values(by='feature_importance', ascending=False)
    .style.bar("feature_importance")
)

Unnamed: 0,feature_importance
mean concave points,0.652594
worst perimeter,0.11152
worst texture,0.103498
worst area,0.074447
mean symmetry,0.015175
worst smoothness,0.012761
area error,0.012341
perimeter error,0.00843
smoothness error,0.004617
worst concavity,0.004617


In [10]:
# Print the remaining fitted attributes stored on the estimator instance,
# i.e. everything in `atts` except the tree object and the feature importances.
# Only entries of the instance __dict__ are visited, so properties are not printed.
for att, val in vars(clf).items():
    if att not in atts or att in {'tree_', 'feature_importances_'}:
        continue
    print(att, '=', val)

n_features_in_ = 30
n_outputs_ = 1
classes_ = [0 1]
n_classes_ = 2
max_features_ = 30


In [11]:
# List every name exposed by the low-level tree object stored in clf.tree_
# (dunder methods included).
print(dir(clf.tree_))

['__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', 'apply', 'capacity', 'children_left', 'children_right', 'compute_feature_importances', 'compute_partial_dependence', 'decision_path', 'feature', 'impurity', 'max_depth', 'max_n_classes', 'n_classes', 'n_features', 'n_leaves', 'n_node_samples', 'n_outputs', 'node_count', 'predict', 'threshold', 'value', 'weighted_n_node_samples']


## Methods

In [None]:
# Names of the classifier's methods; most of them are demonstrated one per cell
# in the cells that follow.
meths = [
    'apply',
    'cost_complexity_pruning_path',
    'decision_path',
    'get_depth',
    'get_n_leaves',
    'get_params',
    'predict',
    'predict_log_proba',
    'predict_proba',
    'score',
    'set_params',
]
meths

In [None]:
# Index of the leaf node reached by the test row at index 2
# (the list index [[2], :] keeps the input two-dimensional).
clf.apply(X_test[[2],:])

In [None]:
# Nodes the test row at index 2 passes through, printed as a sparse indicator matrix.
print(clf.decision_path(X_test[[2],:]))

In [None]:
# Depth of the fitted tree.
clf.get_depth()

In [None]:
# Number of leaves in the fitted tree.
clf.get_n_leaves()

In [None]:
# Hyper-parameters the estimator is currently configured with.
clf.get_params()

In [None]:
# Predicted class label for the test row at index 2.
clf.predict(X_test[[2],:])

In [None]:
# Log of the predicted class probabilities for the test row at index 2.
clf.predict_log_proba(X_test[[2],:])

In [None]:
# Predicted class probabilities for the test rows at indices 2 and 3.
clf.predict_proba(X_test[[2, 3],:])

In [None]:
# Mean accuracy of the fitted classifier on the held-out test split.
clf.score(X_test,y_test)

## Display

In [None]:
# Plain-text rendering of the fitted tree's decision rules. The feature names
# are converted to a plain list before being handed to export_text.
text = export_text(clf, feature_names=list(data['feature_names']))
print(text)

In [None]:
# Draw the fitted tree with matplotlib. Feature and class names are passed so the
# nodes show real names instead of generic placeholders, matching the Graphviz
# export in this notebook; `filled` colours each node by its majority class.
# The trailing semicolon suppresses the repr of the returned annotation list.
plot_tree(
    clf,
    feature_names=list(data['feature_names']),
    class_names=list(data['target_names']),
    filled=True,
    rounded=True,
);

In [None]:
# Export the fitted tree as Graphviz DOT source (out_file=None returns it as a
# string), labelled with the dataset's feature and class names.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=data.feature_names,
    class_names=data.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
# Wrap the DOT source so the notebook displays it as the cell's output.
graph = graphviz.Source(dot_data)
graph