# Decision tree classifier from scratch

In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
from sklearn import datasets
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree, export_graphviz
import graphviz
import sys
sys.path.insert(0, 'C:/Users/grzesiek/Documents/Data Science/PycharmProjects/ML_from_scratch/Decision_tree_classyfier')

In [8]:
from decision_tree_cls import entropy, accuracy, Node, DecisionTreeCls

## Toy dataset

In [None]:
# Toy data: one observation per row, columns are (rain, time_walk).
X = np.array([
    [1, 30],
    [1, 15],
    [1, 5],
    [0, 10],
    [0, 5],
    [0, 15],
    [0, 20],
    [0, 25],
    [0, 30],
    [0, 30],
])
# Binary target, aligned row by row with X.
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [None]:
# Fit the from-scratch classifier on the toy data, with max_depth set to 3.
clf = DecisionTreeCls(max_depth=3)
clf.fit(X, y)
# Presumably renders the fitted tree as text using these column names -- confirm in decision_tree_cls.
clf.export_text(['rain', 'time_walk'])

In [None]:
# Decision regions of the from-scratch tree on the toy data.
# Evaluate the classifier on a dense grid padded slightly beyond the data range.
# NOTE: the time_walk axis used to read `np.linspace(min - 2, max) + 1`; the
# `+ 1` sat outside the call and shifted the whole axis, so the range actually
# plotted was [min - 1, max + 1].  The bounds below state that range directly.
feature_1, feature_2 = np.meshgrid(
    np.linspace(X[:, 0].min() - 0.1, X[:, 0].max() + 0.1),
    np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1),
)
# One grid point per row, columns in the same order as X (rain, time_walk).
grid = np.vstack([feature_1.ravel(), feature_2.ravel()]).T
# Refit on the toy data; whatever fit() returns is kept in `tree`.
tree = clf.fit(X, y)
# Predict every grid point and fold the result back into the grid's 2-D shape.
y_pred = np.reshape(clf.predict(grid), feature_1.shape)

display = DecisionBoundaryDisplay(
    xx0=feature_1, xx1=feature_2, response=y_pred, xlabel='rain', ylabel='time_walk'
)
display.plot()
# Overlay the training points coloured by class; the trailing `;` hides the cell's return value.
display.ax_.scatter(X[:, 0], X[:, 1], c=y, edgecolor="black");

In [None]:
# Reference tree from scikit-learn, fitted on the same toy data.
# The estimator must be created in this cell: with the constructor call
# commented out, `clf_sklearn.fit` raised NameError on a fresh kernel.
clf_sklearn = DecisionTreeClassifier()
clf_sklearn = clf_sklearn.fit(X, y)

# Export the fitted tree as DOT source (out_file=None returns it as a string).
dot_data = export_graphviz(
    clf_sklearn,
    out_file=None,
    feature_names=['rain', 'time_walk'],
    class_names=['not_go', 'go'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# Last expression of the cell, so the notebook renders the graph.
graph

## Breast cancer dataset

In [None]:
# Load the breast-cancer dataset bundled with scikit-learn and pull out
# the feature matrix and the target vector.
data = datasets.load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the rows for evaluation.  No random_state is passed, so the
# split (and every score derived from it) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Fit the from-scratch classifier on the training split, with max_depth set to 3.
clf = DecisionTreeCls(max_depth=3)
clf.fit(X_train, y_train)
# Presumably renders the fitted tree as text using the dataset's feature names -- confirm in decision_tree_cls.
clf.export_text(data['feature_names'])

In [None]:
# Feature importances of the from-scratch tree, largest first, rendered with
# in-cell bars.  The chained pandas calls imply feature_importance() returns a
# DataFrame with a 'feature_importance' column.
(
    clf.feature_importance(data['feature_names'])
    .sort_values(by='feature_importance', ascending=False)
    .style.bar('feature_importance')
)

In [None]:
# Predict the held-out rows with the from-scratch tree.
y_pred = clf.predict(X_test)
# Project helper; presumably the share of correct predictions -- confirm in decision_tree_cls.
accuracy(y_test, y_pred)

# Decision tree classifier from sklearn

## Parameters

In [None]:
# Every constructor argument of the scikit-learn tree used below, gathered in
# one dict so the whole configuration can be read (and edited) in one place.
parameters = {
    'criterion': 'entropy',           # "gini", "entropy" or "log_loss"
    'splitter': 'best',               # "best" or "random"
    'max_depth': 3,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    # int, float, "sqrt", "log2" or None.
    # NOTE(review): "auto" was accepted by older scikit-learn releases only --
    # check the installed version before using it.
    'max_features': None,
    'random_state': None,             # int, RandomState instance or None
    'max_leaf_nodes': None,           # int or None
    'min_impurity_decrease': 0.0,     # float
    'class_weight': None,             # dict, list of dict, "balanced" or None
    'ccp_alpha': 0.0,                 # non-negative float
}

# The dict keys are exactly the keyword names of the constructor, so it can be
# unpacked directly; fit() returns the estimator itself.
clf = DecisionTreeClassifier(**parameters).fit(X_train, y_train)

## Attributes

In [None]:
# Names exposed by the estimator that end with an underscore but do not start
# with one.  `and` with str.endswith / str.startswith replaces the bitwise `&`
# on indexed characters: same result, short-circuits, and cannot fail on an
# empty name.
atts = {att for att in dir(clf) if att.endswith('_') and not att.startswith('_')}
print('Attributes of tree:', atts)

In [None]:
# Feature importances of the scikit-learn tree as a one-column table indexed
# by feature name.
df = pd.DataFrame(
    {'feature_importance': clf.feature_importances_},
    index=data['feature_names'],
)

# Keep only the features with a positive importance, largest first, and
# render them with in-cell bars.
filt = df['feature_importance'] > 0
(
    df.loc[filt]
    .sort_values(by='feature_importance', ascending=False)
    .style.bar("feature_importance")
)

In [None]:
# Print the attributes stored on the estimator instance, leaving out the tree
# object and the importances, which are inspected in their own cells.
# The set of wanted names is computed once instead of on every iteration.
scalar_atts = atts - {'tree_', 'feature_importances_'}
for att, val in vars(clf).items():
    if att in scalar_atts:
        print(att, '=', val)

In [None]:
# List every name exposed by the low-level tree object held in `clf.tree_`.
print(dir(clf.tree_))

## Methods

In [None]:
# Estimator methods demonstrated one by one in the cells that follow.
meths = [
    "apply",
    "cost_complexity_pruning_path",
    "decision_path",
    "get_depth",
    "get_n_leaves",
    "get_params",
    "predict",
    "predict_log_proba",
    "predict_proba",
    "score",
    "set_params",
]
# Last expression of the cell, so the notebook echoes the list.
meths

In [None]:
# Index of the leaf that test row 2 ends up in; `[[2], :]` keeps the input two-dimensional.
clf.apply(X_test[[2],:])

In [None]:
# Nodes visited by test row 2 on its way to a leaf, printed as a sparse indicator matrix.
print(clf.decision_path(X_test[[2],:]))

In [None]:
# Depth of the fitted tree.
clf.get_depth()

In [None]:
# Number of leaves of the fitted tree.
clf.get_n_leaves()

In [None]:
# Constructor parameters of the estimator, returned as a dict.
clf.get_params()

In [None]:
# Predicted class for test row 2.
clf.predict(X_test[[2],:])

In [None]:
# Log of the predicted class probabilities for test row 2.
clf.predict_log_proba(X_test[[2],:])

In [None]:
# Predicted class probabilities for test rows 2 and 3 (one output row per input row).
clf.predict_proba(X_test[[2, 3],:])

In [None]:
# Mean accuracy of the scikit-learn tree on the held-out split.
clf.score(X_test,y_test)

## Display

In [None]:
# Plain-text rendering of the fitted scikit-learn tree; the feature names are
# converted from the dataset's array to a list before being passed in.
text = export_text(clf, feature_names=list(data['feature_names']))
print(text)

In [None]:
# Draw the fitted tree with matplotlib; the trailing `;` hides the cell's return value.
plot_tree(clf);

In [None]:
import os

# Put the Graphviz executables on PATH so the `graphviz` package can find them.
# The directory is added only when it is not already an entry, so re-running
# this cell no longer appends it again each time, and a missing PATH variable
# no longer raises KeyError.
graphviz_bin = 'C:/Program Files/Graphviz/bin'
current_path = os.environ.get("PATH", "")
# Wrap both sides in separators so only a whole PATH entry counts as a match.
already_listed = (
    os.pathsep + graphviz_bin + os.pathsep
    in os.pathsep + current_path + os.pathsep
)
if not already_listed:
    if current_path:
        os.environ["PATH"] = current_path + os.pathsep + graphviz_bin
    else:
        os.environ["PATH"] = graphviz_bin

In [None]:
# Export the fitted scikit-learn tree as DOT source (out_file=None returns it
# as a string), labelled with the dataset's feature and class names.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=data.feature_names,
    class_names=data.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# Last expression of the cell, so the notebook renders the graph.
graph