## Decision Trees

- Simple examples that show the following

    - Train a decision tree classifier

    - Evaluate the model

    - Find important Features in Decision Trees
    
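Note that the iris and breast cancer examples below use a single train/test split rather than k-fold cross-validation; k-fold cross-validation appears only in the final section.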


### Set up

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers. A plain string comparison is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass the check.
_sklearn_major_minor = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_major_minor >= (0, 20)

# Common imports
import numpy as np
import pandas as pd
import os

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# to make this notebook's output stable across runs
np.random.seed(42)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension); also echoed in the progress message.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str, default "png"
        File extension, also passed to ``savefig`` as the output format.
    resolution : int, default 300
        Passed to ``savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

### Important features from a decision tree using gini index

- Decision tree classifier

https://scikit-learn.org/stable/modules/tree.html#classification

- Metrics

https://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
def plot_feature_importances(clf, feature_names, ax=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    clf : fitted estimator
        Any object exposing a ``feature_importances_`` sequence.
    feature_names : sequence of str
        One label per feature, in the same order as ``clf.feature_importances_``.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, the current pyplot axes are used,
        which matches the behavior of calling the pyplot functions directly.

    Returns
    -------
    The Axes that was drawn on.

    Raises
    ------
    ValueError
        If the number of names differs from the number of importances.
    """
    importances = clf.feature_importances_
    c_features = len(feature_names)
    # Bar positions come from feature_names while bar widths come from the
    # model, so a mismatch would label the bars incorrectly or fail deep
    # inside matplotlib; fail early with a clear message instead.
    if len(importances) != c_features:
        raise ValueError(
            "Got {} feature names for {} feature importances".format(
                c_features, len(importances)
            )
        )

    if ax is None:
        ax = plt.gca()

    ax.barh(range(c_features), importances)
    ax.set_xlabel("Feature importance")
    ax.set_ylabel("Feature name")
    # One tick per bar, labelled with the corresponding feature name.
    ax.set_yticks(np.arange(c_features))
    ax.set_yticklabels(feature_names)
    return ax


## Dataset 1: iris

In [None]:
from IPython.display import Image
from sklearn.datasets import load_iris

# Dataset object; later cells read its data, target, feature_names and
# target_names attributes.
iris = load_iris()

# Last expression of the cell, so the notebook renders the picture.
Image("images/iris.png")

### Summary of important features

### Train-test split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# One hold-out split (a single training run); the fixed seed keeps the
# partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=3
)

### 1. Important features from a decision tree using gini index

#### Training the model

In [None]:
# Gini-impurity tree. random_state is pinned so that tie-breaking between
# equally good splits (and therefore the reported feature importances) does
# not depend on the global NumPy RNG state or on cell execution order.
clf = DecisionTreeClassifier(criterion='gini', random_state=42)
clf.fit(X_train, y_train)

#### Evaluate the model and show important features

In [None]:
# Accuracy on the data the tree was fitted on vs. the held-out split.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Accuracy of DT classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of DT classifier on test set: {:.2f}'.format(test_accuracy))

# Bar chart of the importances, drawn by the helper defined earlier.
plt.figure(figsize=(8, 4), dpi=60)
plot_feature_importances(clf, iris.feature_names)
plt.show()

# Raw importance values, in the same order as iris.feature_names.
print('Feature importances: {}'.format(clf.feature_importances_))

### Important features from a decision tree using information gain measure (entropy)

In [None]:
### Train the model

# Entropy (information gain) tree. random_state is pinned so that
# tie-breaking between equally good splits (and therefore the reported
# feature importances) does not depend on the global NumPy RNG state or on
# cell execution order.
clf2 = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf2.fit(X_train, y_train)

In [None]:
### Evaluate the model and show important features.

In [None]:
# Accuracy of the entropy tree on the training data and the held-out split.
train_accuracy = clf2.score(X_train, y_train)
test_accuracy = clf2.score(X_test, y_test)
print('Accuracy of DT classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of DT classifier on test set: {:.2f}'.format(test_accuracy))

# Bar chart of the entropy tree's feature importances.
plt.figure(figsize=(8, 4), dpi=60)
plot_feature_importances(clf2, iris.feature_names)
plt.show()

# Raw importance values, in the same order as iris.feature_names.
print('Feature importances: {}'.format(clf2.feature_importances_))

## Dataset 2: Breast Cancer dataset

In [None]:
from sklearn.datasets import load_breast_cancer

# Breast cancer dataset for classification.
# The dataset object is kept because later cells read cancer.feature_names
# for the plot labels.
cancer = load_breast_cancer()

# Feature matrix and label vector, requested directly as an (X, y) pair.
X_cancer, y_cancer = load_breast_cancer(return_X_y=True)

### Training-testing split

In [None]:
# One hold-out split of the breast cancer data; the fixed seed keeps the
# partition identical on every run. This rebinds the split variables that
# previously held the iris data.
X_train, X_test, y_train, y_test = train_test_split(
    X_cancer,
    y_cancer,
    random_state=0
)

### Important features from a decision tree using gini index

In [None]:
# Gini tree limited to depth 4 with at least 8 samples per leaf.
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    min_samples_leaf=8,
    random_state=0
)
clf.fit(X_train, y_train)

# Accuracy on the training data and on the held-out split.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Accuracy of DT classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of DT classifier on test set: {:.2f}'.format(test_accuracy))

# Feature-importance bar chart; tight_layout is applied before showing.
plt.figure(figsize=(10, 6), dpi=80)
plot_feature_importances(clf, cancer.feature_names)
plt.tight_layout()
plt.show()

In [None]:
# Entropy tree with the same depth / leaf-size limits as the gini tree above.
# Note: this rebinds `clf`, replacing the gini model.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    min_samples_leaf=8,
    random_state=0
)
clf.fit(X_train, y_train)

# Accuracy on the training data and on the held-out split.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Accuracy of DT classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of DT classifier on test set: {:.2f}'.format(test_accuracy))

# Feature-importance bar chart; tight_layout is applied before showing.
plt.figure(figsize=(10, 6), dpi=80)
plot_feature_importances(clf, cancer.feature_names)
plt.tight_layout()
plt.show()

### k-fold cross-validation

#### Instructions: 

<p style='color:red'> 
Please make a summary of the model performance (averaging k folds' results) using result_metrics_dict. Currently it shows the content of the dictionary. 
</p> 

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold  # k-fold cross-validation splitter
from sklearn.datasets import load_iris

iris = load_iris()

X = iris.data[:, 2:]  # petal length and width
y = iris.target

# Class labels for the reports; constant across folds, so looked up once.
target_names = iris.target_names

# Split into 3 shuffled folds. random_state is fixed so fold membership (and
# therefore every per-fold report) is the same on every run.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)

# TODO: summarize the model performance (averaging the k folds' results)
# using result_metrics_dict.
for train_index, test_index in kf.split(X):
    # Note: this rebinds the X_train / X_test / y_train / y_test names used
    # by earlier cells.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # The same estimator object is re-fitted on each fold's training part.
    tree_clf.fit(X_train, y_train)
    y_pred = tree_clf.predict(X_test)

    # Print the text classification report for this fold
    result_metrics = classification_report(y_test, y_pred, target_names=target_names)

    print(result_metrics)

    # Same metrics as a dictionary, so individual values can be accessed by key
    result_metrics_dict = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)

    print(result_metrics_dict)