# Decision Tree Exercises

In [6]:
#data manipulation
import pandas as pd
import numpy as np

#visualization
import matplotlib.pyplot as plt
import seaborn as sns

#stats is great
from scipy import stats

#my own files with my own functions
import acquire
import prepare

# os is operating system stuff, few things I know
# env is my py file to access SQL databases
import os
import env

# If I decide to retrieve other datasets but they'll be raw
from pydataset import data

# ML stuff: (modeling imports)
from sklearn.model_selection import train_test_split

# The big 4 for classification
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression #logistic not linear!
from sklearn.neighbors import KNeighborsClassifier #pick the classifier one

# Evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Using the `titanic data`, in your classification-exercises repository, create a notebook, `decision_tree.ipynb` where you will do the following:

In [7]:
df = acquire.get_titanic_data()
df.head()

this file exists, reading csv


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [8]:
df = prepare.clean_titanic(df)
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,Southampton,1


In [9]:
train, validate, test = prepare.splitting_data(df, 'survived', seed=123)

print(train.shape)
print(validate.shape)
print(test.shape)
train.head()

(427, 10)
(142, 10)
(143, 10)


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
702,702,0,3,female,18.0,0,1,14.4542,Cherbourg,0
199,199,0,2,female,24.0,0,0,13.0,Southampton,1
108,108,0,3,male,38.0,0,0,7.8958,Southampton,1
872,872,0,1,male,33.0,0,0,5.0,Southampton,1
827,827,1,2,male,1.0,0,2,37.0042,Cherbourg,0


In [10]:
def preprocess_titanic(train_df, val_df, test_df):
    '''
    preprocess_titanic will take our 3 split df I did from the Titanic,
    which are clean also (see documentation on acquire.py and prepare.py)
    
    output:
    encoded, ML-ready versions of our clean data, with the sex and embark_town
    columns fully encoded in the one-hot fashion return: (pd.DataFrame, pd.DataFrame, pd.DataFrame)
    we get three df's basically.
    '''
    # with a looping structure:
    # for df in [train_df, val_df, test_df]:
    #    df.drop(blah blah blah)
    #    df['plcass'] = df['pclass'].astype(int)
    train_df = train_df.drop(columns = 'passenger_id')
    train_df['pclass'] = train_df['pclass'].astype(int)
    
    val_df = val_df.drop(columns = 'passenger_id')
    val_df['pclass'] = val_df['pclass'].astype(int)
    
    test_df = test_df.drop(columns = 'passenger_id')
    test_df['pclass'] = test_df['pclass'].astype(int)
    
    encoding_var = ['sex', 'embark_town']
    encoded_dfs = []
    for df in [train_df, val_df, test_df]:
        df_encoded_cats = pd.get_dummies(
            df[encoding_var], drop_first = True).astype(int)
        encoded_dfs.append(pd.concat(
            [df, df_encoded_cats], axis=1).drop(columns = encoding_var))
    return encoded_dfs

In [11]:
train, validate, test = preprocess_titanic(train, validate, test)
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
702,0,3,18.0,0,1,14.4542,0,0,0,0
199,0,2,24.0,0,0,13.0,1,0,0,1
108,0,3,38.0,0,0,7.8958,1,1,0,1
872,0,1,33.0,0,0,5.0,1,1,0,1
827,1,2,1.0,0,2,37.0042,0,1,0,0


In [12]:
#all are numerical, clean and fully encoded!
train.dtypes 

survived                     int64
pclass                       int64
age                        float64
sibsp                        int64
parch                        int64
fare                       float64
alone                        int64
sex_male                     int64
embark_town_Queenstown       int64
embark_town_Southampton      int64
dtype: object

In [13]:
### We want everything EXCEPT the target variable
X_train = train.drop(columns = 'survived')
X_validate = validate.drop(columns = 'survived')
X_test = test.drop(columns = 'survived')

In [14]:
### We want ONLY the target variable
y_train = train.survived
y_validate = validate.survived
y_test = test.survived

## 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [15]:
### baseline prediction is not surviving (Perished)
df.survived.mode()

0    0
Name: survived, dtype: int64

In [16]:
### baseline accuracy is ~60%
### meaning if we choose the mode, we'd be right 60% of the time
(df.survived == 0).mean()

0.5955056179775281

## 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [17]:
# create the object
tree = DecisionTreeClassifier() ###here is where max_depth would go
tree

In [18]:
# fit the object
tree.fit(X_train, y_train) #fit only the TRAIN DATA!

In [41]:
# store my predicted values
y_pred = tree.predict(X_train)
print(y_pred.shape)

y_pred[:10]

(427,)


array([0, 0, 0, 0, 1, 1, 1, 0, 0, 0])

## 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [20]:
#accuracy score
tree.score(X_train, y_train)

0.990632318501171

In [21]:
#not misty's favorite
confusion_matrix(y_train, y_pred) #rows, columns

array([[254,   0],
       [  4, 169]])

In [22]:
# Misty preferred
# columns are predicted, rows are actual values
counts = pd.crosstab(y_train, y_pred, rownames=['actual'], colnames=['pred'])
counts

pred,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,254,0
1,4,169


In [23]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       254
           1       1.00      0.98      0.99       173

    accuracy                           0.99       427
   macro avg       0.99      0.99      0.99       427
weighted avg       0.99      0.99      0.99       427



## 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

positive = 1 (survived)

In [24]:
# columns are predicted, rows are actual values
counts = pd.crosstab(y_train, y_pred, rownames=['actual'], colnames=['pred'])
counts

pred,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,254,0
1,4,169


In [25]:
tp = counts.iloc[1,1]
tn = counts.iloc[0,0]
fp = counts.iloc[0,1]
fn = counts.iloc[1,0]

In [26]:
tp, tn, fp, fn

(169, 254, 0, 4)

In [27]:
def compute_class_metrics(y_train, y_pred):
    
    counts = pd.crosstab(y_train, y_pred)
    TP = counts.iloc[1,1]
    TN = counts.iloc[0,0]
    FP = counts.iloc[0,1]
    FN = counts.iloc[1,0]
    
    
    all_ = (TP + TN + FP + FN)

    accuracy = (TP + TN) / all_

    TPR = recall = TP / (TP + FN)
    FPR = FP / (FP + TN)

    TNR = TN / (FP + TN)
    FNR = FN / (FN + TP)

    precision =  TP / (TP + FP)
    f1 =  2 * ((precision * recall) / ( precision + recall))

    support_pos = TP + FN
    support_neg = FP + TN
    
    print(f"Accuracy: {accuracy}\n")
    print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
    print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
    print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
    print(f"False Negative Rate/Miss Rate: {FNR}\n")
    print(f"Precision/PPV: {precision}")
    print(f"F1 Score: {f1}\n")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")

In [30]:
compute_class_metrics(y_train, y_pred)

Accuracy: 0.990632318501171

True Positive Rate/Sensitivity/Recall/Power: 0.976878612716763
False Positive Rate/False Alarm Ratio/Fall-out: 0.0
True Negative Rate/Specificity/Selectivity: 1.0
False Negative Rate/Miss Rate: 0.023121387283236993

Precision/PPV: 1.0
F1 Score: 0.9883040935672515

Support (0): 173
Support (1): 254


In [24]:
### Kelsey found that normalize = True gives us metrics on the rates the question asks for
### Misty found that normalize = 'index' is the way that it'll calculate it correctly! very handy, originally made a function

pd.crosstab(y_train, y_pred, normalize='index')

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.0,0.0
1,0.023121,0.976879


## 5. Run through steps 2-4 using a different max_depth value.

In [25]:
for x in range(1, 21):
    print(x)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [26]:
for x in range(1, 21):
    #create the object
    tree = DecisionTreeClassifier(max_depth=x)

    #fit the object
    tree.fit(X_train, y_train) #fit only the TRAIN DATA!
    
    acc = tree.score(X_train, y_train)
    
    print(f'with a max depth of {x}, the accuracy is {round(acc,2)}')

with a max depth of 1, the accuracy is 0.79
with a max depth of 2, the accuracy is 0.8
with a max depth of 3, the accuracy is 0.81
with a max depth of 4, the accuracy is 0.84
with a max depth of 5, the accuracy is 0.85
with a max depth of 6, the accuracy is 0.87
with a max depth of 7, the accuracy is 0.89
with a max depth of 8, the accuracy is 0.92
with a max depth of 9, the accuracy is 0.93
with a max depth of 10, the accuracy is 0.95
with a max depth of 11, the accuracy is 0.96
with a max depth of 12, the accuracy is 0.97
with a max depth of 13, the accuracy is 0.98
with a max depth of 14, the accuracy is 0.99
with a max depth of 15, the accuracy is 0.99
with a max depth of 16, the accuracy is 0.99
with a max depth of 17, the accuracy is 0.99
with a max depth of 18, the accuracy is 0.99
with a max depth of 19, the accuracy is 0.99
with a max depth of 20, the accuracy is 0.99


## 6. Which model performs better on your in-sample data?

> model with max depth of 14 is best!

## 7. Which model performs best on your out-of-sample data, the validate set?

In [27]:
for x in range(1, 14):
    #create the object
    tree = DecisionTreeClassifier(max_depth=x)

    #fit the object
    tree.fit(X_train, y_train) #fit only the TRAIN DATA!
    
    #calculate the accuracy for train
    acc = tree.score(X_train, y_train)
    
    #calculate the accuracy for validate
    acc_v = tree.score(X_validate, y_validate)

    
    print(f'with a max depth of {x}, the accuracy train = {round(acc,2)}), val = {round(acc_v,2)}')

with a max depth of 1, the accuracy train = 0.79), val = 0.77
with a max depth of 2, the accuracy train = 0.8), val = 0.79
with a max depth of 3, the accuracy train = 0.81), val = 0.8
with a max depth of 4, the accuracy train = 0.84), val = 0.83
with a max depth of 5, the accuracy train = 0.85), val = 0.8
with a max depth of 6, the accuracy train = 0.87), val = 0.77
with a max depth of 7, the accuracy train = 0.89), val = 0.77
with a max depth of 8, the accuracy train = 0.91), val = 0.73
with a max depth of 9, the accuracy train = 0.93), val = 0.74
with a max depth of 10, the accuracy train = 0.94), val = 0.75
with a max depth of 11, the accuracy train = 0.96), val = 0.72
with a max depth of 12, the accuracy train = 0.97), val = 0.75
with a max depth of 13, the accuracy train = 0.98), val = 0.72


> many models overfit, the max depth of 4 performs best!

### Telco Dataset (optional)

#### 1. Work through these same exercises using the Telco dataset.

#### 2. Experiment with this model on other datasets with a higher number of output classes.