# Exercise 
- Using the titanic data, in your classification-exercises repository, create a notebook, decision_tree.ipynb where you will do the following:

In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


from pydataset import data
from prepare import *

In [8]:
df = prep_titanic()

In [23]:
train, validate, test = train_test_validate(df, 'survived')

#### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [24]:
# "0" for not survived is the mode
train.survived.value_counts()


0    307
1    191
Name: survived, dtype: int64

In [26]:
# set baseline column in dataframe 
train['survived_baseline'] = 0
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,survived_baseline
583,583,0,1,male,36.0,0,0,40.125,Cherbourg,1,1,0,0,0
22,22,1,3,female,15.0,0,0,8.0292,Queenstown,1,0,1,0,0
878,878,0,3,male,29.699118,0,0,7.8958,Southampton,1,1,0,1,0
15,15,1,2,female,55.0,0,0,16.0,Southampton,1,0,0,1,0
101,101,0,3,male,29.699118,0,0,7.8958,Southampton,1,1,0,1,0


In [28]:
# Get baseline accuracy
baseline_accuracy = (train.survived_baseline == train.survived).mean()
baseline_accuracy

0.6164658634538153

#### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [59]:
x_cols = train.columns.to_list()
drop = ['sex', 'embark_town', 'survived']
x_cols = [col for col in x_cols if col not in drop]
y_cols = "survived"
x_cols


['passenger_id',
 'pclass',
 'age',
 'sibsp',
 'parch',
 'fare',
 'alone',
 'sex_male',
 'embark_town_Queenstown',
 'embark_town_Southampton',
 'survived_baseline']

In [156]:
clf = DecisionTreeClassifier()
clf.fit(train[x_cols], train[y_cols])

In [157]:
model1_preds = clf.predict(train[x_cols])


In [134]:
titanic_model = pd.DataFrame(train.survived)
titanic_model = titanic_model.rename(columns={'survived': "actual"})
titanic_model['model1'] = model1_preds
titanic_model

Unnamed: 0,actual,model1
583,0,0
22,1,1
878,0,0
15,1,1
101,0,0
...,...,...
573,1,1
95,0,0
396,0,0
245,0,0


#### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [90]:
clf.score(train[x_cols], train[y_cols])

1.0

In [109]:

matrix = confusion_matrix(train.survived, model1_preds[0], labels=(0, 1))
matrix

array([[307,   0],
       [  0, 191]])

In [92]:
print(classification_report(train[y_cols], model1_preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       307
           1       1.00      1.00      1.00       191

    accuracy                           1.00       498
   macro avg       1.00      1.00      1.00       498
weighted avg       1.00      1.00      1.00       498



#### 4.Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [114]:
TN = matrix[0,0]
FN = matrix[0,1]
FP = matrix[1, 0]
TP = matrix[1,1]


In [135]:
survived_accuracy = (titanic_model.model1 == titanic_model.actual).mean()
survived_accuracy

1.0

In [139]:
subset = titanic_model[titanic_model.actual == 1]
titanic_model_recall = (subset.model1 == subset.actual).mean()
titanic_model_recall

1.0

In [142]:
subset = titanic_model[titanic_model.model1 == 1]
model_precision = (subset.model1 == subset.actual).mean()
model_precision

1.0

In [144]:
titanic_model_F1 = 2 * (titanic_model_recall * model_precision) / (titanic_model_recall + model_precision)
titanic_model_F1

1.0

In [150]:
model_support = [(TP + FP), (TN + FN)]
model_support

[191, 307]

#### 5. Run through steps 2-4 using a different max_depth value.

In [171]:
clf2 = DecisionTreeClassifier(max_depth=2)
clf2.fit(train[x_cols], train[y_cols])

In [162]:
model2_preds = clf2.predict(train[x_cols])

In [163]:
titanic_model['model2'] = model2_preds
titanic_model

Unnamed: 0,actual,model1,model2
583,0,0,0
22,1,1,0
878,0,0,0
15,1,1,1
101,0,0,0
...,...,...,...
573,1,1,0
95,0,0,0
396,0,0,0
245,0,0,0


In [164]:
clf2.score(train[x_cols], train[y_cols])

0.7911646586345381

In [165]:
print(classification_report(train[y_cols], model2_preds))

              precision    recall  f1-score   support

           0       0.76      0.98      0.85       307
           1       0.93      0.49      0.64       191

    accuracy                           0.79       498
   macro avg       0.84      0.73      0.75       498
weighted avg       0.82      0.79      0.77       498



In [170]:
clf3 =  DecisionTreeClassifier(max_depth=3)
clf3.fit(train[x_cols], train[y_cols])

In [172]:
model3_preds = clf3.predict(train[x_cols])

In [173]:
titanic_model['model3'] = model3_preds
titanic_model

Unnamed: 0,actual,model1,model2,model3
583,0,0,0,0
22,1,1,0,1
878,0,0,0,0
15,1,1,1,1
101,0,0,0,0
...,...,...,...,...
573,1,1,0,1
95,0,0,0,0
396,0,0,0,1
245,0,0,0,0


In [174]:
clf3.score(train[x_cols], train[y_cols])

0.8192771084337349

In [175]:
print(classification_report(train[y_cols], model3_preds))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       307
           1       0.81      0.69      0.74       191

    accuracy                           0.82       498
   macro avg       0.82      0.79      0.80       498
weighted avg       0.82      0.82      0.82       498



#### 6. Which model performs better on your in-sample data?

So far with 3 different models, Model1 preforms the best but most likely due to no depth limit which could casue model to be over fit. Changing the depth level to 3 gave an accuracy of 82% and could be more accurate with test data

#### 7. Which model performs best on your out-of-sample data, the validate set?

In [177]:
x_val, y_val = validate[x_cols], validate[y_cols]

In [184]:
print(f'''
Performance in accuracy of Decision Tree 1 on training data:
Accuracy (train): {clf.score(train[x_cols], train[y_cols])}
Accuracy (validate): {round(clf.score(x_val, y_val), 2)}''')
        


Performance in accuracy of Decision Tree 1 on training data:
Accuracy (train): 1.0
Accuracy (validate): 0.75


In [185]:
print(f'''
Performance in accuracy of Decision Tree 1 on training data:
Accuracy (train): {clf2.score(train[x_cols], train[y_cols])}
Accuracy (validate): {round(clf2.score(x_val, y_val), 2)}''')


Performance in accuracy of Decision Tree 1 on training data:
Accuracy (train): 0.7911646586345381
Accuracy (validate): 0.8


In [186]:
print(f'''
Performance in accuracy of Decision Tree 1 on training data:
Accuracy (train): {clf3.score(train[x_cols], train[y_cols])}
Accuracy (validate): {round(clf3.score(x_val, y_val), 2)}''')


Performance in accuracy of Decision Tree 1 on training data:
Accuracy (train): 0.8192771084337349
Accuracy (validate): 0.84


Model 3 still works the best with a step up from 82% to 84% accuracy 
