In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import acquire
import prepare
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import graphviz

from graphviz import Graph

In [164]:
train, test, validate = prepare.prep_titanic()
train, test, validate = prepare.impute(train, test, validate, my_strategy = 'mean', column_list = 'age')


In [165]:
train = train.replace({'class':{'First':'1', 'Second': '2', 'Third': '3'}})
validate = validate.replace({'class':{'First':'1', 'Second': '2', 'Third': '3'}})
test = test.replace({'class':{'First':'1', 'Second': '2', 'Third': '3'}})

In [168]:
train
validate
test

Unnamed: 0,survived,age,sibsp,parch,fare,class,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
519,0,32.000000,0,0,7.8958,3,Southampton,1,0,1,1
330,1,28.450141,2,0,23.2500,3,Queenstown,0,1,0,0
381,1,1.000000,0,2,15.7417,3,Cherbourg,0,0,0,0
234,0,24.000000,0,0,10.5000,2,Southampton,1,0,1,1
736,0,48.000000,1,3,34.3750,3,Southampton,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
738,0,28.450141,0,0,7.8958,3,Southampton,1,0,1,1
272,1,41.000000,0,1,19.5000,2,Southampton,0,0,1,0
122,0,32.500000,1,0,30.0708,2,Cherbourg,0,0,0,1
291,1,19.000000,1,0,91.0792,1,Cherbourg,0,0,0,0


In [62]:
train.survived.value_counts()

0    351
1    217
Name: survived, dtype: int64

## Model 1
#### with max_dept = 3

In [63]:
x_train1 = train[['sex_male', 'class', 'fare', 'age']]

y_train = train.survived


In [64]:
y_train.value_counts()

0    351
1    217
Name: survived, dtype: int64

In [65]:
x_train1

Unnamed: 0,sex_male,class,fare,age
769,1,3,8.3625,32.000000
607,1,1,30.5000,27.000000
661,1,3,7.2250,40.000000
204,1,3,8.0500,18.000000
205,0,3,10.4625,2.000000
...,...,...,...,...
773,1,3,7.2250,29.794372
118,1,1,247.5208,24.000000
550,1,1,110.8833,17.000000
323,0,2,29.0000,22.000000


In [66]:
clf1 = DecisionTreeClassifier(max_depth = 3, random_state = 123)

In [67]:
clf1.fit(x_train1, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [68]:
y_pred1 = clf1.predict(x_train1)
x = pd.Series(y_pred1)
x.value_counts()

0    445
1    123
dtype: int64

In [69]:
accuracy1 = clf1.score(x_train1, y_train)
accuracy1

0.8133802816901409

In [70]:
print(classification_report(y_train, y_pred1))

              precision    recall  f1-score   support

           0       0.78      0.98      0.87       351
           1       0.95      0.54      0.69       217

    accuracy                           0.81       568
   macro avg       0.86      0.76      0.78       568
weighted avg       0.84      0.81      0.80       568



In [102]:
confusion_matrix(y_train, y_pred1)

array([[345,   6],
       [100, 117]])

In [117]:
# calculating true_negative_rate/specificity
TP = 117
TN = 345
FP= 6
FN = 100
true_neg_rate = TN/(TN + FP) * 100
true_neg_rate

98.29059829059828

In [118]:
# Calculating false negative rate
false_neg_rate1 = FN/(FP+TN) * 100
false_neg_rate1

28.49002849002849

In [119]:
# Calculating false positive rate
false_pos_rate1 = FP/(TP +FN) * 100
false_pos_rate1

2.7649769585253456

In [87]:
# Precision score
precision = precision_score(y_train, y_pred1, pos_label = 1)
precision 

0.9512195121951219

In [84]:
# Recall Score
true_post_rate = recall = recall_score(y_train, y_pred1, pos_label = 1)
true_post_rate

0.5391705069124424

In [111]:
# Calculating F1 score
f1_1 = f1_score(y_train, y_pred1, pos_label = 1)
f1_1

0.6882352941176471

In [126]:
# Lets generate the viz using the graphviz

data = export_graphviz(clf1, out_file = None)
graph = graphviz.Source(data)
graph.render('Decision Tree Model 1 For Titanic', view = True)

'Decision Tree Model 1 For Titanic.pdf'

## Model 2
#### with max_depth = 4

In [90]:
x_train2 = train[['sex_male', 'class', 'fare', 'age']]

y_train = train.survived



In [91]:
clf2 = DecisionTreeClassifier(max_depth = 4, random_state = 124)

In [92]:
clf2.fit(x_train2, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=124, splitter='best')

In [93]:
y_pred2 = clf2.predict(x_train2)

In [94]:
clf2.score(x_train2, y_train)

0.823943661971831

In [95]:
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       351
           1       0.81      0.71      0.75       217

    accuracy                           0.82       568
   macro avg       0.82      0.80      0.81       568
weighted avg       0.82      0.82      0.82       568



In [96]:
# column_label = y_train.not_survived.unique()
# confusion2= pd.DataFrame(confusion_matrix(y_train, y_pred2), index = column_label, columns = column_label)

In [97]:
confusion_matrix(y_train, y_pred2)

array([[315,  36],
       [ 64, 153]])

In [98]:
# Precision Score
precision = precision_score(y_train, y_pred2, pos_label = 1)
precision 

0.8095238095238095

In [121]:
# Recall Score
true_post_rate = recall = recall_score(y_train, y_pred2, pos_label = 1)
print(f'The true positive rate is : {true_post_rate}')

The true positive rate is : 0.7050691244239631


In [122]:
# calculating true_negative_rate/specificity
TP = 153
TN = 315
FP = 36
FN = 64
true_neg_rate = TN/(TN+FP)* 100
true_neg_rate
print(f'The true negative rate is : {true_neg_rate}')

The true negative rate is : 89.74358974358975


In [123]:
# Calculating F1 score
f1_2 = f1_score(y_train, y_pred2, pos_label = 1)
print(f'The f1 socre is: {f1_2}')

The f1 socre is: 0.7536945812807881


In [124]:
# Calculating False Negative Rate
TP = 153
TN = 315
FP = 36
FN = 64

false_neg_rate2 = FN/(FP+TN) * 100
print(f'The false negative rate is: {false_neg_rate2}')

The false negative rate is: 18.233618233618234


In [125]:
# False Positive Rate
TP = 153
TN = 315
FP = 36
FN = 64
false_pos_rate2 = FP/(TP +FN) * 100
print(f'The false positive rate is: {false_pos_rate2}')

The false positive rate is: 16.589861751152075


In [127]:
# Creating vizualization of the Tree

data2 = export_graphviz(clf2, out_file = None)
graph = graphviz.Source(data2)
graph.render('Decision Tree Model 2 For Titanic', view = True)

'Decision Tree Model 2 For Titanic.pdf'

## Model 3
#### with max_depth = 5

In [128]:
x_train3 = train[['sex_male', 'class', 'fare', 'age']]

y_train = train.survived


In [131]:
clf3 = DecisionTreeClassifier(max_depth = 5, random_state = 126)
clf3.fit(x_train3, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=126, splitter='best')

In [132]:
y_pred3 = clf3.predict(x_train3)

In [159]:
# Calculating the accuracy of the model

print(f'The accuracy of model 3 is: {clf3.score(x_train3, y_train)}')

The accuracy of model 3 is: 0.8415492957746479


In [135]:
print(classification_report(y_train, y_pred3))

              precision    recall  f1-score   support

           0       0.81      0.97      0.88       351
           1       0.93      0.64      0.75       217

    accuracy                           0.84       568
   macro avg       0.87      0.80      0.82       568
weighted avg       0.86      0.84      0.83       568



In [138]:
confusion_matrix(y_train, y_pred3)

array([[340,  11],
       [ 79, 138]])

In [153]:
# identifying the true pos, true neg, false pos, false neg

TP = 138
TN = 340
FP = 11
FP = 79

In [154]:
# Calculating true positive rate
true_post_rate3 = recall3 = recall_score(y_train, y_pred3, pos_label = 1)
print(f'The true positive rate is : {true_post_rate3}')

The true positive rate is : 0.6359447004608295


In [155]:
# Calculating true negative rate
true_neg_rate3 = TN/(TN+FP)* 100
true_neg_rate3
print(f'The true negative rate is : {true_neg_rate3}')

The true negative rate is : 81.14558472553699


In [156]:
# Calculating F1 score
f1_3 = f1_score(y_train, y_pred3, pos_label = 1)
print(f'The f1 socre is: {f1_2}')

The f1 socre is: 0.7536945812807881


In [157]:
# Calculating false negative rate
false_neg_rate3 = FN/(FP+TN) * 100
print(f'The false negative rate is: {false_neg_rate3}')

The false negative rate is: 15.274463007159905


In [158]:
# Calculating false positive rate
false_pos_rate3 = FP/(TP +FN) * 100
print(f'The false positive rate is: {false_pos_rate3}')

The false positive rate is: 39.10891089108911


In [160]:
data3 = export_graphviz(clf3, out_file = None)
graph = graphviz.Source(data3)
graph.render('Decision Tree Model 3 For Titanic', view = True)

'Decision Tree Model 3 For Titanic.pdf'

## We will be using all of the three models with our validate data set. The best ones will then be used with the test data set

### Validating with model 1

In [170]:
y_val = validate.survived
x_val1 = validate[['sex_male', 'class', 'fare', 'age']]

In [172]:
y_val_pred1 = clf1.predict(x_val1)

In [174]:
print(f'The accuracy of the model is : {clf1.score(x_val1, y_val)}')

The accuracy of the model is : 0.7972027972027972


In [175]:
print(classification_report(y_val, y_val_pred1))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85        88
           1       0.84      0.58      0.69        55

    accuracy                           0.80       143
   macro avg       0.81      0.76      0.77       143
weighted avg       0.80      0.80      0.79       143



### Validating with model 2

In [176]:
y_val = validate.survived
x_val2 = validate[['sex_male', 'class', 'fare', 'age']]

In [177]:
y_val_pred2 = clf2.predict(x_val2)

In [179]:
print(f'The accuracy of the model is : {clf2.score(x_val2, y_val)}')

The accuracy of the model is : 0.8391608391608392


In [180]:
print(classification_report(y_val, y_val_pred2))

              precision    recall  f1-score   support

           0       0.87      0.88      0.87        88
           1       0.80      0.78      0.79        55

    accuracy                           0.84       143
   macro avg       0.83      0.83      0.83       143
weighted avg       0.84      0.84      0.84       143



### Validating with model 3

In [181]:
y_val = validate.survived
x_val3 = validate[['sex_male', 'class', 'fare', 'age']]

In [182]:
y_val_pred3 = clf3.predict(x_val3)

In [183]:
print(f'The accuracy of the model is : {clf3.score(x_val3, y_val)}')

The accuracy of the model is : 0.8321678321678322


In [184]:
print(classification_report(y_val, y_val_pred3))

              precision    recall  f1-score   support

           0       0.82      0.93      0.87        88
           1       0.86      0.67      0.76        55

    accuracy                           0.83       143
   macro avg       0.84      0.80      0.81       143
weighted avg       0.84      0.83      0.83       143



## Out of all three models, model 2's accuracy was much better than other 2. So, we will use model 2 with the test data set

#### Testing with model 2

In [185]:
y_test = test.survived
x_test = test[['sex_male', 'class', 'fare', 'age']]

In [186]:
y_test_pred = clf2.predict(x_test)

In [187]:
print(f'The accuracy of the model is : {clf2.score(x_test, y_test)}')

The accuracy of the model is : 0.8202247191011236


In [188]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86       110
           1       0.79      0.72      0.75        68

    accuracy                           0.82       178
   macro avg       0.81      0.80      0.81       178
weighted avg       0.82      0.82      0.82       178

