In [80]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from acquire import get_titanic_data, get_iris_data
from prepare import prep_titanic
import graphviz
from graphviz import Graph
import warnings
warnings.filterwarnings('ignore')

In [81]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

## Baseline Prediction and Basic Exploration

In [82]:
titanic = get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [83]:
for col, vals in titanic.iteritems():
    print(titanic[f'{col}'].value_counts())

0      1
598    1
587    1
588    1
589    1
      ..
300    1
301    1
302    1
303    1
890    1
Name: passenger_id, Length: 891, dtype: int64
0    549
1    342
Name: survived, dtype: int64
3    491
1    216
2    184
Name: pclass, dtype: int64
male      577
female    314
Name: sex, dtype: int64
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: age, Length: 88, dtype: int64
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: sibsp, dtype: int64
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: parch, dtype: int64
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
35.0000     1
28.5000     1
6.2375      1
14.0000     1
10.5167     1
Name: fare, Length: 248, dtype: int64
S    644
C    168
Q     77
Name: embarked, dtype: int64
Third     491
First     216
Second    184
Name: class, dtype: int64
C    59
B    47
D    33
E    32
A    15


In [84]:
titanic['baseline'] = 0
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,baseline
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,0
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,0


In [85]:
(titanic.survived == titanic.baseline).mean()

0.6161616161616161

Baseline estimate: ~61.62%

## Modeling

In [86]:
train, validate, test = prep_titanic(get_titanic_data())
train.shape, validate.shape, test.shape

((497, 14), (214, 14), (178, 14))

In [87]:
train.drop(columns=['sex'], inplace=True)
validate.drop(columns=['sex'], inplace=True)
test.drop(columns=['sex'], inplace=True)

In [88]:
train.head(15)

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,class,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
583,583,0,36.0,0,0,40.125,1,1,0,1,1,0,0
337,337,1,41.0,0,0,134.5,1,1,1,0,1,0,0
50,50,0,7.0,4,1,39.6875,3,0,0,1,0,0,1
218,218,1,32.0,0,0,76.2917,1,1,1,0,1,0,0
31,31,1,29.916875,1,0,146.5208,1,0,1,0,1,0,0
308,308,0,30.0,1,0,24.0,2,0,0,1,1,0,0
314,314,0,43.0,1,1,26.25,2,0,0,1,0,0,1
883,883,0,28.0,0,0,10.5,2,1,0,1,0,0,1
459,459,0,29.916875,0,0,7.75,3,1,0,1,0,1,0
180,180,0,29.916875,8,2,69.55,3,0,1,0,0,0,1


In [89]:
X_train, y_train = train.drop(columns='survived'), train.survived
X_validate, y_validate = validate.drop(columns='survived'), validate.survived
X_test, y_test = test.drop(columns='survived'), test.survived

In [90]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)
clf = clf.fit(X_train, y_train)

In [91]:
dot_data = export_graphviz(clf, feature_names= X_train.columns,class_names=['died','lived'], rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [92]:
accuracy = round(clf.score(X_train, y_train), 2)
print(f'Training accuracy: {accuracy}')

Training accuracy: 0.82


In [93]:
y_pred = clf.predict(X_train)

In [94]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.51666667, 0.48333333],
       [0.0326087 , 0.9673913 ],
       [0.88      , 0.12      ],
       [0.0326087 , 0.9673913 ],
       [0.0326087 , 0.9673913 ],
       [0.88      , 0.12      ],
       [0.88      , 0.12      ],
       [0.88      , 0.12      ],
       [0.88      , 0.12      ],
       [0.93333333, 0.06666667],
       [0.93333333, 0.06666667],
       [0.92857143, 0.07142857],
       [0.0326087 , 0.9673913 ],
       [0.51666667, 0.48333333],
       [0.88      , 0.12      ],
       [0.92857143, 0.07142857],
       [0.88      , 0.12      ],
       [0.51666667, 0.48333333],
       [0.88      , 0.12      ],
       [0.4137931 , 0.5862069 ],
       [0.0326087 , 0.9673913 ],
       [0.88      , 0.12      ],
       [0.0326087 , 0.9673913 ],
       [0.4137931 , 0.5862069 ],
       [0.51666667, 0.48333333],
       [0.88      , 0.12      ],
       [0.0326087 , 0.9673913 ],
       [0.0326087 , 0.9673913 ],
       [0.51666667, 0.48333333],
       [0.88      , 0.12      ],
       [0.

In [95]:
train['prediction'] = clf.predict(X_train)
train[['prediction', 'survived']]

Unnamed: 0,prediction,survived
583,0,0
337,1,1
50,0,0
218,1,1
31,1,1
...,...,...
313,0,0
636,0,0
222,0,0
485,0,0


In [96]:
confusion1 = confusion_matrix(y_train, y_pred)
con_df = pd.DataFrame(confusion_matrix(y_train, y_pred))
con_df.rename(columns = {0: 'died', 1: 'lived'}, index = {0: 'died', 1: 'lived'})

Unnamed: 0,died,lived
died,279,28
lived,62,128


## Compute:

In [97]:
print(classification_report(train.survived, train.prediction, zero_division=True))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg       0.82      0.79      0.80       497
weighted avg       0.82      0.82      0.81       497



In [98]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [99]:
TP = confusion1[0,0]
TN = confusion1[1,1]
FP = confusion1[0,1]
FN = confusion1[1,0]

AR = (TP+TN)/(TP+FP+FN+TN)
TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

print(f'Accuracy: {AR:.2%}\nTrue Positive Rate: {TPR:.2%}\nFalse Positive Rate: {FPR:.2%}\nTrue Negaitve Rate: {TNR:.2%}\nFalse Negative Rate: {FNR:.2%}')

Accuracy: 81.89%
True Positive Rate: 81.82%
False Positive Rate: 17.95%
True Negaitve Rate: 82.05%
False Negative Rate: 18.18%


#### Analysis: 
The model performs better on in-sample data compared to the baseline of 61.62%

### Out-of-sample

In [100]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.79


In [101]:
y_pred = clf.predict(X_validate)
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       132
           1       0.77      0.65      0.70        82

    accuracy                           0.79       214
   macro avg       0.78      0.76      0.77       214
weighted avg       0.79      0.79      0.79       214



In [102]:
confusion2 = confusion_matrix(y_validate, y_pred)
con_df2 = pd.DataFrame(confusion_matrix(y_validate, y_pred))
con_df2.rename(columns = {0: 'died', 1: 'lived'}, index = {0: 'died', 1: 'lived'})

Unnamed: 0,died,lived
died,116,16
lived,29,53


In [103]:
TP = confusion2[0,0]
TN = confusion2[1,1]
FP = confusion2[0,1]
FN = confusion2[1,0]

AR = (TP+TN)/(TP+FP+FN+TN)
TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

print(f'Accuracy: {AR:.2%}\nTrue Positive Rate: {TPR:.2%}\nFalse Positive Rate: {FPR:.2%}\nTrue Negaitve Rate: {TNR:.2%}\nFalse Negative Rate: {FNR:.2%}')

Accuracy: 78.97%
True Positive Rate: 80.00%
False Positive Rate: 23.19%
True Negaitve Rate: 76.81%
False Negative Rate: 20.00%


#### Analysis: 
The model performs better on out-of-sample data compared to the baseline of 61.62%

Additionally, the best way to minimize over/underfitting is to keep the `max_depth` value at 3