In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from pydataset import data

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

from acquire import get_titanic_data
from prepare_pro import generic_split

import graphviz
from graphviz import Graph

In [2]:
# Using the titanic data, in your classification-exercises repository,
# create a notebook, model.ipynb where you will do the following:
#
# Load the raw passenger table through the project-local `acquire` helper and
# print its column names, non-null counts and dtypes to see which fields
# contain missing values.
titanic = get_titanic_data()
titanic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   age           714 non-null    float64
 5   sibsp         891 non-null    int64  
 6   parch         891 non-null    int64  
 7   fare          891 non-null    float64
 8   embarked      889 non-null    object 
 9   class         891 non-null    object 
 10  deck          203 non-null    object 
 11  embark_town   889 non-null    object 
 12  alone         891 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [3]:
# Keep only passengers with a recorded age. The explicit .copy() makes the
# result an independent frame, so adding columns to it in later cells does not
# trigger pandas' SettingWithCopyWarning (or silently write to a temporary).
titanic = titanic[titanic.age.notna()].copy()

In [4]:
# Encode sex as a boolean flag, then one-hot encode the two remaining
# categorical columns into their own indicator frame.
titanic['is_male'] = titanic['sex'].eq('male')
dummy_df = pd.get_dummies(titanic.loc[:, ['class', 'embark_town']])
dummy_df

Unnamed: 0,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,0,1,0,0,1
1,1,0,0,1,0,0
2,0,0,1,0,0,1
3,1,0,0,0,0,1
4,0,0,1,0,0,1
...,...,...,...,...,...,...
885,0,0,1,0,1,0
886,0,1,0,0,0,1
887,1,0,0,0,0,1
889,1,0,0,1,0,0


In [5]:
# Append the indicator columns side by side with the original columns
# (rows are aligned on the shared index).
titanic = pd.concat(objs=[titanic, dummy_df], axis='columns')

In [6]:
# Remove identifier columns and the raw categoricals that have been replaced by
# `is_male` and the indicator columns. Reassigning instead of using
# inplace=True keeps the step explicit about which frame it produces.
titanic = titanic.drop(
    columns=['passenger_id', 'pclass', 'embarked', 'deck', 'sex', 'embark_town', 'class']
)
titanic.head(1)

Unnamed: 0,survived,age,sibsp,parch,fare,alone,is_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,22.0,1,0,7.25,0,True,0,0,1,0,0,1


In [7]:
# Class balance of the target: how many passengers fall in each survival class.
titanic['survived'].value_counts()

0    424
1    290
Name: survived, dtype: int64

In [8]:
#1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for
# a classification problem is predicting the most prevalent class in the training dataset (the mode).
# When you make those predictions, what is your accuracy? This is your baseline accuracy.
#
# Derive the majority class from the data rather than hard-coding it, so the
# baseline stays correct if the rows kept upstream ever change.
# NOTE: the mode is taken over the full frame because the train/validate/test
# split happens in a later cell.
baseline_class = titanic.survived.mode()[0]
titanic['baseline_prediction'] = baseline_class
titanic.head(1) #baseline prediction is 0 == did not survive

Unnamed: 0,survived,age,sibsp,parch,fare,alone,is_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,baseline_prediction
0,0,22.0,1,0,7.25,0,True,0,0,1,0,0,1,0


In [9]:
# Split the prepared frame into train / validate / test using the project-local
# `prepare_pro.generic_split` helper, stratifying on the target column.
# NOTE(review): the split proportions and random seed live inside generic_split
# and are not visible here — confirm the seed is fixed so results reproduce.
train, validate, test = generic_split(titanic, stratify_by='survived')

In [10]:
# Baseline accuracy: the share of training rows whose label equals the
# constant baseline prediction.
baseline_accuracy = train['survived'].eq(train['baseline_prediction']).mean()
baseline_accuracy

0.5939849624060151

In [11]:
#2. Fit the decision tree classifier to your training sample and transform 
# (i.e. make predictions on the training sample)
#
# Columns that must never be used as model inputs: the target itself and the
# constant baseline prediction column that was added for the baseline-accuracy
# calculation.
non_feature_columns = ['survived', 'baseline_prediction']

x_train = train.drop(columns=non_feature_columns)
y_train = train.survived # labeled data == supervise algorithm

x_validate = validate.drop(columns=non_feature_columns)
y_validate = validate.survived

x_test = test.drop(columns=non_feature_columns)
y_test = test.survived

In [12]:
# Shallow tree (depth 2) with a fixed seed; fit() returns the fitted estimator
# itself, so construction and fitting can be chained.
clf = DecisionTreeClassifier(max_depth=2, random_state=123).fit(x_train, y_train)

In [13]:
# Export the fitted tree to DOT source and render it as a PDF.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index.
dot_data = export_graphviz(
    clf,
    feature_names=list(x_train.columns),
    rounded=True,
    filled=True,
    out_file=None,  # return the DOT source as a string instead of writing a file
)
graph = graphviz.Source(dot_data)

# view=True also opens the rendered PDF in the system's default viewer.
graph.render('Titanic_decision_tree', view=True, format="pdf")

'Titanic_decision_tree.pdf'

In [19]:
# In-sample class predictions; peek at the first three.
y_pred = clf.predict(X=x_train)
y_pred[:3]

array([0, 0, 1])

In [20]:
# In-sample class probabilities (one column per class); peek at the first three rows.
y_pred_proba = clf.predict_proba(X=x_train)
y_pred_proba[:3]

array([[0.825     , 0.175     ],
       [0.58      , 0.42      ],
       [0.06315789, 0.93684211]])

In [23]:
# Rows are actual classes, columns are predicted classes, both ordered [0, 1].
confusion_matrix(y_true=y_train, y_pred=y_pred, labels=[0, 1])

array([[227,  10],
       [ 63,  99]])

In [21]:
#3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.
# clf.score returns mean accuracy on the given data; report it to two decimals.
print(
    'Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
        clf.score(X=x_train, y=y_train)
    )
)

Accuracy of Decision Tree classifier on training set: 0.82


In [22]:
# Per-class precision / recall / f1 / support for the in-sample predictions.
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.78      0.96      0.86       237
           1       0.91      0.61      0.73       162

    accuracy                           0.82       399
   macro avg       0.85      0.78      0.80       399
weighted avg       0.83      0.82      0.81       399



In [25]:
#4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, 
# false negative rate, precision, recall, f1-score, and support.
print("Model 1")

# With labels=[0, 1] the flattened 2x2 confusion matrix is ordered
# (tn, fp, fn, tp), where "positive" means survived == 1.
tn, fp, fn, tp = confusion_matrix(y_train, y_pred, labels=[0, 1]).ravel()

# The four rates are not part of classification_report, so derive them here.
rates = pd.Series({
    'accuracy': (tp + tn) / (tp + tn + fp + fn),
    'true positive rate': tp / (tp + fn),
    'false positive rate': fp / (fp + tn),
    'true negative rate': tn / (tn + fp),
    'false negative rate': fn / (fn + tp),
})
print(rates.to_string())

# Precision, recall, f1-score and support per class, as a table.
pd.DataFrame(classification_report(y_train, y_pred,output_dict=True)).T

Model 1


Unnamed: 0,precision,recall,f1-score,support
0,0.782759,0.957806,0.86148,237.0
1,0.908257,0.611111,0.730627,162.0
accuracy,0.817043,0.817043,0.817043,0.817043
macro avg,0.845508,0.784459,0.796054,399.0
weighted avg,0.833713,0.817043,0.808352,399.0


In [28]:
# Out-of-sample check: mean accuracy of the same fitted tree on the validate split.
print(
    'Accuracy of Decision Tree classifier on validate set: {:.2f}'.format(
        clf.score(X=x_validate, y=y_validate)
    )
)

Accuracy of Decision Tree classifier on validate set: 0.76


In [16]:
#5. Run through steps 2-4 using a different max_depth value.



In [17]:
#6. Which model performs better on your in-sample data?



In [18]:
#7. Which model performs best on your out-of-sample data, the validate set?

