In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from pydataset import data

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

from acquire import get_titanic_data
from prepare_pro import generic_split

import graphviz
from graphviz import Graph

import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Using the titanic data, in your classification-exercises repository,
# create a notebook, model.ipynb where you will do the following:
titanic = get_titanic_data()
titanic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   age           714 non-null    float64
 5   sibsp         891 non-null    int64  
 6   parch         891 non-null    int64  
 7   fare          891 non-null    float64
 8   embarked      889 non-null    object 
 9   class         891 non-null    object 
 10  deck          203 non-null    object 
 11  embark_town   889 non-null    object 
 12  alone         891 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [3]:
titanic = titanic[titanic.age.isna() == False]

In [4]:
titanic['is_male'] = titanic.sex == 'male'
dummy_df = pd.get_dummies(titanic[['class','embark_town']])
dummy_df

Unnamed: 0,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,0,1,0,0,1
1,1,0,0,1,0,0
2,0,0,1,0,0,1
3,1,0,0,0,0,1
4,0,0,1,0,0,1
...,...,...,...,...,...,...
885,0,0,1,0,1,0
886,0,1,0,0,0,1
887,1,0,0,0,0,1
889,1,0,0,1,0,0


In [5]:
titanic = pd.concat([titanic, dummy_df],axis=1)

In [6]:
titanic.drop(columns=['passenger_id','pclass','embarked','deck','sex','embark_town','class'],inplace=True)
titanic.head(1)

Unnamed: 0,survived,age,sibsp,parch,fare,alone,is_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,22.0,1,0,7.25,0,True,0,0,1,0,0,1


In [7]:
titanic.survived.value_counts()

0    424
1    290
Name: survived, dtype: int64

In [8]:
#1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for
# a classification problem is predicting the most prevelant class in the training dataset (the mode). 
# When you make those predictions, what is your accuracy? This is your baseline accuracy.
titanic['baseline_prediction'] = 0
titanic.head(1) #baseline prediction is 0 == did not survive

Unnamed: 0,survived,age,sibsp,parch,fare,alone,is_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,baseline_prediction
0,0,22.0,1,0,7.25,0,True,0,0,1,0,0,1,0


In [9]:
train, validate, test = generic_split(titanic, stratify_by='survived')

In [10]:
baseline_accuracy = (train.survived == train.baseline_prediction).mean()
baseline_accuracy

0.5939849624060151

In [11]:
#2. Fit the decision tree classifier to your training sample and transform 
# (i.e. make predictions on the training sample)
x_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

x_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

x_test = test.drop(columns=['survived'])
y_test = test.survived

In [12]:
clf = DecisionTreeClassifier(max_depth=2, random_state=123)
clf = clf.fit(x_train,y_train)

In [13]:
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('Titanic_decision_tree', view=True, format="pdf")

'Titanic_decision_tree.pdf'

In [14]:
y_pred = clf.predict(x_train)
y_pred[0:3]

array([0, 0, 1])

In [15]:
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:3]

array([[0.825     , 0.175     ],
       [0.58      , 0.42      ],
       [0.06315789, 0.93684211]])

In [16]:
confusion_matrix(y_train, y_pred,
                 labels = [0, 1])

array([[227,  10],
       [ 63,  99]])

In [17]:
#3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(x_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [18]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.96      0.86       237
           1       0.91      0.61      0.73       162

    accuracy                           0.82       399
   macro avg       0.85      0.78      0.80       399
weighted avg       0.83      0.82      0.81       399



In [19]:
#4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, 
# false negative rate, precision, recall, f1-score, and support.
print("Model 1")
pd.DataFrame(classification_report(y_train, y_pred,output_dict=True)).T

Model 1


Unnamed: 0,precision,recall,f1-score,support
0,0.782759,0.957806,0.86148,237.0
1,0.908257,0.611111,0.730627,162.0
accuracy,0.817043,0.817043,0.817043,0.817043
macro avg,0.845508,0.784459,0.796054,399.0
weighted avg,0.833713,0.817043,0.808352,399.0


In [20]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(x_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.76


In [21]:
#5. Run through steps 2-4 using a different max_depth value.
clf_difdepth = DecisionTreeClassifier(max_depth=4, random_state=123)
clf_difdepth = clf_difdepth.fit(x_train,y_train)

In [22]:
dot_data2 = export_graphviz(clf_difdepth, feature_names= x_train.columns, rounded=True, filled=True, out_file=None, class_names=['did_not_survive','survived'])
graph2 = graphviz.Source(dot_data2) 

graph2.render('Titanic_decision_tree', view=True, format="pdf")

'Titanic_decision_tree.pdf'

In [23]:
y_pred2 = clf_difdepth.predict(x_train)

In [24]:
confusion_matrix(y_train, y_pred2,
                 labels = [0, 1])

array([[231,   6],
       [ 52, 110]])

In [25]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf_difdepth.score(x_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.85


In [26]:
print("Model 2")
pd.DataFrame(classification_report(y_train, y_pred2,output_dict=True)).T

Model 2


Unnamed: 0,precision,recall,f1-score,support
0,0.816254,0.974684,0.888462,237.0
1,0.948276,0.679012,0.791367,162.0
accuracy,0.854637,0.854637,0.854637,0.854637
macro avg,0.882265,0.826848,0.839914,399.0
weighted avg,0.869857,0.854637,0.84904,399.0


In [27]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf_difdepth.score(x_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.78


In [28]:
#6. Which model performs better on your in-sample data? 
# model 2 has higher scores overall

print("Model 1")
pd.DataFrame(classification_report(y_train, y_pred,output_dict=True)).T

Model 1


Unnamed: 0,precision,recall,f1-score,support
0,0.782759,0.957806,0.86148,237.0
1,0.908257,0.611111,0.730627,162.0
accuracy,0.817043,0.817043,0.817043,0.817043
macro avg,0.845508,0.784459,0.796054,399.0
weighted avg,0.833713,0.817043,0.808352,399.0


In [29]:
print("Model 2")
pd.DataFrame(classification_report(y_train, y_pred2,output_dict=True)).T

Model 2


Unnamed: 0,precision,recall,f1-score,support
0,0.816254,0.974684,0.888462,237.0
1,0.948276,0.679012,0.791367,162.0
accuracy,0.854637,0.854637,0.854637,0.854637
macro avg,0.882265,0.826848,0.839914,399.0
weighted avg,0.869857,0.854637,0.84904,399.0


In [30]:
#7. Which model performs best on your out-of-sample data, the validate set?
print('Accuracy of Decision Tree classifier (2 depth) on validate set: {:.2f}'
     .format(clf.score(x_validate, y_validate)))


Accuracy of Decision Tree classifier (2 depth) on validate set: 0.76


In [31]:
print('Accuracy of Decision Tree classifier (4 depth) on validate set: {:.2f}'
     .format(clf_difdepth.score(x_validate, y_validate)))

Accuracy of Decision Tree classifier (4 depth) on validate set: 0.78


In [32]:
# model 2 has higher accuracy on the validate set

In [33]:
y_pred_validate = clf.predict(x_validate)
y_pred_validate2 = clf_difdepth.predict(x_validate)

In [34]:
print("Model 1 validate")
pd.DataFrame(classification_report(y_validate, y_pred_validate,output_dict=True)).T

Model 1 validate


Unnamed: 0,precision,recall,f1-score,support
0,0.744,0.911765,0.819383,102.0
1,0.808511,0.542857,0.649573,70.0
accuracy,0.761628,0.761628,0.761628,0.761628
macro avg,0.776255,0.727311,0.734478,172.0
weighted avg,0.770254,0.761628,0.750274,172.0


In [35]:
print("Model 2 validate")
pd.DataFrame(classification_report(y_validate, y_pred_validate2,output_dict=True)).T

Model 2 validate


Unnamed: 0,precision,recall,f1-score,support
0,0.75,0.941176,0.834783,102.0
1,0.863636,0.542857,0.666667,70.0
accuracy,0.77907,0.77907,0.77907,0.77907
macro avg,0.806818,0.742017,0.750725,172.0
weighted avg,0.796247,0.77907,0.766363,172.0


In [36]:
confusion_matrix(y_validate, y_pred_validate,
                 labels = [0, 1])

array([[93,  9],
       [32, 38]])

In [37]:
confusion_matrix(y_validate, y_pred_validate2,
                 labels = [0, 1])

array([[96,  6],
       [32, 38]])

In [38]:
# model 2 reduced false positive rate

In [53]:
#1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) 
# setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [54]:
rf = rf.fit(x_train, y_train)

In [55]:
print(rf.feature_importances_)

[0.25746175 0.04075686 0.04345531 0.23247315 0.01531672 0.25673994
 0.02867526 0.0194732  0.07377139 0.01385568 0.00612958 0.01189114
 0.        ]


In [56]:
y_pred = rf.predict(x_train)

In [57]:
#2. Evaluate your results using the model score, confusion matrix, and classification report.

print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(x_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [58]:
print(confusion_matrix(y_train, y_pred))

[[236   1]
 [ 12 150]]


In [59]:
print("Model 1 Random Forest")
pd.DataFrame(classification_report(y_train, y_pred,output_dict=True)).T

Model 1 Random Forest


Unnamed: 0,precision,recall,f1-score,support
0,0.951613,0.995781,0.973196,237.0
1,0.993377,0.925926,0.958466,162.0
accuracy,0.967419,0.967419,0.967419,0.967419
macro avg,0.972495,0.960853,0.965831,399.0
weighted avg,0.96857,0.967419,0.967216,399.0


In [69]:
#3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, 
# false negative rate, precision, recall, f1-score, and support.
report1 = pd.DataFrame(classification_report(y_train, y_pred,output_dict=True)).T


In [85]:
print(f'Accuracy: {round(report1.precision.accuracy,3)}')
print(f'True Positive Rate: {round(report1.recall[1],3)}')
print(f'False Positive Rate: {round(1 - report1.recall[1],3)}')
print(f'True Negative Rate: {round(report1.recall[0],3)}')
print(f'False Negative Rate: {round(1 - report1.recall[0],3)}')

Accuracy: 0.967
True Positive Rate: 0.926
False Positive Rate: 0.074
True Negative Rate: 0.996
False Negative Rate: 0.004


In [None]:
#4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.


In [None]:
#5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?


In [None]:
# After making a few models, which one has the best performance (or closest metrics) on both train and validate?
