In [51]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load the 'titanic' data set through pydataset's data() helper and preview
# the first five rows.
df = data('titanic')
df.head(n=5)

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [3]:
# Encode passenger class as an integer: '1st class' -> 1 ... '3rd class' -> 3.
class_codes = {'1st class': 1, '2nd class': 2, '3rd class': 3}
df.replace({'class': class_codes}, inplace=True)

In [4]:
# Encode the age group as an integer: 'child' -> 1, 'adults' -> 2.
age_codes = {'adults': 2, 'child': 1}
df.replace({'age': age_codes}, inplace=True)

In [5]:
# Encode sex as a binary flag: 'man' -> 0, 'women' -> 1.
sex_codes = {'man': 0, 'women': 1}
df.replace({'sex': sex_codes}, inplace=True)

In [6]:
# Class balance of the target column.
df['survived'].value_counts()

no     817
yes    499
Name: survived, dtype: int64

In [37]:
# Count missing values per column. If a column did contain nulls, it could be
# imputed with its most frequent value:
#     df[col] = df[col].fillna(df[col].mode()[0])
# Series.mode() returns a Series (there can be ties), so take its first
# element; passing the whole Series to fillna would align on index instead of
# filling every null.
df.isna().sum()

class       0
age         0
sex         0
survived    0
baseline    0
dtype: int64

###### What is your baseline prediction?

In [7]:
# Baseline model: predict the most frequent outcome for every passenger.
# The label is derived from the data rather than hard-coded, so the baseline
# stays correct if the class balance of `survived` ever changes.
# Series.mode() returns a Series; [0] picks the (first) most common label.
df['baseline'] = df['survived'].mode()[0]
df.head()

Unnamed: 0,class,age,sex,survived,baseline
1,1,2,0,yes,no
2,1,2,0,yes,no
3,1,2,0,yes,no
4,1,2,0,yes,no
5,1,2,0,yes,no


###### What is your baseline accuracy?

In [39]:
# Baseline accuracy: share of rows whose actual outcome equals the baseline
# prediction, rounded to two decimals.
baseline_hits = df['survived'] == df['baseline']
round(baseline_hits.mean(), 2)

0.62

###### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [9]:
def train_validate_test_split(df, target, seed=123, test_size=0.2, validate_size=0.3):
    '''
    Split a dataframe into train, validate and test dataframes, stratifying
    both splits on the target column.

    Parameters:
        df: the dataframe to split.
        target: name of the target column (used for stratification).
        seed: integer passed as random_state to both splits.
        test_size: fraction of df held out as test (default 0.2).
        validate_size: fraction of the remaining non-test rows held out as
            validate (default 0.3).

    With the defaults, test is 20% of the original dataset, validate is
    .30*.80= 24% of the original dataset, and train is .70*.80= 56% of the
    original dataset.

    Returns, in this order, the train, validate and test dataframes.
    '''
    # First carve off the test set from the full data.
    train_validate, test = train_test_split(df, test_size=test_size,
                                            random_state=seed,
                                            stratify=df[target])
    # Then split what is left into train and validate.
    train, validate = train_test_split(train_validate, test_size=validate_size,
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [10]:
# Split into train / validate / test, stratified on the outcome column.
splits = train_validate_test_split(df, target='survived', seed=123)
train, validate, test = splits

In [11]:
# Separate features (X) from the target (y) in each split. 'baseline' holds
# the baseline prediction, not a feature, so it is dropped with the target.
non_feature_cols = ['survived', 'baseline']

X_train, y_train = train.drop(columns=non_feature_cols), train['survived']

X_validate, y_validate = validate.drop(columns=non_feature_cols), validate['survived']

X_test, y_test = test.drop(columns=non_feature_cols), test['survived']

In [12]:
# Decision tree limited to max_depth=3, with a fixed random_state.
tree_params = {'max_depth': 3, 'random_state': 123}
clf = DecisionTreeClassifier(**tree_params)

In [13]:
# Fit the tree on the training split only.
clf = clf.fit(X=X_train, y=y_train)

In [14]:
import graphviz
from graphviz import Graph

# Export the fitted tree as DOT text (out_file=None returns it as a string)
# and wrap it in a graphviz Source so it can be rendered.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)

In [15]:
# Render the tree to disk and open it in the default viewer (view=True).
graph.render(filename='titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

###### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [16]:
# In-sample predictions; show the first five.
y_pred = clf.predict(X_train)
y_pred[:5]

array(['no', 'no', 'no', 'no', 'no'], dtype=object)

In [17]:
# Mean accuracy of the tree on the training split.
train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.78


In [18]:
# Confusion matrix of actual training labels against the tree's predictions.
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[443,  14],
       [145, 134]])

In [19]:
# Actual class counts in the training split, for comparison with the
# confusion matrix.
y_train.value_counts(sort=True)

no     457
yes    279
Name: survived, dtype: int64

In [20]:
# Wrap the confusion matrix in a DataFrame labelled with the sorted class
# names so it is readable.
labels = sorted(y_train.unique())
train_cm = confusion_matrix(y_train, y_pred)

pd.DataFrame(train_cm, index=labels, columns=labels)

Unnamed: 0,no,yes
no,443,14
yes,145,134


Create a classification report

Precision: of all the observations you predicted as positive, the share that are actually positive. The higher this number is, the fewer false positives you made. If this is a low score, you predicted a lot of positives where there were none. 
TP/(TP+FP)

Recall: if this score is high, you didn’t miss a lot of positives. But as it gets lower, you are not predicting the positives that are actually there. 
TP/(TP+FN)

f1-score: The balanced harmonic mean of Recall and Precision, giving both metrics equal weight. The higher the F-Measure is, the better. 

Support: the number of actual occurrences of each class in y_true.

In [21]:
# Per-class precision / recall / f1 / support on the training split.
train_report = classification_report(y_train, y_pred)
print(train_report)

              precision    recall  f1-score   support

          no       0.75      0.97      0.85       457
         yes       0.91      0.48      0.63       279

    accuracy                           0.78       736
   macro avg       0.83      0.72      0.74       736
weighted avg       0.81      0.78      0.76       736



###### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [22]:
# Mean accuracy of the tree on the validate split (out-of-sample).
validate_accuracy = clf.score(X_validate, y_validate)
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(validate_accuracy))

Accuracy of Decision Tree classifier on validate set: 0.81


In [23]:
# Predict on the validate features. Note this rebinds y_pred, which
# previously held the training predictions.
y_pred = clf.predict(X_validate)

# Score those predictions against the actual validate labels.
validate_report = classification_report(y_validate, y_pred)
print(validate_report)

              precision    recall  f1-score   support

          no       0.77      0.98      0.87       196
         yes       0.95      0.53      0.68       120

    accuracy                           0.81       316
   macro avg       0.86      0.75      0.77       316
weighted avg       0.84      0.81      0.79       316



###### Run through steps 2-4 using a different max_depth value.



In [24]:
# Repeat the fit with a shallower tree (max_depth=2) for comparison with the
# max_depth=3 model.
clf = DecisionTreeClassifier(max_depth=2, random_state=123)

clf = clf.fit(X_train, y_train)

# Export the fitted tree as DOT text and wrap it for rendering.
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data)

# Render under a depth-specific name: reusing 'titanic_decision_tree' would
# silently overwrite the PDF of the max_depth=3 tree rendered earlier.
graph.render('titanic_decision_tree_max_depth_2', view=True)

'titanic_decision_tree.pdf'

In [25]:
# In-sample predictions from the max_depth=2 tree; show the first five.
y_pred = clf.predict(X_train)
y_pred[:5]

array(['no', 'no', 'no', 'no', 'no'], dtype=object)

In [26]:
# Mean accuracy of the max_depth=2 tree on the training split.
train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.78


In [27]:
# Confusion matrix of actual training labels against the max_depth=2
# tree's predictions.
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[443,  14],
       [147, 132]])

In [28]:
# Actual class counts in the training split, for comparison with the
# confusion matrix.
y_train.value_counts(sort=True)

no     457
yes    279
Name: survived, dtype: int64

In [29]:
# Wrap the confusion matrix in a DataFrame labelled with the sorted class
# names so it is readable.
labels = sorted(y_train.unique())
train_cm = confusion_matrix(y_train, y_pred)

pd.DataFrame(train_cm, index=labels, columns=labels)

Unnamed: 0,no,yes
no,443,14
yes,147,132


In [30]:
# Per-class precision / recall / f1 / support on the training split for the
# max_depth=2 tree.
train_report = classification_report(y_train, y_pred)
print(train_report)

              precision    recall  f1-score   support

          no       0.75      0.97      0.85       457
         yes       0.90      0.47      0.62       279

    accuracy                           0.78       736
   macro avg       0.83      0.72      0.73       736
weighted avg       0.81      0.78      0.76       736



In [31]:
# Mean accuracy of the max_depth=2 tree on the validate split.
validate_accuracy = clf.score(X_validate, y_validate)
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(validate_accuracy))

Accuracy of Decision Tree classifier on validate set: 0.80


In [32]:
# Predict on the validate features. Note this rebinds y_pred, which
# previously held the training predictions.
y_pred = clf.predict(X_validate)

# Score those predictions against the actual validate labels.
validate_report = classification_report(y_validate, y_pred)
print(validate_report)

              precision    recall  f1-score   support

          no       0.76      0.98      0.86       196
         yes       0.95      0.50      0.66       120

    accuracy                           0.80       316
   macro avg       0.86      0.74      0.76       316
weighted avg       0.83      0.80      0.78       316



### Random Forest

In [43]:
# Re-check the encoded frame before building the random forest.
df.head(n=5)

Unnamed: 0,class,age,sex,survived,baseline
1,1,2,0,yes,no
2,1,2,0,yes,no
3,1,2,0,yes,no
4,1,2,0,yes,no
5,1,2,0,yes,no


Train Validate Test

In [44]:
# NOTE: this re-defines the helper of the same name from the decision tree
# section above. Once this cell runs, this definition is the one in effect.
def train_validate_test_split(df, target, seed=123, test_size=0.2, validate_size=0.3):
    '''
    Split a dataframe into train, validate and test dataframes, stratifying
    both splits on the target column.

    Parameters:
        df: the dataframe to split.
        target: name of the target column (used for stratification).
        seed: integer passed as random_state to both splits.
        test_size: fraction of df held out as test (default 0.2).
        validate_size: fraction of the remaining non-test rows held out as
            validate (default 0.3).

    With the defaults, test is 20% of the original dataset, validate is
    .30*.80= 24% of the original dataset, and train is .70*.80= 56% of the
    original dataset.

    Returns, in this order, the train, validate and test dataframes.
    '''
    # First carve off the test set from the full data.
    train_validate, test = train_test_split(df, test_size=test_size,
                                            random_state=seed,
                                            stratify=df[target])
    # Then split what is left into train and validate.
    train, validate = train_test_split(train_validate, test_size=validate_size,
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [45]:
# Split into train / validate / test, stratified on the outcome column.
splits = train_validate_test_split(df, target='survived', seed=123)
train, validate, test = splits

In [54]:
# Separate features (X) from the target (y) in each split. 'baseline' holds
# the baseline prediction, not a feature, so it is dropped with the target.
non_feature_cols = ['survived', 'baseline']

X_train, y_train = train.drop(columns=non_feature_cols), train['survived']

X_validate, y_validate = validate.drop(columns=non_feature_cols), validate['survived']

X_test, y_test = test.drop(columns=non_feature_cols), test['survived']

In [55]:
# Create the object: 100 bootstrapped gini trees, each capped at depth 3 with
# at least 3 samples per leaf, and a fixed seed.
rf_params = dict(
    bootstrap=True,
    class_weight=None,
    criterion='gini',
    min_samples_leaf=3,
    n_estimators=100,
    max_depth=3,
    random_state=123,
)
rf = RandomForestClassifier(**rf_params)

In [56]:
# Fit the model on the training split. The stray pasted
# `RandomForestClassifier(...)` line that followed was removed: it built and
# immediately discarded a second, unfitted estimator.
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=123)

In [57]:
# Feature Importance: one weight per column of X_train, in column order.
importances = rf.feature_importances_
print(importances)

[0.3647712  0.02423182 0.61099698]


In [58]:
# Make Predictions (in-sample, on the training features)
y_pred = rf.predict(X=X_train)

In [59]:
# Estimate Probability: per-class probabilities for each training row
y_pred_proba = rf.predict_proba(X=X_train)

In [60]:
# Compute the Accuracy on the training split
rf_train_accuracy = rf.score(X_train, y_train)
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf_train_accuracy))

Accuracy of random forest classifier on training set: 0.79


In [61]:
# Create a confusion matrix of actual training labels against predictions
rf_train_cm = confusion_matrix(y_train, y_pred)
print(rf_train_cm)

[[443  14]
 [140 139]]


In [62]:
# Create a classification report for the training split
rf_train_report = classification_report(y_train, y_pred)
print(rf_train_report)

              precision    recall  f1-score   support

          no       0.76      0.97      0.85       457
         yes       0.91      0.50      0.64       279

    accuracy                           0.79       736
   macro avg       0.83      0.73      0.75       736
weighted avg       0.82      0.79      0.77       736



In [63]:
# Validate Model
# Evaluate on out-of-sample data. The score is computed on the validate split,
# so the message says "validate set" (it previously said "test set").
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.82


In [64]:
# Run through steps increasing min_samples_leaf and decreasing max_depth
# (min_samples_leaf 3 -> 5, max_depth 3 -> 1); other settings are unchanged.
rf_params = dict(
    bootstrap=True,
    class_weight=None,
    criterion='gini',
    min_samples_leaf=5,
    n_estimators=100,
    max_depth=1,
    random_state=123,
)
rf = RandomForestClassifier(**rf_params)

In [65]:
# Fit the model. The stray pasted
# `RandomForestClassifier(max_depth=3, min_samples_leaf=3, ...)` line was
# removed: it built and discarded an unused estimator, and its parameters did
# not match this model (max_depth=1, min_samples_leaf=5).
rf.fit(X_train, y_train)
# Make Predictions
y_pred = rf.predict(X_train)
# Estimate Probability
y_pred_proba = rf.predict_proba(X_train)
# Compute the Accuracy
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.73


In [66]:
# Create a confusion matrix of actual training labels against predictions
rf_train_cm = confusion_matrix(y_train, y_pred)
print(rf_train_cm)

[[455   2]
 [194  85]]


In [67]:
# Create a classification report for the training split
rf_train_report = classification_report(y_train, y_pred)
print(rf_train_report)

              precision    recall  f1-score   support

          no       0.70      1.00      0.82       457
         yes       0.98      0.30      0.46       279

    accuracy                           0.73       736
   macro avg       0.84      0.65      0.64       736
weighted avg       0.81      0.73      0.69       736

