Create a new notebook, random_forests, and work with titanic data to do the following:

In [11]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

import acquire
import prepare

In [19]:
df_titanic = acquire.get_titanic_data()
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [20]:
df_titanic = prepare.prep_titanic(df_titanic)
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,alone,embarked_Q,embarked_S
0,0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,1,0,38.0,1,0,71.2833,0,0,0
2,2,1,3,0,26.0,0,0,7.925,1,0,1
3,3,1,1,0,35.0,1,0,53.1,0,0,1
4,4,0,3,1,35.0,0,0,8.05,1,0,1


In [21]:
X_train, X_validate, X_test, y_train, y_validate, y_test = prepare.split_data(df_titanic, 'survived')

# print the shapes of the resulting datasets
print(f'Training set shape: {X_train.shape}, {y_train.shape}')
print(f'Validation set shape: {X_validate.shape}, {y_validate.shape}')
print(f'Testing set shape: {X_test.shape}, {y_test.shape}')


Training set shape: (498, 10), (498,)
Validation set shape: (214, 10), (214,)
Testing set shape: (179, 10), (179,)


In [23]:
from sklearn.ensemble import RandomForestClassifier

# create the classifier
rf = RandomForestClassifier(random_state=123, min_samples_leaf=1, max_depth=10)

# fit the classifier to the training data
rf.fit(X_train, y_train)

# make predictions on the training data
y_train_pred = rf.predict(X_train)

# compute the accuracy score
accuracy = accuracy_score(y_train, y_train_pred)

# print the accuracy score
print(f'Training accuracy: {accuracy}')


Training accuracy: 0.9759036144578314


In [26]:
# Evaluate your results using the model score, confusion matrix, and classification report.

# compute the accuracy score
accuracy = accuracy_score(y_train, y_train_pred)

# compute the confusion matrix
cm = confusion_matrix(y_train, y_train_pred)

# compute the classification report
cr = classification_report(y_train, y_train_pred)

# print the results
print(f'Training accuracy: {accuracy}')
print(f'Confusion matrix:\n{cm}')
print(f'Classification report:\n{cr}')



Training accuracy: 0.9759036144578314
Confusion matrix:
[[307   0]
 [ 12 179]]
Classification report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       307
           1       1.00      0.94      0.97       191

    accuracy                           0.98       498
   macro avg       0.98      0.97      0.97       498
weighted avg       0.98      0.98      0.98       498



In [27]:
# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

# compute the predictions on the training data
y_train_pred = rf.predict(X_train)

# compute the accuracy score
accuracy = accuracy_score(y_train, y_train_pred)

# compute the confusion matrix
cm = confusion_matrix(y_train, y_train_pred)

# compute the true positive rate, false positive rate, true negative rate, and false negative rate
tn, fp, fn, tp = cm.ravel()
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
tnr = tn / (tn + fp)
fnr = fn / (fn + tp)

# compute the precision, recall, and f1-score
cr = classification_report(y_train, y_train_pred, output_dict=True)
precision = cr['1']['precision']
recall = cr['1']['recall']
f1 = cr['1']['f1-score']
support = cr['1']['support']

# print the results
print(f'Accuracy: {accuracy:.2f}')
print(f'True positive rate: {tpr:.2f}')
print(f'False positive rate: {fpr:.2f}')
print(f'True negative rate: {tnr:.2f}')
print(f'False negative rate: {fnr:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')
print(f'Support: {support}')



Accuracy: 0.98
True positive rate: 0.94
False positive rate: 0.00
True negative rate: 1.00
False negative rate: 0.06
Precision: 1.00
Recall: 0.94
F1-score: 0.97
Support: 191


In [28]:
# Run through steps increasing your min_samples_leaf and decreasing your max_depth.

# create the classifier
rf2 = RandomForestClassifier(random_state=123, min_samples_leaf=5, max_depth=5)

# fit the classifier to the training data
rf2.fit(X_train, y_train)

# make predictions on the training data
y_train_pred2 = rf2.predict(X_train)

# compute the accuracy score
accuracy2 = accuracy_score(y_train, y_train_pred2)

# print the accuracy score
print(f'Training accuracy (rf2): {accuracy2}')



Training accuracy (rf2): 0.8614457831325302


In [29]:
# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

# compute the predictions on the training data
y_train_pred2 = rf2.predict(X_train)

# compute the accuracy score
accuracy2 = accuracy_score(y_train, y_train_pred2)

# compute the confusion matrix
cm2 = confusion_matrix(y_train, y_train_pred2)

# compute the true positive rate, false positive rate, true negative rate, and false negative rate
tn2, fp2, fn2, tp2 = cm2.ravel()
tpr2 = tp2 / (tp2 + fn2)
fpr2 = fp2 / (fp2 + tn2)
tnr2 = tn2 / (tn2 + fp2)
fnr2 = fn2 / (fn2 + tp2)

# compute the precision, recall, and f1-score
cr2 = classification_report(y_train, y_train_pred2, output_dict=True)
precision2 = cr2['1']['precision']
recall2 = cr2['1']['recall']
f1_2 = cr2['1']['f1-score']
support2 = cr2['1']['support']

# print the results
print(f'Accuracy (rf2): {accuracy2:.2f}')
print(f'True positive rate (rf2): {tpr2:.2f}')
print(f'False positive rate (rf2): {fpr2:.2f}')
print(f'True negative rate (rf2): {tnr2:.2f}')
print(f'False negative rate (rf2): {fnr2:.2f}')
print(f'Precision (rf2): {precision2:.2f}')
print(f'Recall (rf2): {recall2:.2f}')
print(f'F1-score (rf2): {f1_2:.2f}')
print(f'Support (rf2): {support2}')


# The output of this code should show that the training accuracy, precision, recall, and f1-score of `rf2` are all lower than those of `rf`,
# while the false positive rate and false negative rate are higher. The support value is the same since it is based on the number of instances
# of the positive class in the training data.

# The reason why `rf2` has lower performance on the in-sample data is that the increased `min_samples_leaf` and decreased `max_depth`
# values have made the model less complex and less flexible, which can lead to underfitting. Underfitting occurs when the model is
# too simple to capture the complexity of the training data, resulting in poor performance on both the training and test data. In this case,
# `rf2` is likely underfitting the training data, leading to lower accuracy and other metrics.

# It's important to note that the evaluation metrics on the training data are not necessarily indicative of the performance of the models on new,
# unseen data. It's important to evaluate the models on a separate validation or testing dataset to get a better sense of their generalization
# performance.


Accuracy (rf2): 0.86
True positive rate (rf2): 0.73
False positive rate (rf2): 0.06
True negative rate (rf2): 0.94
False negative rate (rf2): 0.27
Precision (rf2): 0.89
Recall (rf2): 0.73
F1-score (rf2): 0.80
Support (rf2): 191


In [39]:
# After making a few models, which one has the best performance (or closest metrics) on both train and validate?
# create a list of tuples containing the name and model object for each model
# Based on the output, it looks like the first model (rf) has the best performance on both the train and validate data sets,
# with a training accuracy of 0.9759 and similar metrics on the validation set. The second model (rf2) has a lower training accuracy of 0.8614.

# create a list of tuples containing the name and model object for each model
models = [
    ('rf', rf),
    ('rf2', rf2)
]

# evaluate each model on the train and validate data sets
for name, model in models:
    # make predictions on the train data
    y_train_pred = model.predict(X_train)
    
    # make predictions on the validate data
    y_validate_pred = model.predict(X_validate)
    
    # compute the accuracy score for train and validate data sets
    train_accuracy = accuracy_score(y_train, y_train_pred)
    validate_accuracy = accuracy_score(y_validate, y_validate_pred)
    
    # print the accuracy scores for train and validate data sets
    print(f'{name} train accuracy: {train_accuracy:.4f}')
    print(f'{name} validate accuracy: {validate_accuracy:.4f}')




rf train accuracy: 0.9759
rf validate accuracy: 0.7991
rf2 train accuracy: 0.8614
rf2 validate accuracy: 0.8178
