In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from prepare import titanic_pipeline

In [2]:
# titanic_pipeline() comes from the local prepare module; it yields the three
# splits unpacked here. Show each split's (rows, columns) as a quick size check.
train, val, test = titanic_pipeline()
tuple(split.shape for split in (train, val, test))


((623, 9), (134, 9), (134, 9))

In [3]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,embark_town,alone
748,0,male,19.0,1,0,53.1,First,Southampton,0
45,0,male,29.0,0,0,8.05,Third,Southampton,1
28,1,female,29.0,0,0,7.8792,Third,Queenstown,1
633,0,male,29.0,0,0,0.0,First,Southampton,1
403,0,male,28.0,1,0,15.85,Third,Southampton,0


In [4]:
# Separate the target column ('survived') from the features for each split.
y_train = train['survived']
X_train = train.drop(columns=['survived'])

y_val = val['survived']
X_val = val.drop(columns=['survived'])

In [5]:
# One-hot encode the categorical columns. Train defines the feature set: val is
# encoded on its own and then reindexed to train's columns, so both frames carry
# the same features in the same order even when a category level is missing
# from (or only appears in) the validation split. Levels absent from val are
# filled with 0; levels unseen in train are dropped, since the model has no
# feature for them.
X_train = pd.get_dummies(X_train)
X_val = pd.get_dummies(X_val).reindex(columns=X_train.columns, fill_value=0)

X_train.head()

Unnamed: 0,age,sibsp,parch,fare,alone,sex_female,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
748,19.0,1,0,53.1,0,False,True,True,False,False,False,False,True
45,29.0,0,0,8.05,1,False,True,False,False,True,False,False,True
28,29.0,0,0,7.8792,1,True,False,False,False,True,False,True,False
633,29.0,0,0,0.0,1,False,True,True,False,False,False,False,True
403,28.0,1,0,15.85,0,False,True,False,False,True,False,False,True


In [6]:
# Share of training rows with survived == 0, i.e. the accuracy obtained by
# always predicting class 0 (the baseline any model should beat).
y_train.eq(0).mean()

0.6163723916532905

In [7]:
# Fixed seed so the forest is reproducible between runs.
seed = 42

# First-pass model: depth capped at 10, every other setting left at its default.
rf = RandomForestClassifier(random_state=seed, max_depth=10)

rf.fit(X_train, y_train)

In [8]:
# Accuracy on (train, validation), in that order.
tuple(rf.score(features, target) for features, target in ((X_train, y_train), (X_val, y_val)))

(0.9534510433386838, 0.8507462686567164)

In [12]:
# Predict a class for every training row and show the first ten as a sanity check.
train_preds = rf.predict(X_train)
train_preds[0:10]

array([0, 0, 1, 0, 0, 1, 1, 0, 0, 0])

In [15]:
# Per-class precision / recall / F1 on the training set.
print(classification_report(y_true=y_train, y_pred=train_preds))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96       384
           1       0.98      0.90      0.94       239

    accuracy                           0.95       623
   macro avg       0.96      0.94      0.95       623
weighted avg       0.95      0.95      0.95       623



In [16]:
# Confusion matrix of the actual training labels (y_train) against the model's
# training predictions.
conf_matrix = confusion_matrix(y_train, train_preds)
print(conf_matrix)

# Name each cell for the rate calculations that follow. Rows are the actual
# class and columns the predicted class, so the layout is [[tn, fp], [fn, tp]].
(tn, fp), (fn, tp) = conf_matrix
print(tn, fp, fn, tp)

[[379   5]
 [ 24 215]]
379 5 24 215


In [17]:
# Rates are taken over the actual class totals: positives for tpr/fnr,
# negatives for fpr/tnr.
actual_pos = tp + fn
actual_neg = tn + fp

tpr = tp / actual_pos
fnr = fn / actual_pos
fpr = fp / actual_neg
tnr = tn / actual_neg

# Display the four rates to four decimal places.
print(f"True Positive Rate    {tpr:.4f}")
print(f"False Positive Rate   {fpr:.4f}")
print(f"True Negative Rate    {tnr:.4f}")
print(f"False Negative Rate   {fnr:.4f}")

True Positive Rate    0.8996
False Positive Rate   0.0130
True Negative Rate    0.9870
False Negative Rate   0.1004


In [63]:
# Grid-search tree depth against minimum leaf size, recording train and
# validation accuracy for every combination. The four lists are kept in step
# (one entry per fitted model) so they can be assembled into a table afterwards.
seed = 42
train_acc = []
val_acc = []
depth = []
leaf = []

for max_depth in range(11, 0, -1):  # depth from 11 down to 1
    for min_samples_leaf in range(1, 6):  # min_samples_leaf from 1 up to 5
        # Use a dedicated name so the loop does not overwrite `rf`, the
        # depth-10 model fitted and evaluated earlier in the notebook.
        candidate = RandomForestClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf, random_state=seed)
        candidate.fit(X_train, y_train)

        depth.append(max_depth)
        leaf.append(min_samples_leaf)

        train_acc.append(candidate.score(X_train, y_train))
        val_acc.append(candidate.score(X_val, y_val))


In [67]:
# Collect the grid-search results into one table, one row per fitted model.
trees = pd.DataFrame(dict(depth=depth, train_acc=train_acc, val_acc=val_acc, leaf=leaf))

# Rank by validation accuracy, then training accuracy (both descending), breaking
# any remaining ties with the shallower depth; show the top five.
(
    trees
    .sort_values(by=['val_acc', 'train_acc', 'depth'], ascending=[False, False, True])
    .head()
)

Unnamed: 0,depth,train_acc,val_acc,leaf
10,9,0.9374,0.858209,1
1,11,0.913323,0.858209,2
6,10,0.905297,0.858209,2
7,10,0.894061,0.858209,3
2,11,0.886035,0.858209,3


In [None]:
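#Seems like depth 11 and leaf 2 is a good choice: the top five combinations all tie on validation accuracy (0.858209), so the pick comes down to how much train/validation gap is acceptable.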