# Random Forest

work with titanic data 

# 1
Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import acquire
from prepare import prep_titanic
from prepare import my_train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [2]:
df = prep_titanic()

In [3]:
df

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.2500,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.9250,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1000,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.0500,Southampton,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,0,0,13.0000,Southampton,1,1,0,1
887,887,1,1,female,0,0,30.0000,Southampton,1,0,0,1
888,888,0,3,female,1,2,23.4500,Southampton,0,0,0,1
889,889,1,1,male,0,0,30.0000,Cherbourg,1,1,0,0


In [4]:
my_train_test_split(df, target='survived')
train, validate, test = my_train_test_split(df, 'survived')

In [5]:
# create split, dropping target
x_train = train.drop(columns=['survived','sex','embark_town'])
y_train = train.survived

x_val = validate.drop(columns=['survived','sex','embark_town'])
y_val = validate.survived

x_test = test.drop(columns=['survived','sex','embark_town'])
y_test = test.survived

# create object
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [6]:
rf
rf.fit(x_train, y_train)

In [7]:
# make predictions
y_pred = rf.predict(x_train)
# estimate probability of survive
y_pred_proba = rf.predict_proba(x_train)

In [8]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(x_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


# 2
Evaluate your results using the model score, confusion matrix, and classification report.

In [9]:
print(confusion_matrix(y_train, y_pred))

[[329   0]
 [ 17 188]]


In [10]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       329
           1       1.00      0.92      0.96       205

    accuracy                           0.97       534
   macro avg       0.98      0.96      0.97       534
weighted avg       0.97      0.97      0.97       534



# 3
Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [11]:
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()
TN, FP, FN, TP

(329, 0, 17, 188)

In [12]:
accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)

TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
TNR = TN / (FP + TN)
FNR = FN / (TP + FN)
F1 = 2 * (precision * recall) / (precision + recall)
support = TP+FN
support_ = FP+TN

print('The description of this model is:')
print('Accuracy: {}'.format(round(accuracy,2)))
print('Precision: {}'.format(round(precision,2)))
print('Recall: {}'.format(round(recall,2)))
print('True pos. rate: {}'.format(round(TPR,2)))
print('False pos. rate: {}'.format(round(FPR,2)))
print('True neg. rate: {}'.format(round(TNR,2)))
print('False neg. rate: {}'.format(round(FNR,2)))
print('F1 score: {}'.format(round(F1,2)))
print('Support pos: {}'.format(support))
print('Support neg: {}'.format(support_))

The description of this model is:
Accuracy: 0.97
Precision: 1.0
Recall: 0.92
True pos. rate: 0.92
False pos. rate: 0.0
True neg. rate: 1.0
False neg. rate: 0.08
F1 score: 0.96
Support pos: 205
Support neg: 329


# 4
Run through steps increasing your min_samples_leaf and decreasing your max_depth. 

In [15]:
# fx for iteration over change in depth, leaf
def iterate_rf(rf, iterations, x, y):
    for i in range(iterations):
        # Create a new RandomForestClassifier object in each iteration
        new_rf = RandomForestClassifier(max_depth=rf.max_depth - i, min_samples_leaf=rf.min_samples_leaf + i)
        new_rf.fit(x, y)  # Fit the new RF model with the updated parameters
        print(f"Iteration {i+1} - Max Depth: {new_rf.max_depth}, Min Samples Leaf: {new_rf.min_samples_leaf}")
        print("Classification Report:\n", classification_report(y, new_rf.predict(x)))

num_iterations = 10

# iterate over object and display classification report
iterate_rf(rf, num_iterations, x_train, y_train)

Iteration 1 - Max Depth: 10, Min Samples Leaf: 1
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       329
           1       1.00      0.91      0.95       205

    accuracy                           0.97       534
   macro avg       0.97      0.96      0.96       534
weighted avg       0.97      0.97      0.97       534

Iteration 2 - Max Depth: 9, Min Samples Leaf: 2
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95       329
           1       0.97      0.85      0.91       205

    accuracy                           0.93       534
   macro avg       0.94      0.92      0.93       534
weighted avg       0.94      0.93      0.93       534

Iteration 3 - Max Depth: 8, Min Samples Leaf: 3
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.97      0.91       329
           1       0

In [16]:
# non-functioning/older version

# create object
#def iterate_rf(rf, iterations, x, y):
#    for i in range(iterations):
#        rf = RandomForestClassifier()
#        rf.set_params(max_depth=rf.max_depth - i, min_samples_leaf=rf.min_samples_leaf + i)
#        rf.fit(x_train, y_train)
#        print(f"Iteration {i+1} - Max Depth: {rf.max_depth}, Min Samples Leaf: {rf.min_samples_leaf}")
#        print("Classification Report:\n", classification_report(y, rf.predict(x)))
        
#num_iterations = 10

# iterate over object and display classification report
#iterate_rf(rf, num_iterations, x_train, y_train)

# 5
What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

ANSWER: Model1(Max Depth: 10, Min Samples Leaf: 1) is optimal with the subsequent models decreasing in value of all metrics. 