# Random Forests Exercise

In [23]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

import prepare

In [8]:
titanic = prepare.prep_titanic()

# creat baseline colum
titanic["baseline"] = [int(titanic.survived.mode()) for i in range(len(titanic))]

titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S,baseline
0,0,0,3,male,22.0,1,0,7.25,S,0,1,0,1,0
1,1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1,0
3,3,1,1,female,35.0,1,0,53.1,S,0,0,0,1,0
4,4,0,3,male,35.0,0,0,8.05,S,1,1,0,1,0


In [9]:
train, validate, test = prepare.split_data_(df=titanic, stratify_col="survived", random_state=95)
len(train), len(validate), len(test)

(534, 178, 179)

In [14]:
# separate features from target
xTrain = train.drop(columns=["passenger_id", 'survived', "age", 'sex', 'embarked', 'baseline'])
yTrain = train.survived

# validate
xVal = validate.drop(columns=["passenger_id", 'survived', "age", 'sex', 'embarked', 'baseline'])
yVal= validate.survived

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [19]:
# create random forest object
randFor = RandomForestClassifier(n_estimators=100, random_state=95)
randFor

In [20]:
# fit model
randFor = randFor.fit(xTrain, yTrain)
randFor

**feature importance**

In [24]:
# use the model to predict 
importance = randFor.feature_importances_
importance

array([0.09493788, 0.0603581 , 0.06677171, 0.38241401, 0.02724007,
       0.32360872, 0.01509969, 0.02956983])

In [29]:
# now make the prediction
yPred = randFor.predict(xTrain)
yTrain[:5]

84     1
138    0
369    1
577    1
212    0
Name: survived, dtype: int64

In [32]:
# probability of prediction
ypred_proba = randFor.predict_proba(xTrain)
ypred_proba[:5]

array([[0.20606061, 0.79393939],
       [0.99888889, 0.00111111],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [1.        , 0.        ]])

In [34]:
# 1 == survived
# 0 == didn't make it
randFor.classes_

array([0, 1])

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [40]:
# accuracy score
accuracy = randFor.score(xTrain, yTrain)
accuracy

0.9400749063670412

In [42]:
# confusion matrix
conMat = confusion_matrix(yTrain, yPred)
conMat

array([[318,  11],
       [ 21, 184]])

In [44]:
# classification report
report = classification_report(yTrain, yPred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       329
           1       0.94      0.90      0.92       205

    accuracy                           0.94       534
   macro avg       0.94      0.93      0.94       534
weighted avg       0.94      0.94      0.94       534



3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [None]:
for i in range(3:100):
    randFor.

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?