- [x] Create a new notebook, `random_forests`, and work with titanic data to do the following:

In [2]:
# Import full libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import personal modules
import acquire as a
import prepare as p
import model as m

# Selective imports
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Handle warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# import titanic
raw_titanic = a.get_titanic_data()
raw_titanic.head()

Reading from file...


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
a.df_info(raw_titanic)

Unnamed: 0,nunique,dtypes,isnull,sample
passenger_id,891,int64,0,782
survived,2,int64,0,0
pclass,3,int64,0,1
sex,2,object,0,male
age,88,float64,177,29.0
sibsp,7,int64,0,0
parch,7,int64,0,0
fare,248,float64,0,30.0
embarked,3,object,2,S
class,3,object,0,First


In [5]:
# Clean titanic
titanic = p.prep_titanic(raw_titanic)
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,Southampton,1


In [6]:
a.df_info(titanic)

Unnamed: 0,nunique,dtypes,isnull,sample
passenger_id,891,object,0,145
survived,2,object,0,0
pclass,3,object,0,2
sex,2,object,0,male
age,91,float64,0,19.0
sibsp,7,object,0,1
parch,7,object,0,1
fare,248,float64,0,36.75
embark_town,3,object,0,Southampton
alone,2,object,0,0


In [7]:
# Split titanic (and add these to a list for iterative capabilities)
df_sets = train,validate,test = p.split_df(titanic,'survived')
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
776,776,0,3,male,25.14,0,0,7.75,Queenstown,1
829,829,1,1,female,62.0,0,0,80.0,Southampton,1
215,215,1,1,female,31.0,1,0,113.275,Cherbourg,0
258,258,1,1,female,35.0,0,0,512.3292,Cherbourg,1
129,129,0,3,male,45.0,0,0,6.975,Southampton,1


In [8]:
a.df_info(train)

Unnamed: 0,nunique,dtypes,isnull,sample
passenger_id,534,object,0,758
survived,2,object,0,0
pclass,3,object,0,3
sex,2,object,0,male
age,79,float64,0,34.0
sibsp,7,object,0,0
parch,6,object,0,0
fare,194,float64,0,8.05
embark_town,3,object,0,Southampton
alone,2,object,0,1


In [9]:
# show shapes
for df in df_sets:
    print(f'Shape: {df.shape}')

Shape: (534, 10)
Shape: (178, 10)
Shape: (179, 10)


In [10]:
# Encode titanic
encoded = train_encoded,val_encoded,test_encoded = [m.preprocess_titanic(df) for df in df_sets]
train_encoded.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_male,is_Queenstown,is_Southampton
776,0.0,3.0,25.14,0.0,0.0,7.75,1.0,1.0,1.0,0.0
829,1.0,1.0,62.0,0.0,0.0,80.0,1.0,0.0,0.0,1.0
215,1.0,1.0,31.0,1.0,0.0,113.275,0.0,0.0,0.0,0.0
258,1.0,1.0,35.0,0.0,0.0,512.3292,1.0,0.0,0.0,0.0
129,0.0,3.0,45.0,0.0,0.0,6.975,1.0,1.0,0.0,1.0


In [11]:
# show new shapes
for df in encoded:
    print(f'Shape: {df.shape}')

Shape: (534, 10)
Shape: (178, 10)
Shape: (179, 10)


In [12]:
# verify datatypes on encoded train
a.df_info(train_encoded)

Unnamed: 0,nunique,dtypes,isnull,sample
survived,2,float64,0,0.0
pclass,3,float64,0,3.0
age,79,float64,0,25.14
sibsp,7,float64,0,0.0
parch,6,float64,0,0.0
fare,194,float64,0,7.225
alone,2,float64,0,1.0
is_male,2,float64,0,1.0
is_Queenstown,2,float64,0,0.0
is_Southampton,2,float64,0,0.0


In [13]:
# get X's
X_sets = X_train,X_validate,X_test = [df.drop(columns='survived') for df in encoded]
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,is_male,is_Queenstown,is_Southampton
776,3.0,25.14,0.0,0.0,7.75,1.0,1.0,1.0,0.0
829,1.0,62.0,0.0,0.0,80.0,1.0,0.0,0.0,1.0
215,1.0,31.0,1.0,0.0,113.275,0.0,0.0,0.0,0.0
258,1.0,35.0,0.0,0.0,512.3292,1.0,0.0,0.0,0.0
129,3.0,45.0,0.0,0.0,6.975,1.0,1.0,0.0,1.0


In [14]:
# get y's
y_sets = y_train,y_validate,y_test = [df.survived for df in encoded]
y_train.head()

776    0.0
829    1.0
215    1.0
258    1.0
129    0.0
Name: survived, dtype: float64

1. [X] Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [16]:
# Make the forest
rf1 = RandomForestClassifier(min_samples_leaf=1,max_depth=10,random_state=123)
rf1

In [17]:
# fit the data
rf1.fit(X_train,y_train)

In [18]:
# predict the data
rf1.predict(X_train)[:10]

array([0., 1., 1., 1., 0., 0., 1., 0., 1., 0.])

In [19]:
# score the training
rf1.score(X_train,y_train)

0.9681647940074907

2. [x] Evaluate your results using the model score, confusion matrix, and classification report.

In [21]:
# Evaluate model score
rf1.score(X_train,y_train)

0.9681647940074907

In [23]:
# Evaluate confusion matrix
metrics.confusion_matrix(y_train,rf1.predict(X_train))

array([[327,   2],
       [ 15, 190]])

In [25]:
# evaluate confusion matrix (normalized)
metrics.confusion_matrix(y_train,rf1.predict(X_train),normalize='true')

array([[0.99392097, 0.00607903],
       [0.07317073, 0.92682927]])

In [27]:
# run classification report
print(metrics.classification_report(y_train,rf1.predict(X_train)))

              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97       329
         1.0       0.99      0.93      0.96       205

    accuracy                           0.97       534
   macro avg       0.97      0.96      0.97       534
weighted avg       0.97      0.97      0.97       534



3. [x] Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [132]:
# get rates
top,bottom = metrics.confusion_matrix(y_train,rf1.predict(X_train),normalize='true')
tn,fp = top
fn,tp = bottom

print(f"""
True Negative Rate: {tn}
False Positive Rate: {fp}
False Negative Rate: {fn}
True Positive Rate: {tp}

Classification Report:
{metrics.classification_report(y_train,rf1.predict(X_train))}
""")


True Negative Rate: 0.993920972644377
False Positive Rate: 0.0060790273556231
False Negative Rate: 0.07317073170731707
True Positive Rate: 0.926829268292683

Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97       329
         1.0       0.99      0.93      0.96       205

    accuracy                           0.97       534
   macro avg       0.97      0.96      0.97       534
weighted avg       0.97      0.97      0.97       534




In [130]:
pd.crosstab(y_train,rf1.predict(X_train),normalize='index')

col_0,0.0,1.0
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.993921,0.006079
1.0,0.073171,0.926829


4. [x] Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [87]:
# set up variables to iterate
depth = 10
leaves = 1
forest_list = []

for i in range(10):
    # verify variables will increase/decrease as necessary
    # print(f'depth: {depth}')
    # print(f'leaves: {leaves}')
    forest_list.append(RandomForestClassifier(max_depth=depth,min_samples_leaf=leaves))
    
    depth -= 1
    leaves += 1
    
print(forest_list)

[RandomForestClassifier(max_depth=10), RandomForestClassifier(max_depth=9, min_samples_leaf=2), RandomForestClassifier(max_depth=8, min_samples_leaf=3), RandomForestClassifier(max_depth=7, min_samples_leaf=4), RandomForestClassifier(max_depth=6, min_samples_leaf=5), RandomForestClassifier(max_depth=5, min_samples_leaf=6), RandomForestClassifier(max_depth=4, min_samples_leaf=7), RandomForestClassifier(max_depth=3, min_samples_leaf=8), RandomForestClassifier(max_depth=2, min_samples_leaf=9), RandomForestClassifier(max_depth=1, min_samples_leaf=10)]


In [91]:
# Fit the models
for i in range(10):
    # print(forest_list[i])
    
    # rf1.fit() # to see what I need to work with
    forest_list[i].fit(X_train,y_train)
    

In [103]:
for i in range(10):
    print(f'tree {i} accuracy score: {forest_list[i].score(X_train,y_train)}')

tree 0 accuracy score: 0.9662921348314607
tree 1 accuracy score: 0.9269662921348315
tree 2 accuracy score: 0.9157303370786517
tree 3 accuracy score: 0.8951310861423221
tree 4 accuracy score: 0.8689138576779026
tree 5 accuracy score: 0.8707865168539326
tree 6 accuracy score: 0.8558052434456929
tree 7 accuracy score: 0.8445692883895131
tree 8 accuracy score: 0.7940074906367042
tree 9 accuracy score: 0.7921348314606742


In [105]:
for i in range(10):
    print(f'tree {i} classification report:\n {metrics.classification_report(y_train,forest_list[i].predict(X_train))}\n\n')

tree 0 classification report:
               precision    recall  f1-score   support

         0.0       0.95      1.00      0.97       329
         1.0       0.99      0.92      0.95       205

    accuracy                           0.97       534
   macro avg       0.97      0.96      0.96       534
weighted avg       0.97      0.97      0.97       534



tree 1 classification report:
               precision    recall  f1-score   support

         0.0       0.91      0.98      0.94       329
         1.0       0.96      0.85      0.90       205

    accuracy                           0.93       534
   macro avg       0.93      0.91      0.92       534
weighted avg       0.93      0.93      0.93       534



tree 2 classification report:
               precision    recall  f1-score   support

         0.0       0.90      0.97      0.93       329
         1.0       0.94      0.83      0.88       205

    accuracy                           0.92       534
   macro avg       0.92      0.

5. [x] What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

> For most of the metrics, they tend to go down as there's a decrease in depth and increase in leaves

> Models 0-3 look to evaluate the best, probably because they have the deepest depth and therefore might be overfitting.

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [134]:
# run through and compare accuracies
for i in range(10):
    print(f'tree {i} train score: {forest_list[i].score(X_train,y_train)}')
    print(f'tree {i} validate score: {forest_list[i].score(X_validate,y_validate)}')
    print(f'tree {i} difference: {forest_list[i].score(X_train,y_train) - forest_list[i].score(X_validate,y_validate)}')
    print()

tree 0 train score: 0.9662921348314607
tree 0 validate score: 0.7865168539325843
tree 0 difference: 0.1797752808988764

tree 1 train score: 0.9269662921348315
tree 1 validate score: 0.7921348314606742
tree 1 difference: 0.1348314606741573

tree 2 train score: 0.9157303370786517
tree 2 validate score: 0.7696629213483146
tree 2 difference: 0.1460674157303371

tree 3 train score: 0.8951310861423221
tree 3 validate score: 0.7640449438202247
tree 3 difference: 0.13108614232209737

tree 4 train score: 0.8689138576779026
tree 4 validate score: 0.7752808988764045
tree 4 difference: 0.09363295880149813

tree 5 train score: 0.8707865168539326
tree 5 validate score: 0.7696629213483146
tree 5 difference: 0.101123595505618

tree 6 train score: 0.8558052434456929
tree 6 validate score: 0.7865168539325843
tree 6 difference: 0.06928838951310856

tree 7 train score: 0.8445692883895131
tree 7 validate score: 0.7752808988764045
tree 7 difference: 0.06928838951310856

tree 8 train score: 0.794007490636704

> based on these results, I'm going to say that I want only the models with a difference less than 10% and the highest scores

> These will be models 4, 6, 7, 8, and 9

> However, the best of these seem to be model 6, which had an 85 on train and 78 on validate

In [144]:
forest_list[6].score(X_test,y_test)

0.8324022346368715

In [142]:
print(metrics.classification_report(y_test,forest_list[6].predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.82      0.94      0.87       110
         1.0       0.87      0.67      0.75        69

    accuracy                           0.83       179
   macro avg       0.84      0.80      0.81       179
weighted avg       0.84      0.83      0.83       179



In [None]:
pd.crosstab(y_test,)