# Random Forests Exercises

Create a new notebook, random_forests, and work with titanic data to do the following:

In [1]:
#data manipulation
import pandas as pd
import numpy as np

#visualization
import matplotlib.pyplot as plt
import seaborn as sns

#stats is great
from scipy import stats

#my own files with my own functions
import acquire
import prepare

# os is operating system stuff, few things I know
# env is my py file to access SQL databases
import os
import env

# If I decide to retrieve other datasets but they'll be raw
from pydataset import data

# ML stuff: (modeling imports)
from sklearn.model_selection import train_test_split

# The big 4 for classification
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression #logistic not linear!
from sklearn.neighbors import KNeighborsClassifier #pick the classifier one

# Evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
df = acquire.get_titanic_data()

this file exists, reading csv


In [3]:
df = prepare.clean_titanic(df)

In [4]:
train, validate, test = prepare.splitting_data(df, 'survived', seed=123)

In [5]:
train, validate, test = prepare.preprocess_titanic(train, validate, test)

In [6]:
### We want everything EXCEPT the target variable
X_train = train.drop(columns = 'survived')
X_validate = validate.drop(columns = 'survived')
X_test = test.drop(columns = 'survived')

In [7]:
### We want ONLY the target variable
y_train = train.survived
y_validate = validate.survived
y_test = test.survived

### Operations above ^ are from decision tree and will be used for the rest of the Machine Learning Models/Ensemble Methods

## 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [36]:
rf = RandomForestClassifier(min_samples_leaf=1, max_depth=10, random_state=123)

In [37]:
rf.fit(X_train, y_train)

In [38]:
y_pred = rf.predict(X_train)
y_pred[:10]

array([0, 0, 0, 0, 1, 1, 1, 0, 0, 0])

## 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [11]:
rf.score(X_train, y_train)

0.9601873536299765

In [13]:
confusion_matrix(y_train, y_pred) #rows, columns

array([[254,   0],
       [ 17, 156]])

In [12]:
pd.crosstab(y_train, y_pred)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,254,0
1,17,156


In [18]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       254
           1       1.00      0.90      0.95       173

    accuracy                           0.96       427
   macro avg       0.97      0.95      0.96       427
weighted avg       0.96      0.96      0.96       427



## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [19]:
prepare.compute_class_metrics(y_train, y_pred)

Accuracy: 0.9601873536299765

True Positive Rate/Sensitivity/Recall/Power: 0.9017341040462428
False Positive Rate/False Alarm Ratio/Fall-out: 0.0
True Negative Rate/Specificity/Selectivity: 1.0
False Negative Rate/Miss Rate: 0.09826589595375723

Precision/PPV: 1.0
F1 Score: 0.9483282674772037

Support (0): 173
Support (1): 254


## 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [31]:
for x in range(1,11):
    rf = RandomForestClassifier(min_samples_leaf=x, max_depth=11-x, random_state=123)

    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_train)

    acc = rf.score(X_train, y_train)
    
    print(f'for min leaf samples = {x} and max depth = {11-x}, the accuracy is {round(acc,2)}')

for min leaf samples = 1 and max depth = 10, the accuracy is 0.96
for min leaf samples = 2 and max depth = 9, the accuracy is 0.92
for min leaf samples = 3 and max depth = 8, the accuracy is 0.89
for min leaf samples = 4 and max depth = 7, the accuracy is 0.88
for min leaf samples = 5 and max depth = 6, the accuracy is 0.87
for min leaf samples = 6 and max depth = 5, the accuracy is 0.85
for min leaf samples = 7 and max depth = 4, the accuracy is 0.84
for min leaf samples = 8 and max depth = 3, the accuracy is 0.82
for min leaf samples = 9 and max depth = 2, the accuracy is 0.81
for min leaf samples = 10 and max depth = 1, the accuracy is 0.79


## 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

> max depth of 10 and min leaf sample of 1 works best because it's asking the most questions of the data and requiring the least amount of samples

### After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [33]:
stats = []

for x in range(1,11):
    rf = RandomForestClassifier(min_samples_leaf=x, max_depth=11-x, random_state=123)

    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_train)

    acc = rf.score(X_train, y_train)
    acc_val = rf.score(X_validate, y_validate)
    
    stats.append([x, 11-x,round(acc,2),round(acc_val,2) ]) 
    
print(f'min_leaf_samples = {x}, max_depth = {11-x}, accuracy train = {round(acc,2)}, val = {round(acc_val,2)}')


min_leaf_samples = 10, max_depth = 1, accuracy train = 0.79, val = 0.76


In [34]:
stats_df = pd.DataFrame(stats, columns = ['min_leaf', 'max_depth', 'train_acc', 'val_acc'])
stats_df

Unnamed: 0,min_leaf,max_depth,train_acc,val_acc
0,1,10,0.96,0.8
1,2,9,0.92,0.82
2,3,8,0.89,0.82
3,4,7,0.88,0.8
4,5,6,0.87,0.8
5,6,5,0.85,0.8
6,7,4,0.84,0.8
7,8,3,0.82,0.77
8,9,2,0.81,0.75
9,10,1,0.79,0.76


In [35]:
stats_df.sort_values('val_acc', ascending=False)

Unnamed: 0,min_leaf,max_depth,train_acc,val_acc
1,2,9,0.92,0.82
2,3,8,0.89,0.82
0,1,10,0.96,0.8
3,4,7,0.88,0.8
4,5,6,0.87,0.8
5,6,5,0.85,0.8
6,7,4,0.84,0.8
7,8,3,0.82,0.77
9,10,1,0.79,0.76
8,9,2,0.81,0.75
