## Create a new notebook, random_forests, and work with titanic data to do the following:

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from pydataset import data

import acquire
import prepare

In [2]:
titanic = acquire.get_titanic_data()
titanic.head(3)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1


In [3]:
train, validate, test = prepare.clean_titanic(titanic)

In [4]:
train.shape, validate.shape, test.shape

((498, 12), (214, 12), (179, 12))

In [5]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
271,1,3,male,25.0,0,0,0.0,Southampton,1,True,False,True
786,1,3,female,18.0,0,0,7.4958,Southampton,1,False,False,True
86,0,3,male,16.0,1,3,34.375,Southampton,0,True,False,True
353,0,3,male,25.0,1,0,17.8,Southampton,0,True,False,True
199,0,2,female,24.0,0,0,13.0,Southampton,1,False,False,True


In [6]:
train['baseline'] = 0
baseline_accuracy = (train.baseline == train.survived).mean()
baseline_accuracy

0.6164658634538153

In [7]:
# SPLIT DATA INTO X AND Y DATA SETS
X_train = train.drop(columns=['survived', 'sex', 'embark_town', 'baseline'])

X_validate = validate.drop(columns=['survived','sex','embark_town'])

X_test = test.drop(columns=['survived','sex','embark_town'])

y_train = train.survived

y_validate = validate.survived 

y_test = test.survived


### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.



In [8]:
rf = RandomForestClassifier(random_state=1108, min_samples_leaf=1, max_depth=10)

In [9]:
tree1 = rf.fit(X_train, y_train)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.



In [10]:
rf1_acc = rf.score(X_train, y_train)

In [11]:
y_pred = rf.predict(X_train)

In [12]:
pd.DataFrame(
    confusion_matrix(y_train, y_pred),
    columns=['pred_0', 'pred_1'],
    index=['actual_0', 'actual_1'])

Unnamed: 0,pred_0,pred_1
actual_0,305,2
actual_1,20,171


In [13]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.99      0.97       307
           1       0.99      0.90      0.94       191

    accuracy                           0.96       498
   macro avg       0.96      0.94      0.95       498
weighted avg       0.96      0.96      0.96       498



### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [14]:
def compute_metrics(TN,FP,FN,TP):
    all_ = (TP + TN + FP + FN)

    accuracy = (TP + TN) / all_

    TPR = recall = TP / (TP + FN)
    FPR = FP / (FP + TN)

    TNR = TN / (FP + TN)
    FNR = FN / (FN + TP)

    precision =  TP / (TP + FP)
    f1 =  2 * ((precision * recall) / ( precision + recall))

    support_pos = TP + FN
    support_neg = FP + TN

    print(f"Accuracy: {accuracy}\n")
    print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
    print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
    print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
    print(f"False Negative Rate/Miss Rate: {FNR}\n")
    print(f"Precision/PPV: {precision}")
    print(f"F1 Score: {f1}\n")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")

In [15]:
confu = confusion_matrix(y_train,y_pred)
TN, FP, FN, TP = confu.ravel()
TN, FP, FN, TP 

(305, 2, 20, 171)

In [16]:
compute_metrics(TN, FP, FN, TP)

Accuracy: 0.9558232931726908

True Positive Rate/Sensitivity/Recall/Power: 0.8952879581151832
False Positive Rate/False Alarm Ratio/Fall-out: 0.006514657980456026
True Negative Rate/Specificity/Selectivity: 0.993485342019544
False Negative Rate/Miss Rate: 0.10471204188481675

Precision/PPV: 0.9884393063583815
F1 Score: 0.9395604395604397

Support (0): 191
Support (1): 307


### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.



In [17]:
rf2 = RandomForestClassifier(random_state=1108, min_samples_leaf=3, max_depth=7)

In [18]:
tree2 = rf2.fit(X_train, y_train)

In [19]:
rf2_acc = rf2.score(X_train, y_train)

In [20]:
y_pred2 = rf2.predict(X_train)
y_pred2[:5]

array([0, 1, 0, 0, 1])

In [21]:
pd.DataFrame(
    confusion_matrix(y_train, y_pred2),
    columns=['pred_0', 'pred_1'],
    index=['actual_0', 'actual_1'])

Unnamed: 0,pred_0,pred_1
actual_0,294,13
actual_1,47,144


In [22]:
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91       307
           1       0.92      0.75      0.83       191

    accuracy                           0.88       498
   macro avg       0.89      0.86      0.87       498
weighted avg       0.88      0.88      0.88       498



### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



In [23]:
print(f'Tree one is {rf1_acc}, Tree two is {rf2_acc}, so Tree one is a better fit with the train data. Probably\
 because it has less leaf points and more depth')

Tree one is 0.9558232931726908, Tree two is 0.8795180722891566, so Tree one is a better fit with the train data. Probably because it has less leaf points and more depth


### After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [28]:
print(f'Tree one validate {rf.score(X_validate, y_validate)}, a {rf1_acc - rf.score(X_validate, y_validate)} difference')
print('~~~~~~~~~~~~~~~~~~~~~~~')
print(f'Tree two validate {rf2.score(X_validate, y_validate)}, a {rf2_acc - rf2.score(X_validate, y_validate)} difference')
print('So the 2nd tree test is the better fit')

Tree one validate 0.794392523364486, a 0.1614307698082048 difference
~~~~~~~~~~~~~~~~~~~~~~~
Tree two validate 0.7990654205607477, a 0.08045265172840887 difference
So the 2nd tree test is the better fit
