## Exercises

Continue working in your `model` file with titanic data to do the following: 

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

2. Evaluate your results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following:  Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps increasing your min_samples_leaf and decreasing your max_depth. 

5. What are the differences in the evaluation metrics?  Which performs better on your in-sample data?  Why?

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings("ignore")

import acquire2
import prepare

## Acquire

In [2]:
df = acquire2.get_titanic_data()
df.head()

Using cached csv


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
df.survived.mean()

0.3838383838383838

### Prepare

In [4]:
#set passenger_id as index
df = df.set_index("passenger_id")

In [5]:
#check for nulls
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
deck           688
embark_town      2
alone            0
dtype: int64

In [6]:
#drop duplicate columns & columns w/ too many null values
df = df.drop(columns = ['class', 'embarked', 'deck'])

In [7]:
#fill null values in embark_town w/ mode

df.embark_town = df.embark_town.fillna(value = df.embark_town.mode())

In [8]:
#fill null age values w/ median age

df.age = df.age.fillna(value = df.age.median())

### Encode

In [9]:
dummy_df = pd.get_dummies(df[['sex', 'embark_town']], dummy_na = False, drop_first = [True, True])

#drop original columns that are being encoded
df = df.drop(columns = ['sex', 'embark_town'])

#stitch back together again
df = pd.concat([df, dummy_df], axis = 1)
df.head()

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1


In [11]:
#split the data

train , test = train_test_split(df, test_size = .2, random_state = 123, stratify = df.survived)
train, validate = train_test_split(train, test_size = .3, random_state = 123, stratify = train.survived)

In [12]:
train.shape

(498, 10)

In [13]:
validate.shape

(214, 10)

In [14]:
test.shape

(179, 10)

In [15]:
train.head()

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
583,0,1,36.0,0,0,40.125,1,1,0,0
165,1,3,9.0,0,2,20.525,0,1,0,1
50,0,3,7.0,4,1,39.6875,0,1,0,1
259,1,2,50.0,0,1,26.0,0,0,0,1
306,1,1,28.0,0,0,110.8833,1,0,0,0


In [17]:
X_train = train.drop(columns = ['survived'])
y_train = train.survived

X_validate = validate.drop(columns = ['survived'])
y_validate = validate.survived

X_test = test.drop(columns = ['survived'])
y_test = test.survived

In [18]:
X_train.shape, X_validate.shape, X_test.shape

((498, 9), (214, 9), (179, 9))

In [19]:
X_train.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
583,1,36.0,0,0,40.125,1,1,0,0
165,3,9.0,0,2,20.525,0,1,0,1
50,3,7.0,4,1,39.6875,0,1,0,1
259,2,50.0,0,1,26.0,0,0,0,1
306,1,28.0,0,0,110.8833,1,0,0,0


### Modeling

In [20]:
# Positive case: Did not survive
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [23]:
# since most did not survive, use mode as baseline

baseline = y_train.mode()

#boolean array
baseline_prediction = y_train == 0

baseline_accuracy = round(baseline_prediction.mean(),2)
print(f'Baseline Accuracy: {baseline_accuracy}') 

Baseline Accuracy: 0.62


In [24]:
forest1 = RandomForestClassifier(max_depth = 1, random_state = 123)

# fit the model on train

forest1.fit(X_train, y_train)

# Use the model 
# We'll evaluate the model's performance on train and only train

y_predictions = forest1.predict(X_train)

#produce the classification report on the y values and this models predicted y values
report = classification_report(y_train, y_predictions, output_dict = True)
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.773481,0.801471,0.781124,0.787476,0.784216
recall,0.912052,0.570681,0.781124,0.741366,0.781124
f1-score,0.83707,0.666667,0.781124,0.751868,0.771715
support,307.0,191.0,0.781124,498.0,498.0


In [30]:
#confusion matrix

labels = ['Actually Died', 'Actually Survived']
col_labels = ['Pred. Died', 'Pred. Survived']

pd.DataFrame(confusion_matrix(y_predictions , y_train), index = labels, columns = col_labels)

Unnamed: 0,Pred. Died,Pred. Survived
Actually Died,280,82
Actually Survived,27,109


In [36]:
#must put it as y_train, y_predictions to follow confusion matrix
#TN, FP, FN, TP = confusion_matrix(y_train, y_predictions).ravel()
ALL = TP + TN + FP + FN
TP, FP, FN, TN = confusion_matrix(y_train, y_predictions).ravel()

TP, TN, FP, FN

#TP = 280 (pred died, died)
#TN = 109 (pred survived, survived)
#FP = 27 (pred died, survived)
#FN = 82 (pred survived, died)

(280, 109, 27, 82)

In [37]:
accuracy = (TP + TN)/ ALL
print(f'Accuracy: {accuracy}')

true_positive_rate = TP / (TP + FN)
print(f'True Positive Rate: {true_positive_rate}')

false_positive_rate = FP / (FP + TN)
print(f'False Positive Rate: {false_positive_rate}')

precision = TP / (TP + FP)
print(f'Precision: {precision}')

recall = TP / (TP + FN)
print(f'Recall: {recall}')

f1_score = 2 * (precision * recall) / (precision + recall)
print(f'F1 Score: {f1_score}')

support_pos = TP + FN
print(f'Support (0): {support_pos}')

support_neg = FP + TN
print(f'Support (1): {support_neg}')

Accuracy: 0.7811244979919679
True Positive Rate: 0.7734806629834254
False Positive Rate: 0.19852941176470587
Precision: 0.9120521172638436
Recall: 0.7734806629834254
F1 Score: 0.8370702541106129
Support (0): 362
Support (1): 136


In [None]:
for i in range (2, 11):
    forest = RandomForestClassifier(max_depth = i, random_state = 123)