## Exercises

Continue working in your `model` file with titanic data to do the following: 

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

2. Evaluate your results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following:  Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps increasing your min_samples_leaf and decreasing your max_depth. 

5. What are the differences in the evaluation metrics?  Which performs better on your in-sample data?  Why?

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings("ignore")

import acquire2
import prepare

## Acquire

In [2]:
df = acquire2.get_titanic_data()
df.head()

Using cached csv


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
df.survived.mean()

0.3838383838383838

### Prepare

In [4]:
#set passenger_id as index
df = df.set_index("passenger_id")

In [5]:
#check for nulls
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
deck           688
embark_town      2
alone            0
dtype: int64

In [6]:
#drop duplicate columns & columns w/ too many null values
df = df.drop(columns = ['class', 'embarked', 'deck'])

In [7]:
#fill null values in embark_town w/ mode

df.embark_town = df.embark_town.fillna(value = df.embark_town.mode())

In [8]:
#fill null age values w/ median age

df.age = df.age.fillna(value = df.age.median())

### Encode

In [9]:
dummy_df = pd.get_dummies(df[['sex', 'embark_town']], dummy_na = False, drop_first = [True, True])

#drop original columns that are being encoded
df = df.drop(columns = ['sex', 'embark_town'])

#stitch back together again
df = pd.concat([df, dummy_df], axis = 1)
df.head()

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1


In [11]:
#split the data

train , test = train_test_split(df, test_size = .2, random_state = 123, stratify = df.survived)
train, validate = train_test_split(train, test_size = .3, random_state = 123, stratify = train.survived)

In [12]:
train.shape

(498, 10)

In [13]:
validate.shape

(214, 10)

In [14]:
test.shape

(179, 10)

In [15]:
train.head()

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
583,0,1,36.0,0,0,40.125,1,1,0,0
165,1,3,9.0,0,2,20.525,0,1,0,1
50,0,3,7.0,4,1,39.6875,0,1,0,1
259,1,2,50.0,0,1,26.0,0,0,0,1
306,1,1,28.0,0,0,110.8833,1,0,0,0


In [17]:
X_train = train.drop(columns = ['survived'])
y_train = train.survived

X_validate = validate.drop(columns = ['survived'])
y_validate = validate.survived

X_test = test.drop(columns = ['survived'])
y_test = test.survived

In [18]:
X_train.shape, X_validate.shape, X_test.shape

((498, 9), (214, 9), (179, 9))

In [19]:
X_train.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
583,1,36.0,0,0,40.125,1,1,0,0
165,3,9.0,0,2,20.525,0,1,0,1
50,3,7.0,4,1,39.6875,0,1,0,1
259,2,50.0,0,1,26.0,0,0,0,1
306,1,28.0,0,0,110.8833,1,0,0,0


### Modeling

In [20]:
# Positive case: Did not survive
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [23]:
# since most did not survive, use mode as baseline

baseline = y_train.mode()

#boolean array
baseline_prediction = y_train == 0

baseline_accuracy = round(baseline_prediction.mean(),2)
print(f'Baseline Accuracy: {baseline_accuracy}') 

Baseline Accuracy: 0.62


In [24]:
forest1 = RandomForestClassifier(max_depth = 1, random_state = 123)

# fit the model on train

forest1.fit(X_train, y_train)

# Use the model 
# We'll evaluate the model's performance on train and only train

y_predictions = forest1.predict(X_train)

#produce the classification report on the y values and this models predicted y values
report = classification_report(y_train, y_predictions, output_dict = True)
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.773481,0.801471,0.781124,0.787476,0.784216
recall,0.912052,0.570681,0.781124,0.741366,0.781124
f1-score,0.83707,0.666667,0.781124,0.751868,0.771715
support,307.0,191.0,0.781124,498.0,498.0


In [30]:
#confusion matrix

labels = ['Actually Died', 'Actually Survived']
col_labels = ['Pred. Died', 'Pred. Survived']

pd.DataFrame(confusion_matrix(y_predictions , y_train), index = labels, columns = col_labels)

Unnamed: 0,Pred. Died,Pred. Survived
Actually Died,280,82
Actually Survived,27,109


In [36]:
#must put it as y_train, y_predictions to follow confusion matrix
#TN, FP, FN, TP = confusion_matrix(y_train, y_predictions).ravel()
ALL = TP + TN + FP + FN
TP, FP, FN, TN = confusion_matrix(y_train, y_predictions).ravel()

TP, TN, FP, FN

#TP = 280 (pred died, died)
#TN = 109 (pred survived, survived)
#FP = 27 (pred died, survived)
#FN = 82 (pred survived, died)

(280, 109, 27, 82)

In [37]:
accuracy = (TP + TN)/ ALL
print(f'Accuracy: {accuracy}')

true_positive_rate = TP / (TP + FN)
print(f'True Positive Rate: {true_positive_rate}')

false_positive_rate = FP / (FP + TN)
print(f'False Positive Rate: {false_positive_rate}')

precision = TP / (TP + FP)
print(f'Precision: {precision}')

recall = TP / (TP + FN)
print(f'Recall: {recall}')

f1_score = 2 * (precision * recall) / (precision + recall)
print(f'F1 Score: {f1_score}')

support_pos = TP + FN
print(f'Support (0): {support_pos}')

support_neg = FP + TN
print(f'Support (1): {support_neg}')

Accuracy: 0.7811244979919679
True Positive Rate: 0.7734806629834254
False Positive Rate: 0.19852941176470587
Precision: 0.9120521172638436
Recall: 0.7734806629834254
F1 Score: 0.8370702541106129
Support (0): 362
Support (1): 136


# Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [51]:
forest10 = RandomForestClassifier(max_depth = 10, min_samples_leaf = 1, random_state = 123)

# fit the model on train

forest10.fit(X_train, y_train)

# Use the model 
# We'll evaluate the model's performance on train and only train

y_predictions = forest10.predict(X_train)

#produce the classification report on the y values and this models predicted y values
report = classification_report(y_train, y_predictions, output_dict = True)
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.953416,1.0,0.96988,0.976708,0.971283
recall,1.0,0.921466,0.96988,0.960733,0.96988
f1-score,0.976153,0.959128,0.96988,0.96764,0.969623
support,307.0,191.0,0.96988,498.0,498.0


# Evaluate your results using the model score, confusion matrix, and classification report.

In [52]:
#confusion matrix

labels = ['Actually Died', 'Actually Survived']
col_labels = ['Pred. Died', 'Pred. Survived']

pd.DataFrame(confusion_matrix(y_predictions , y_train), index = labels, columns = col_labels)

Unnamed: 0,Pred. Died,Pred. Survived
Actually Died,307,15
Actually Survived,0,176


# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [53]:
#must put it as y_train, y_predictions to follow confusion matrix
#TN, FP, FN, TP = confusion_matrix(y_train, y_predictions).ravel()
ALL = TP + TN + FP + FN
TP, FP, FN, TN = confusion_matrix(y_train, y_predictions).ravel()

TP, TN, FP, FN

#TP =  307 (pred died, died)
#TN = 176 (pred survived, survived)
#FP = 0 (pred died, survived)
#FN = 15 (pred survived, died)

(307, 176, 0, 15)

In [54]:
accuracy = (TP + TN)/ ALL
print(f'Accuracy: {accuracy}')

true_positive_rate = TP / (TP + FN)
print(f'True Positive Rate: {true_positive_rate}')

false_positive_rate = FP / (FP + TN)
print(f'False Positive Rate: {false_positive_rate}')

precision = TP / (TP + FP)
print(f'Precision: {precision}')

recall = TP / (TP + FN)
print(f'Recall: {recall}')

f1_score = 2 * (precision * recall) / (precision + recall)
print(f'F1 Score: {f1_score}')

support_pos = TP + FN
print(f'Support (0): {support_pos}')

support_neg = FP + TN
print(f'Support (1): {support_neg}')

Accuracy: 0.9698795180722891
True Positive Rate: 0.953416149068323
False Positive Rate: 0.0
Precision: 1.0
Recall: 0.953416149068323
F1 Score: 0.9761526232114468
Support (0): 322
Support (1): 176


### Loop to determine different max_depth outcomes

In [40]:
for i in range (2, 11):
    # make rf model
    forest = RandomForestClassifier(max_depth = i, random_state = 123)
    
    # fit rf model (on train)
    forest = forest.fit(X_train, y_train)
    
    #use the model to eval. predictions
    y_predictions = forest.predict(X_train)
    
    #classification report
    report = classification_report(y_train, y_predictions, output_dict = True)
    print(f'Tree with max depth of {i}')
    print(pd.DataFrame(report))
    print()

Tree with max depth of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.815029    0.835526  0.821285    0.825278      0.822890
recall       0.918567    0.664921  0.821285    0.791744      0.821285
f1-score     0.863706    0.740525  0.821285    0.802115      0.816462
support    307.000000  191.000000  0.821285  498.000000    498.000000

Tree with max depth of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.836257    0.865385  0.845382    0.850821      0.847429
recall       0.931596    0.706806  0.845382    0.819201      0.845382
f1-score     0.881356    0.778098  0.845382    0.829727      0.841753
support    307.000000  191.000000  0.845382  498.000000    498.000000

Tree with max depth of 4
                    0           1  accuracy   macro avg  weighted avg
precision    0.842566    0.883871  0.855422    0.863218      0.858408
recall       0.941368    0.717277  0.855422    0.829323      0.855422
f1-score     

In [42]:
metrics = []

for i in range(2, 25):
    #make model
    forest = RandomForestClassifier(max_depth = i, random_state = 123)
    
    #fit model on train
    forest = forest.fit(X_train, y_train)
    
    #use model to evaluate models performance on train, first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)
    
    output = {
        'max_depth': i,
        'train_accuracy': in_sample_accuracy,
        'validate_accuracy': out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df['difference'] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,2,0.821285,0.771028,0.050257
1,3,0.845382,0.794393,0.050989
2,4,0.855422,0.799065,0.056356
3,5,0.865462,0.803738,0.061724
4,6,0.891566,0.813084,0.078482
5,7,0.927711,0.82243,0.105281
6,8,0.945783,0.808411,0.137372
7,9,0.957831,0.803738,0.154093
8,10,0.96988,0.813084,0.156795
9,11,0.977912,0.808411,0.1695


### Increasing min_samples_per_leaf, decreasing max_depth

In [44]:
metrics = []
max_depth = 20

for i in range( 2, max_depth):
    
    #make the model
    depth = max_depth - i
    n_samples = i
    forest = RandomForestClassifier(max_depth = depth, min_samples_leaf = n_samples, random_state = 123)
    
    #fit the model (on train only)
    
    forest = forest.fit(X_train, y_train)
    
    #Use the model to evaluate performance
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)
    
    output = {
        'min_samples_per_leaf': n_samples,
        'max_depth': depth,
        'train_accuracy': in_sample_accuracy,
        'validate_accuracy': out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df['difference'] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
0,2,18,0.925703,0.82243,0.103273
1,3,17,0.901606,0.817757,0.083849
2,4,16,0.88755,0.817757,0.069793
3,5,15,0.87751,0.780374,0.097136
4,6,14,0.871486,0.799065,0.072421
5,7,13,0.869478,0.78972,0.079758
6,8,12,0.865462,0.794393,0.071069
7,9,11,0.85743,0.794393,0.063037
8,10,10,0.85743,0.785047,0.072383
9,11,9,0.849398,0.785047,0.064351


### Increase min_samples_per_leaf and increase max_depth

In [48]:
metrics = []
max_depth = 20

for i in range(2, max_depth):
    depth = i
    n_samples = i
    forest = RandomForestClassifier(max_depth = depth, min_samples_leaf = n_samples, random_state = 123)
    
    #fit model on train only
    
    forest = forest.fit (X_train, y_train)
    
    #use model and evaluate performance
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)
    
    output = {
        'min_samples_per_leaf': n_samples,
        'max_depth': depth,
        'train_accuracy':in_sample_accuracy,
        'validate_accuracy': out_of_sample_accuracy
    }
    
    metrics.append(output)

df = pd.DataFrame(metrics)
df['difference'] = df.train_accuracy - df.validate_accuracy
df    

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
0,2,2,0.821285,0.771028,0.050257
1,3,3,0.845382,0.785047,0.060335
2,4,4,0.84739,0.794393,0.052997
3,5,5,0.859438,0.799065,0.060372
4,6,6,0.861446,0.799065,0.06238
5,7,7,0.863454,0.78972,0.073734
6,8,8,0.863454,0.78972,0.073734
7,9,9,0.855422,0.794393,0.061029
8,10,10,0.85743,0.785047,0.072383
9,11,11,0.849398,0.785047,0.064351


### Fixed depth and increasing min_samples_leaf

In [50]:
metrics = []

for i in range(2, 50):
    
    #make model
    depth = 5
    n_samples = i
    forest = RandomForestClassifier(max_depth = depth, min_samples_leaf = n_samples, random_state = 123)
    
    #fit model on train only
    forest = forest.fit(X_train, y_train)
    
    #use model and evaluate performance
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)
    
    output = {
        'min_samples_per_leaf': n_samples,
        'max_depth': depth,
        'train_accuracy': in_sample_accuracy,
        'validate_accuracy': out_of_sample_accuracy

    }

    metrics.append(output)
    
df = pd.DataFrame(metrics)
df['difference'] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
0,2,5,0.861446,0.794393,0.067053
1,3,5,0.859438,0.78972,0.069718
2,4,5,0.85743,0.78972,0.06771
3,5,5,0.859438,0.799065,0.060372
4,6,5,0.859438,0.78972,0.069718
5,7,5,0.851406,0.785047,0.066359
6,8,5,0.855422,0.78972,0.065702
7,9,5,0.845382,0.785047,0.060335
8,10,5,0.84739,0.780374,0.067016
9,11,5,0.839357,0.780374,0.058984
