In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from acquire import get_titanic_data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from adam_prepare import titanic_pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score

## 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [2]:
# TRAIN, VALIDATE, TEST DATA YOU ACQUIRED:

In [3]:
df = get_titanic_data()
train_test_split(df, train_size = 0.7, random_state = 42, stratify = df['survived'])

    

found data


[     passenger_id  survived  pclass     sex   age  sibsp  parch     fare  \
 748           748         0       1    male  19.0      1      0  53.1000   
 45             45         0       3    male   NaN      0      0   8.0500   
 28             28         1       3  female   NaN      0      0   7.8792   
 633           633         0       1    male   NaN      0      0   0.0000   
 403           403         0       3    male  28.0      1      0  15.8500   
 ..            ...       ...     ...     ...   ...    ...    ...      ...   
 476           476         0       2    male  34.0      1      0  21.0000   
 190           190         1       2  female  32.0      0      0  13.0000   
 736           736         0       3  female  48.0      1      3  34.3750   
 462           462         0       1    male  47.0      0      0  38.5000   
 136           136         1       1  female  19.0      0      2  26.2833   
 
     embarked   class deck  embark_town  alone  
 748        S   First    

In [4]:
# SPLIT DATA

# INPUT: 
'''DATAFRAME THAT WAS SPLIT (70% TRAIN, 15% VAL, 15% TEST) AND STRATIFIED USING TARGET VARIABLE FOR MAINTAINED 
DISTRIBUTION OF CLASSES AND GENERALIZATION PERFORMANCE ON NEW UNSEEN DATA '''


# OUTPUT: 
# train_test_split RETURNS SEPERATE pandas DATAFRAMES FOR EACH VARIABLE
# .shape RETURNS LIST OF TUPLES, WHERE EACH TUPLE REPRESENTS A MATRIX/ARRAY IN ROWS & COLUMNS. ALL COLUMNS SHOULD BE =

train, val, test = titanic_pipeline()
train.shape, val.shape, test.shape

found data


((623, 10), (134, 10), (134, 10))

______

In [5]:
# CREATE X & y FOR TRAIN & VALIDATE DATAFRAMES

# INPUT:
# X = ALL FEATURES/INDEPENDENT VARIABLES, WHERE IT REPRESENTS RELATIONAL FEATURES USED TO PREDICT GROUND TRUTH 
# y = TARGET/DEPENDENT VARIABLE, WHERE IT REPRESENTS GROUND TRUTH THAT THE MODEL LEARNS TO PREDICT

# PRUNE THE X_TRAIN AND X_VALIDATE DATAFRAMES AND EXTRACT TARGET VARIABLE
# YOU DROP THE TARGET VARIABLE AND ANY FEATURES NOT SIGNIFICANT USING pandas .drop() method

# OUTPUT:
# X RETURNS NEW pandas DATAFRAME WITHOUT SPECIFIED FEATURES
# Y RETURNS NEW pandas SERIES WITH INDICES FOR ALL DATAPOINTS FROM TARGET VARIABLE

X_train = train.drop(columns = ['survived', 'deck'])
y_train = train.survived

X_val = val.drop(columns = ['survived', 'deck'])
y_val = val.survived

y_train

748    0
45     0
28     1
633    0
403    0
      ..
476    0
190    1
736    0
462    0
136    1
Name: survived, Length: 623, dtype: int64

------

In [6]:
# PERFORM ONE HOT ENCODING ON X DATAFRAME USING pd.get_dummies WHERE CATEGORICAL DATA IS CONVERTED INTO BINARY MATRIX (0,1)

# INPUT:
# CATEGORICAL COLUMNS FROM YOUR X DATAFRAME
# USE drop_first=TRUE TO REDUCE MILTICOLLINEARITY WHERE YOU DROP THE FIRST CATEGORY LEVEL FOR EACH CATIGORICAL FEATURE

# OUPUT:
# NEW X DATAFRAME. COLUMNS SHOULD STILL BE =

X_train = pd.get_dummies(X_train, columns = ['sex'], drop_first = True)
X_train = pd.get_dummies(X_train, columns = ['class', 'embark_town'])

X_val = pd.get_dummies(X_val, columns = ['sex'], drop_first = True)
X_val = pd.get_dummies(X_val, columns = ['class', 'embark_town'])



In [7]:
X_train.shape, X_val.shape

((623, 12), (134, 12))

---

In [8]:
X_train.head()

Unnamed: 0,age,sibsp,parch,fare,alone,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
748,19.0,1,0,53.1,0,1,1,0,0,0,0,1
45,29.0,0,0,8.05,1,1,0,0,1,0,0,1
28,29.0,0,0,7.8792,1,0,0,0,1,0,1,0
633,29.0,0,0,0.0,1,1,1,0,0,0,0,1
403,28.0,1,0,15.85,0,1,0,0,1,0,0,1


---

In [9]:
# CREATE OBJECT AND TRAIN MODEL
# ADJUST HYPERPARAMETERS THAT BEST FIT MODEL

seed = 42

rf = RandomForestClassifier(min_samples_leaf = 1, max_depth = 10, random_state = seed)

---

In [10]:
# WHAT LIBRARY IS THIS METHOD FROM?
# USE .fit() METHOD TO AUTOMATICALLY OPTIMIZE INTERNAL PARAMETERS OF THE MODEL USING THE TRAINING DATA

# INPUT: 
# X & Y TRAIN DATAFRAMES/ALL RELATIONAL FEATURES & TARGET VARIABLE

# OUTPUT: 
# RANDOMFORESTCLASSIFIER FROM scikit-learn READY TO MAKE PREDICTIONS 

rf.fit(X_train, y_train)

---

In [11]:
# EVALUATE IMPORTANCE/WEIGHT OF EACH FEATURE USING feature_important_ METHOD

fi = pd.DataFrame({'feature': X_train.columns,
                  'importance': rf.feature_importances_})

fi.sort_values(by = 'importance', ascending = False)

Unnamed: 0,feature,importance
5,sex_male,0.304415
3,fare,0.227986
0,age,0.208397
8,class_Third,0.059482
2,parch,0.044518
1,sibsp,0.042962
6,class_First,0.033793
4,alone,0.021228
11,embark_town_Southampton,0.018501
7,class_Second,0.015583


---

In [12]:
# MAKE PREDICTIONS BASED OF RELATIONSHIPS YOUR MODEL LEARNED DURING TRAINING TO NEW DATA

# INPUT: 
# DATAFRAME WITH SAME STRUCTURE AND FORMAT AS THE DATA THE MODEL WAS TRAINED ON

# OUTPUT:
# ARRAY OF OUTCOMES (1,0)

y_pred = rf.predict(X_train)

y_pred[:5]

array([0, 0, 1, 0, 0])

---

In [13]:
# ESTIMATE THE PROBABILITY OF EACH PREDICTION

y_pred_proba = rf.predict_proba(X_train)

y_pred_proba

array([[0.63485783, 0.36514217],
       [0.92536919, 0.07463081],
       [0.05132852, 0.94867148],
       ...,
       [0.94831313, 0.05168687],
       [0.9052349 , 0.0947651 ],
       [0.01368421, 0.98631579]])

## 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [24]:
# COMPUTE THE ACCURACY OF YOUR MODEL USING .score() method

rf.score(X_train, y_train)

0.9550561797752809

---

In [19]:
print(confusion_matrix(y_train, y_pred))

[[380   4]
 [ 24 215]]


In [20]:
pd.crosstab(y_train, y_pred)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,380,4
1,24,215


---

In [22]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96       384
           1       0.98      0.90      0.94       239

    accuracy                           0.96       623
   macro avg       0.96      0.94      0.95       623
weighted avg       0.96      0.96      0.95       623



---

In [27]:
# EVALUATE ON OUT OF SAMPLE DATA

y_pred = rf.predict(X_val)

y_pred[:5]

array([0, 0, 0, 0, 0])

In [29]:
y_pred_proba = rf.predict_proba(X_val)

y_pred_proba[:5]

array([[0.53275   , 0.46725   ],
       [0.95798034, 0.04201966],
       [0.89504141, 0.10495859],
       [0.92256798, 0.07743202],
       [0.78294274, 0.21705726]])

In [31]:
print(confusion_matrix(y_val, y_pred))

[[74  8]
 [13 39]]


In [37]:
pd.crosstab(y_val, y_pred)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,74,8
1,13,39


In [43]:
a = rf.score(X_val, y_val)
a

0.8432835820895522

In [62]:
p = precision_score(y_val, y_pred)
p

0.8297872340425532

In [63]:
r = recall_score(y_val, y_pred)
r

0.75

In [30]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.90      0.88        82
           1       0.83      0.75      0.79        52

    accuracy                           0.84       134
   macro avg       0.84      0.83      0.83       134
weighted avg       0.84      0.84      0.84       134



## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [70]:
print(pd.DataFrame({'Accuracy': a,
                   'TP': 39/134,
                   'FP': 8/134,
                   'TN': 74/134,
                   'FN': 13/134,
                   'Precision': p,
                   'Recall': r,
                   'F1-score': .79,
                   'Suupport': 52}, index = [0]))

   Accuracy        TP        FP        TN        FN  Precision  Recall  \
0  0.843284  0.291045  0.059701  0.552239  0.097015   0.829787    0.75   

   F1-score  Suupport  
0      0.79        52  


## 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

## 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

## After making a few models, which one has the best performance (or closest metrics) on both train and validate?