## Additional Learning Resources
Refer to [scikit-learn documentation](https://scikit-learn.org/stable/) and the [Pandas user guide](https://pandas.pydata.org/docs/) for detailed explanations of the functions used in this notebook.
For a quick refresher on splitting data:
```python
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
```


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [2]:
data = pd.read_csv("train.csv")

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_df = data[['Age', 'SibSp', 'Parch', 'Fare', 'Survived']]

train_df.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Survived
0,22.0,1,0,7.25,0
1,38.0,1,0,71.2833,1
2,26.0,0,0,7.925,1
3,35.0,1,0,53.1,1
4,35.0,0,0,8.05,0


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   SibSp     891 non-null    int64  
 2   Parch     891 non-null    int64  
 3   Fare      891 non-null    float64
 4   Survived  891 non-null    int64  
dtypes: float64(2), int64(3)
memory usage: 34.9 KB


In [6]:
X_train = train_df.drop('Survived', axis=1)
y_train = train_df['Survived']

In [9]:
pipeline = Pipeline([('imputer', SimpleImputer()), 
                     ('scaler', StandardScaler()),
                     ('clf', RandomForestClassifier())])

In [10]:
pipeline.named_steps

{'imputer': SimpleImputer(),
 'scaler': StandardScaler(),
 'clf': RandomForestClassifier()}

In [11]:
hyperparam_grid = {
    'imputer__strategy': ['mean', 'median'], 
    'clf__n_estimators': [5, 10, 100, 200],
    'clf__min_samples_leaf': [5, 10, 20]
}


In [12]:
gs = GridSearchCV(estimator=pipeline, 
                  param_grid=hyperparam_grid,                    
                  cv=5,                                         
                  scoring='f1')

In [13]:
grid_search = gs.fit(X_train, y_train)

In [14]:
pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__min_samples_leaf,param_clf__n_estimators,param_imputer__strategy,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.005984,4.6e-05,0.00151,8e-06,10,5,mean,"{'clf__min_samples_leaf': 10, 'clf__n_estimato...",0.431034,0.571429,0.561404,0.676923,0.633333,0.574825,0.083324,1
5,0.085594,0.000251,0.006661,2.9e-05,5,100,median,"{'clf__min_samples_leaf': 5, 'clf__n_estimator...",0.385965,0.582677,0.596491,0.655462,0.640625,0.572244,0.096951,2
14,0.159216,0.000427,0.01156,2.7e-05,10,200,mean,"{'clf__min_samples_leaf': 10, 'clf__n_estimato...",0.396396,0.576271,0.578947,0.672131,0.627119,0.570173,0.093756,3
6,0.168934,0.000877,0.012096,0.000138,5,200,mean,"{'clf__min_samples_leaf': 5, 'clf__n_estimator...",0.382609,0.582677,0.615385,0.644628,0.624,0.56986,0.09573,4
7,0.167678,0.000758,0.011964,4.4e-05,5,200,median,"{'clf__min_samples_leaf': 5, 'clf__n_estimator...",0.382609,0.578125,0.608696,0.661157,0.617886,0.569695,0.097243,5
2,0.010598,6.3e-05,0.001838,6e-06,5,10,mean,"{'clf__min_samples_leaf': 5, 'clf__n_estimator...",0.424779,0.583333,0.578947,0.66129,0.589147,0.567499,0.077481,6
18,0.009553,6.2e-05,0.001744,6e-06,20,10,mean,"{'clf__min_samples_leaf': 20, 'clf__n_estimato...",0.414414,0.564103,0.54717,0.666667,0.630631,0.564597,0.086761,7
4,0.085988,0.000297,0.006797,0.000156,5,100,mean,"{'clf__min_samples_leaf': 5, 'clf__n_estimator...",0.375,0.569231,0.591304,0.661157,0.617886,0.562916,0.098824,8
15,0.159172,0.000296,0.011582,4.5e-05,10,200,median,"{'clf__min_samples_leaf': 10, 'clf__n_estimato...",0.4,0.576271,0.571429,0.666667,0.581197,0.559113,0.08697,9
0,0.012974,0.005603,0.002851,0.001019,5,5,mean,"{'clf__min_samples_leaf': 5, 'clf__n_estimator...",0.442623,0.614173,0.596774,0.566667,0.571429,0.558333,0.060375,10


In [15]:
best_pipe = gs.best_estimator_

In [16]:
best_pipe

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('clf',
                 RandomForestClassifier(min_samples_leaf=10, n_estimators=5))])

In [17]:
best_pipe.predict(X_train)

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,