In [134]:
##### Grid Search

In [135]:
# https://scikit-learn.org/stable/modules/grid_search.html
#
# Hyper-parameters are parameters that are not directly learnt within estimators. In scikit-learn they are passed as 
# arguments to the constructor of the estimator classes. Typical examples include C, kernel and gamma for Support Vector 
# Classifier, alpha for Lasso, etc.
#
# It is possible and recommended to search the hyper-parameter space for the best cross validation score.
# 
# Any parameter provided when constructing an estimator may be optimized in this manner. 
#
# A search consists of:
# - an estimator (regressor or classifier such as sklearn.svm.SVC());
# - a parameter space;
# - a method for searching or sampling candidates;
# - a cross-validation scheme; and
# - a score function.
#
# Some models allow for specialized, efficient parameter search strategies (see the scikit-learn guide linked above).
# Two generic approaches to sampling search candidates are provided in scikit-learn: for given values, GridSearchCV 
# exhaustively considers all parameter combinations, while RandomizedSearchCV can sample a given number of candidates 
# from a parameter space with a specified distribution. 
#

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the training data from 'train.csv' in the working directory
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Derive 'Deck' from the first character of 'Cabin' (e.g. 'C85' -> 'C').
# The vectorized .str accessor propagates missing cabins as NaN on its own,
# so no per-row lambda or explicit null check is needed; an empty string
# yields NaN instead of raising IndexError.
# NOTE(review): .str requires 'Cabin' to be a string/object column -- confirm
# this holds for any other CSV fed through this cell.
df['Deck'] = df['Cabin'].str[0]
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,


In [6]:
# set up data: the target is 'Survived', the features are every other column
from sklearn.model_selection import train_test_split
y = df['Survived']
X = df.drop(columns=['Survived'])

### Hold out 20% of the rows for testing (random_state=1 for a repeatable split).
### Each piece is copied explicitly to deal with SettingWithCopyWarning, and to
### ensure we are working with a copy of the data and not a view.
#https://github.com/scikit-learn/scikit-learn/issues/8723#issuecomment-416513938
#http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#evaluation-order-matters
Xtrain, Xtest, ytrain, ytest = (
    part.copy()
    for part in train_test_split(X, y, test_size=0.2, random_state=1)
)

In [7]:
# Numeric preprocessing pipeline: fill missing 'Age' values with the median,
# then discretize into 5 quantile-based bins encoded as ordinal values.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
numeric_features = ['Age']
numeric_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
numeric_binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
numeric_transformer = Pipeline(steps=[
    ('si', numeric_imputer),
    ('kbd', numeric_binner),
])
# quick manual check: numeric_transformer.fit_transform(df[numeric_features])

In [8]:
# Categorical preprocessing pipeline: replace missing values with the
# placeholder category 'X', then one-hot encode into a dense integer array.
# Categories not seen during fit are ignored at transform time.
import inspect
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
categorical_features = ['Pclass', 'Sex', 'Deck']
# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so passing
# `sparse=False` unconditionally raises TypeError on current releases.
# Use whichever keyword the installed version accepts.
ohe_dense_kwarg = (
    'sparse_output'
    if 'sparse_output' in inspect.signature(OneHotEncoder).parameters
    else 'sparse'
)
categorical_transformer = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='X')),
    ('ohe', OneHotEncoder(dtype=int, handle_unknown='ignore', **{ohe_dense_kwarg: False}))])
# quick manual check: categorical_transformer.fit_transform(df[categorical_features])

In [9]:
# Route each group of columns through its own preprocessing pipeline and
# concatenate the results into a single feature matrix.
from sklearn.compose import ColumnTransformer
column_pipelines = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_pipelines, remainder='drop')

# remainder='drop' (the default) keeps only the columns listed in `transformers`;
# every other column is discarded from the output.
# The alternative, remainder='passthrough', would instead pass all unlisted
# columns through untransformed and concatenate them with the transformers' output.

In [10]:
# Chain preprocessing ('pp') and the random forest model ('rfc') into one
# estimator; the step names are what the grid-search parameter keys refer to.
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(random_state=1)
clf = Pipeline(steps=[
    ('pp', preprocessor),
    ('rfc', forest),
])

In [11]:
# Set up an exhaustive grid search over the random forest's hyper-parameters.
# Keys use the '<pipeline step>__<parameter>' form, so 'rfc__...' targets the
# 'rfc' step of the `clf` pipeline.
import inspect
from sklearn.model_selection import GridSearchCV
param_grid = {
    'rfc__n_estimators': [100, 200, 300, 400, 500, 1000],
    'rfc__criterion': ['gini', 'entropy']
}
# 5-fold cross-validation; training scores are not recorded.
search_kwargs = {'cv': 5, 'return_train_score': False}
# The `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises TypeError. Only pass iid=False on versions that still
# accept it, which keeps the original scoring behaviour there.
if 'iid' in inspect.signature(GridSearchCV).parameters:
    search_kwargs['iid'] = False
gscv = GridSearchCV(clf, param_grid, **search_kwargs)

In [12]:
# Run the grid search on the training split, then report (in order) the best
# pipeline, its mean cross-validation score, the winning parameter
# combination, and the full per-candidate results.
gscv.fit(Xtrain, ytrain)
search_report = (
    gscv.best_estimator_,
    gscv.best_score_,
    gscv.best_params_,
    gscv.cv_results_,
)
for item in search_report:
    print(item, "\n")

Pipeline(memory=None,
     steps=[('pp', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('si', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('kbd',...mators=100, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False))]) 

0.8076295794905324 

{'rfc__criterion': 'gini', 'rfc__n_estimators': 100} 

{'mean_fit_time': array([0.07170849, 0.11341143, 0.16682329, 0.22422237, 0.27260556,
       0.53810372, 0.06021919, 0.11878676, 0.16824765, 0.22395883,
       0.28180661, 0.54904761]), 'std_fit_time': array([0.02555379, 0.0007662 , 0.00102855, 0.00771929, 0.00136966,
       0.00193472, 0.00082581, 0.00388516, 0.00067053, 0.00137615,
       0.00808521, 0.00375451]), 'mean_score_time': array([0.00817571, 0.01339154, 0.0187736 , 0.02360744, 0.02819266,
       0.05405293, 0.00806823, 0.013

In [13]:
# Score the best pipeline found by the search on the held-out test split:
# accuracy, confusion matrix, then the per-class classification report.
from sklearn import metrics
ypred = gscv.best_estimator_.predict(Xtest)
for metric_fn in (metrics.accuracy_score,
                  metrics.confusion_matrix,
                  metrics.classification_report):
    print(metric_fn(ytest, ypred))

0.7541899441340782
[[87 19]
 [25 48]]
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       106
           1       0.72      0.66      0.69        73

   micro avg       0.75      0.75      0.75       179
   macro avg       0.75      0.74      0.74       179
weighted avg       0.75      0.75      0.75       179

