In [207]:
##### Pipelines and Column Transformers

In [208]:
# The overall steps we followed in Exercise 6a are:

# 0) we transformed Cabin to Deck

# 1) for numerical column, Age:
#  - we first impute missing values with mean
#  - we then use k-bins discretizer on it

# 2) for categorical columns, Deck, Pclass, and Sex:
#  - we first impute missing values with constant 'X'
#  - we then use one-hot-encoding on them

# 3) then, in cases where we were going to use the transformed columns we had to drop the original columns 

# 4) then we take the transformed data and build the model

# Note that in doing so, we had to keep track of all the transformations we were doing in steps 1-3
#  - we had to do all the transformations with "fit_transform" on training data,
#    (including remembering to drop the columns we didn't want to use anymore)
#  - and then repeat all of the transformations with "transform" on test data

# We also had to keep track of building the model in step 4:
#  - we had to fit the model on the transformed training data,
#  - and then predict using the transformed test data

# We can use ColumnTransformers and Pipelines to make life easier for us!

In [209]:
# https://scikit-learn.org/stable/modules/compose.html#pipeline
#
# Pipeline can be used to chain multiple estimators into one. 
# This is useful as there is often a fixed sequence of steps in processing the data, 
# for example feature selection, normalization and classification. 
#
# Pipeline serves multiple purposes here:
# - Convenience and encapsulation
#    You only have to call fit and predict once on your data to fit a whole sequence of estimators.
# - Joint parameter selection
#   You can grid search over parameters of all estimators in the pipeline at once.
# - Safety
#   Pipelines help avoid leaking statistics from your test data into the trained model in cross-validation, 
#   by ensuring that the same samples are used to train the transformers and predictors.
#
# All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). 
# The last estimator may be any type (transformer, classifier, etc.).
#
# The Pipeline is built using a list of (key, value) pairs, 
# where the key is a string containing the name you want to give this step and value is an estimator object.

In [210]:
# https://scikit-learn.org/stable/modules/compose.html#column-transformer
# 
# Warning: The compose.ColumnTransformer class is experimental and the API is subject to change.
#
# Many datasets contain features of different types, say text, floats, and dates, where each type of feature requires 
# separate preprocessing or feature extraction steps. Often it is easiest to preprocess data before applying scikit-learn 
# methods, for example using pandas. Processing your data before passing it to scikit-learn might be problematic for 
# one of the following reasons:
# - Incorporating statistics from test data into the preprocessors makes cross-validation scores unreliable 
#   (known as data leakage), for example in the case of scalers or imputing missing values.
# - You may want to include the parameters of the preprocessors in a parameter search.
#
# The ColumnTransformer helps performing different transformations for different columns of the data, 
# within a Pipeline that is safe from data leakage and that can be parametrized. ColumnTransformer works on arrays, 
# sparse matrices, and pandas DataFrames.
#
# To each column, a different transformation can be applied, such as preprocessing or a specific feature extraction method.

In [211]:
# code adapted from: 
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

In [212]:
import numpy as np
import pandas as pd

In [213]:
# Load the Titanic training CSV into a DataFrame and preview the first rows.
train_csv_path = 'data/kaggleTitanic/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [214]:
# Derive 'Deck' from 'Cabin': the deck is the first character of the cabin code.
# The vectorised .str accessor leaves missing cabins as NaN and, unlike indexing
# x[0] inside apply(), does not raise on an empty-string cabin (it yields NaN).
df['Deck'] = df['Cabin'].str[0]
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,


In [215]:
# Separate the target column ('Survived') from the predictors, then hold out 20% of rows for testing.
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Survived'])
y = df['Survived']

# Take an explicit copy of each split so that we work with independent data and not a view
# (this is what avoids pandas' SettingWithCopyWarning later on).
# https://github.com/scikit-learn/scikit-learn/issues/8723#issuecomment-416513938
# http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#evaluation-order-matters
Xtrain, Xtest, ytrain, ytest = (
    part.copy() for part in train_test_split(X, y, test_size=0.2, random_state=1)
)

In [216]:
# Preprocessing for the numeric column(s): impute missing values, then discretize into bins.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer

numeric_features = ['Age']

# NOTE(review): the imputer below uses the median, whereas the overview comment at the top
# of the notebook describes mean imputation -- confirm which one is intended.
age_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# Five quantile-based bins, encoded as ordinal bin indices.
age_binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')

numeric_transformer = Pipeline(steps=[('si', age_imputer), ('kbd', age_binner)])
# Optional check of this transformer on its own (uncomment to run):
# numeric_transformer.fit_transform(df[numeric_features])

In [217]:
# set up preprocessing pipeline for categorical data:
#  - missing values are replaced with the constant 'X'
#  - the result is one-hot encoded (categories unseen during fit are ignored at transform time)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

categorical_features = ['Pclass', 'Sex', 'Deck']

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` parameter to `sparse_output`, and 1.4
# removed the old name, so `sparse=False` raises TypeError on current releases.
# Try the new spelling first and fall back to the old one for older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, dtype=int, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')

categorical_transformer = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='X')),
    ('ohe', one_hot_encoder)])
#categorical_transformer.fit_transform(df[categorical_features])

In [218]:
# Combine the numeric and categorical preprocessing pipelines, each applied to its own columns.
from sklearn.compose import ColumnTransformer

column_pipelines = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
# remainder='drop' discards every column not listed above; use remainder='passthrough' to keep them.
preprocessor = ColumnTransformer(transformers=column_pipelines, remainder='drop')

# By default (remainder='drop'), only the columns specified in transformers are transformed and combined in the output,
# and the non-specified columns are dropped. By specifying remainder='passthrough',
# all remaining columns that were not specified in transformers will be automatically passed through.
# This subset of columns is concatenated with the output of the transformers.

In [219]:
# Chain preprocessing and the classifier into a single estimator: preprocessing -> random forest.
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Fixed random_state keeps the forest reproducible across runs.
forest = RandomForestClassifier(n_estimators=100, random_state=1)
clf = Pipeline(steps=[('pp', preprocessor), ('rfc', forest)])

In [220]:
# Fit the whole pipeline (preprocessing steps and classifier) on the training split only.
clf.fit(X=Xtrain, y=ytrain)

Pipeline(memory=None,
     steps=[('pp', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('si', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('kbd',...mators=100, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False))])

In [221]:
# Score the fitted pipeline on the held-out test split.
from sklearn import metrics

ypred = clf.predict(Xtest)

# Print accuracy, the confusion matrix, and the per-class report, in that order.
for score_fn in (metrics.accuracy_score,
                 metrics.confusion_matrix,
                 metrics.classification_report):
    print(score_fn(ytest, ypred))

0.7541899441340782
[[87 19]
 [25 48]]
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       106
           1       0.72      0.66      0.69        73

   micro avg       0.75      0.75      0.75       179
   macro avg       0.75      0.74      0.74       179
weighted avg       0.75      0.75      0.75       179

