## Cross-validate and grid search an entire pipeline!

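Preprocessing steps are fit automatically AFTER each cross-validation split, using only that split's training portion. This is critical if you want meaningful scores: fitting the preprocessing on the full dataset first would leak information from the validation fold into training.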

In [1]:
import pandas as pd
# Read the Titanic training CSV into a DataFrame. The path is relative, so the
# notebook must be run from the directory that contains `Datasets/`.
df = pd.read_csv('Datasets/08_3_titanic_train.csv')

In [2]:
# Feature columns: one categorical ('Sex') and one free-text ('Name').
cols = ['Sex', 'Name']
# X keeps both feature columns as a DataFrame; y is the 'Survived' target column.
X = df.loc[:, cols]
y = df.loc[:, 'Survived']

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer

In [4]:
# Encoder for the categorical 'Sex' column.
ohe = OneHotEncoder()
# Vectorizer for the free-text 'Name' column.
vect = CountVectorizer()
# The column selectors differ on purpose: CountVectorizer expects 1-D input,
# so 'Name' is given as a bare string, while 'Sex' is given as a list.
ct = make_column_transformer(
    (ohe, ['Sex']),
    (vect, 'Name'),
)

In [5]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier using the liblinear solver; random_state is
# pinned so repeated runs use the same seed.
clf = LogisticRegression(
    random_state=1,
    solver='liblinear',
)

In [6]:
from sklearn.pipeline import make_pipeline
# Chain the column transformer and the classifier into one estimator, so the
# preprocessing and the model are fit and applied together.
pipe = make_pipeline(ct, clf)

In [7]:
# Show the pipeline's repr; the auto-generated step names that appear here
# ('columntransformer', 'logisticregression') are needed later to address parameters.
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(), ['Sex']),
                                                 ('countvectorizer',
                                                  CountVectorizer(),
                                                  'Name')])),
                ('logisticregression',
                 LogisticRegression(random_state=1, solver='liblinear'))])

In [8]:
# List every parameter of the pipeline and its nested steps. Nested parameters
# are named '<step>__<parameter>', which is the key format used for tuning.
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(), ['Sex']),
                                   ('countvectorizer', CountVectorizer(), 'Name')])),
  ('logisticregression',
   LogisticRegression(random_state=1, solver='liblinear'))],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(), ['Sex']),
                                 ('countvectorizer', CountVectorizer(), 'Name')]),
 'logisticregression': LogisticRegression(random_state=1, solver='liblinear'),
 'columntransformer__n_jobs': None,
 'columntransformer__remainder': 'drop',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('onehotencoder',
   OneHotEncoder(),
   ['Sex']),
  ('countvectorizer', CountVectorizer(), 'Name')],
 'columntransformer__verbose': False,
 'columntransformer__onehotencoder': OneHotEncoder(),
 'columntransfor

## Cross-validate the entire pipeline (not just the model)

In [9]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the whole pipeline (preprocessing + model),
# passing the raw feature columns so the transformers are refit on every split.
score = cross_val_score(pipe, X, y, scoring='accuracy', cv=5)
# Per-fold scores on the first line, their mean on the second.
print(score, score.mean(), sep='\n')

[0.81564246 0.81460674 0.80337079 0.75280899 0.8258427 ]
0.8024543343167408


## Find optimal tuning parameters for the entire pipeline

In [10]:
# Parameter values to search. Each key is '<step name>__<parameter>' (nested
# steps chain further double underscores); each value lists the candidates.
params = {
    'columntransformer__countvectorizer__min_df': [1, 2],
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__penalty': ['l1', 'l2'],
}

In [11]:
# Display the assembled search grid to check the keys and candidate values.
params

{'columntransformer__countvectorizer__min_df': [1, 2],
 'logisticregression__C': [0.1, 1, 10],
 'logisticregression__penalty': ['l1', 'l2']}

In [12]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search: every combination of the values in `params` is scored
# with 5-fold cross-validated accuracy on the full pipeline.
grid = GridSearchCV(estimator=pipe, param_grid=params, scoring='accuracy', cv=5)
# The trailing semicolon keeps the fitted object's repr out of the cell output.
grid.fit(X, y);

In [13]:
# Display the search object: its repr shows the wrapped pipeline (the steps the
# search runs for each candidate) together with the parameter grid.
grid

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         ['Sex']),
                                                                        ('countvectorizer',
                                                                         CountVectorizer(),
                                                                         'Name')])),
                                       ('logisticregression',
                                        LogisticRegression(random_state=1,
                                                           solver='liblinear'))]),
             param_grid={'columntransformer__countvectorizer__min_df': [1, 2],
                         'logisticregression__C': [0.1, 1, 10],
                         

In [14]:
# List the search object's own settings plus the wrapped pipeline's parameters,
# which appear here with an additional 'estimator__' prefix.
grid.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('columntransformer',
   ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(), ['Sex']),
                                   ('countvectorizer', CountVectorizer(), 'Name')])),
  ('logisticregression',
   LogisticRegression(random_state=1, solver='liblinear'))],
 'estimator__verbose': False,
 'estimator__columntransformer': ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(), ['Sex']),
                                 ('countvectorizer', CountVectorizer(), 'Name')]),
 'estimator__logisticregression': LogisticRegression(random_state=1, solver='liblinear'),
 'estimator__columntransformer__n_jobs': None,
 'estimator__columntransformer__remainder': 'drop',
 'estimator__columntransformer__sparse_threshold': 0.3,
 'estimator__columntransformer__transformer_weights': None,
 'estimator__columntransformer__transformers': [('onehotencoder',
   OneHotEncoder(),
   ['Sex']),
  ('countvectorize

In [15]:
# What was the best score found during the search?
# (The search was configured with scoring='accuracy' and cv=5.)
grid.best_score_

0.8215177954930638

In [16]:
# Which combination of parameters produced the best score?
# The keys are the same '<step>__<parameter>' names used in the search grid.
grid.best_params_

{'columntransformer__countvectorizer__min_df': 1,
 'logisticregression__C': 1,
 'logisticregression__penalty': 'l1'}