## scikit-learn tip #16

You can cross-validate and grid search an entire pipeline!

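Preprocessing steps will automatically occur AFTER each cross-validation split, so they are fit only on the training portion of each split. This prevents data leakage from the held-out portion, which is critical if you want meaningful scores.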

In [1]:
import pandas as pd

# Read the training CSV from the shortened URL into a DataFrame
# (this cell needs network access to run).
df = pd.read_csv(filepath_or_buffer='http://bit.ly/kaggletrain')

In [2]:
# Feature columns: 'Sex' is one-hot encoded and 'Name' is vectorized further down.
cols = ['Sex', 'Name']

# X holds the selected feature columns; y is the 'Survived' target column.
X, y = df[cols], df['Survived']

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer

In [4]:
# One transformer per feature: one-hot encoding and token counting.
ohe = OneHotEncoder()
vect = CountVectorizer()

# 'Sex' is selected with a list (the encoder receives 2-D input), whereas
# 'Name' is a bare string (CountVectorizer needs 1-D input of documents).
ct = make_column_transformer(
    (ohe, ['Sex']),
    (vect, 'Name'),
)

In [5]:
from sklearn.linear_model import LogisticRegression

# The 'liblinear' solver is set explicitly because it supports both the 'l1'
# and 'l2' penalties; random_state is fixed so repeated runs give the same scores.
clf = LogisticRegression(
    random_state=1,
    solver='liblinear',
)

In [6]:
from sklearn.pipeline import make_pipeline
# Chain the preprocessing (ct) and the classifier (clf) into a single estimator.
# make_pipeline names each step after its lowercased class name, which is why
# the grid-search keys begin with 'columntransformer' and 'logisticregression'.
pipe = make_pipeline(ct, clf)

In [7]:
from sklearn.model_selection import cross_val_score

# Score the whole pipeline (preprocessing + model, not just the model) with
# 5-fold cross-validation, then display the mean accuracy across the folds.
fold_scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=5)
fold_scores.mean()

0.8024543343167408

### Find optimal tuning parameters for the entire pipeline

In [12]:
# Search grid, keyed by '<step name>__<parameter>'; the vectorizer sits inside
# the column transformer, so its key is nested two levels deep.
params = {
    'columntransformer__countvectorizer__min_df': [1, 2],
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__penalty': ['l1', 'l2'],
}

In [13]:
from sklearn.model_selection import GridSearchCV

# Exhaustively evaluate every combination in `params` (2 x 3 x 2 = 12),
# scoring each one by accuracy under 5-fold cross-validation.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid.fit(X, y);  # the trailing ';' hides the fitted-estimator repr

In [14]:
# What was the best score found during the search?
# This is the mean cross-validated accuracy of the best parameter combination.
grid.best_score_

0.8215177954930638

In [15]:
# Which combination of parameters produced the best score?
# The keys use the same '<step name>__<parameter>' names as the search grid.
grid.best_params_

{'columntransformer__countvectorizer__min_df': 1,
 'logisticregression__C': 1,
 'logisticregression__penalty': 'l1'}