## Pipelines explained using iris dataset

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
iris_df=load_iris()

In [3]:
iris_df.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [4]:
X_train,X_test,y_train,y_test=train_test_split(iris_df.data,iris_df.target,test_size=0.3,random_state=0)

## Pipelines Creation
    1. Data Preprocessing by using Standard Scaler
    2. Reduce Dimension using PCA
    3. Apply  Classifier

In [5]:
pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('lr_classifier',LogisticRegression(random_state=0))])

In [6]:
pipeline_dt=Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier())])

In [7]:
pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),
                     ('pca3',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier())])

In [8]:
## Lets make the list of pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

In [9]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [10]:
# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [11]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(X_test,y_test)))

Logistic Regression Test Accuracy: 0.8666666666666667
Decision Tree Test Accuracy: 0.9111111111111111
RandomForest Test Accuracy: 0.9111111111111111


In [12]:
for i,model in enumerate(pipelines):
    if model.score(X_test,y_test) > best_accuracy:
        best_accuracy=model.score(X_test,y_test)
        best_pipeline=model
        best_classifier=i
print('Classifier with best accuracy is \"{}\" with accuracy score {}.'.format(pipe_dict[best_classifier],best_accuracy))

Classifier with best accuracy is "Decision Tree" with accuracy score 0.9111111111111111.


## Pipelines Perform Hyperparameter Tuning Using Grid SearchCV

In [13]:
import numpy as np
from sklearn.model_selection import GridSearchCV

In [14]:
# Create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2','l1'],
                 "classifier__C": np.logspace(0, 4, 10)
                 },
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2'],
                 "classifier__C": np.logspace(0, 4, 10),
                 "classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_depth":[5,8,15,25,30,None],
                 "classifier__min_samples_leaf":[1,2,5,10,15,100],
                 "classifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

50 fits failed out of a total of 1920.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 4

In [15]:
pipe

Pipeline(steps=[('classifier', RandomForestClassifier())])

In [16]:
gridsearch

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('classifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid=[{'classifier': [LogisticRegression()],
                          'classifier__C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                          'classifier__penalty': ['l2', 'l...
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                          'classifier__penalty': ['l2'],
                          'classifier__solver': ['newton-cg', 'saga', 'sag',
                                                 'liblinear']},
                         {'classifier': [RandomForestClassifier()],
                          'classifier__max_depth': [5, 8, 15, 25, 30, None],
                          '

In [17]:
# convert results into a DataFrame
import pandas as pd
results = pd.DataFrame(best_model.cv_results_)[['params', 'mean_test_score', 'rank_test_score']]
results

Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'classifier': LogisticRegression(), 'classifi...",0.952381,44
1,"{'classifier': LogisticRegression(), 'classifi...",,375
2,"{'classifier': LogisticRegression(), 'classifi...",0.942857,83
3,"{'classifier': LogisticRegression(), 'classifi...",,376
4,"{'classifier': LogisticRegression(), 'classifi...",0.942857,83
...,...,...,...
379,"{'classifier': RandomForestClassifier(), 'clas...",0.933333,213
380,"{'classifier': RandomForestClassifier(), 'clas...",0.942857,83
381,"{'classifier': RandomForestClassifier(), 'clas...",0.361905,369
382,"{'classifier': RandomForestClassifier(), 'clas...",0.371429,321


In [18]:
# sort by test score
results.sort_values('rank_test_score')

Unnamed: 0,params,mean_test_score,rank_test_score
25,{'classifier': LogisticRegression(solver='saga...,0.980952,1
22,{'classifier': LogisticRegression(solver='saga...,0.980952,1
21,{'classifier': LogisticRegression(solver='saga...,0.980952,1
26,{'classifier': LogisticRegression(solver='saga...,0.980952,1
57,{'classifier': LogisticRegression(solver='saga...,0.980952,1
...,...,...,...
15,"{'classifier': LogisticRegression(), 'classifi...",,380
17,"{'classifier': LogisticRegression(), 'classifi...",,381
19,"{'classifier': LogisticRegression(), 'classifi...",,382
5,"{'classifier': LogisticRegression(), 'classifi...",,383


In [19]:
print(best_model.best_estimator_)
print(best_model.best_params_)
print(best_model.best_score_) # train-set accuracy score

Pipeline(steps=[('classifier', LogisticRegression(solver='saga'))])
{'classifier': LogisticRegression(solver='saga'), 'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
0.980952380952381


In [20]:
# test-set accuracy score
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

The mean accuracy of the model is: 0.9555555555555556


## Using MakePipelines In Sklearn

In [21]:
from sklearn.pipeline import make_pipeline

In [22]:
# Create a pipeline
pipe = make_pipeline((RandomForestClassifier()))
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"randomforestclassifier": [RandomForestClassifier()],
                 "randomforestclassifier__n_estimators": [10, 100, 1000],
                 "randomforestclassifier__max_depth":[5,8,15,25,30,None],
                 "randomforestclassifier__min_samples_leaf":[1,2,5,10,15,100],
                 "randomforestclassifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

In [23]:
pipe

Pipeline(steps=[('randomforestclassifier', RandomForestClassifier())])

In [24]:
gridsearch

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid=[{'randomforestclassifier': [RandomForestClassifier(max_depth=25,
                                                                            max_leaf_nodes=5,
                                                                            min_samples_leaf=5,
                                                                            n_estimators=10)],
                          'randomforestclassifier__max_depth': [5, 8, 15, 25,
                                                                30, None],
                          'randomforestclassifier__max_leaf_nodes': [2, 5, 10],
                          'randomforestclassifier__min_samples_leaf': [1, 2, 5,
                                                                       10, 15,
                                                       

In [25]:
# convert results into a DataFrame
import pandas as pd
results = pd.DataFrame(best_model.cv_results_)[['params', 'mean_test_score', 'rank_test_score']]
results

Unnamed: 0,params,mean_test_score,rank_test_score
0,{'randomforestclassifier': RandomForestClassif...,0.876190,251
1,{'randomforestclassifier': RandomForestClassif...,0.914286,217
2,{'randomforestclassifier': RandomForestClassif...,0.914286,217
3,{'randomforestclassifier': RandomForestClassif...,0.761905,269
4,{'randomforestclassifier': RandomForestClassif...,0.828571,259
...,...,...,...
319,{'randomforestclassifier': RandomForestClassif...,0.952381,6
320,{'randomforestclassifier': RandomForestClassif...,0.942857,49
321,{'randomforestclassifier': RandomForestClassif...,0.371429,271
322,{'randomforestclassifier': RandomForestClassif...,0.371429,271


In [26]:
# sort by test score
results.sort_values('rank_test_score')

Unnamed: 0,params,mean_test_score,rank_test_score
186,{'randomforestclassifier': RandomForestClassif...,0.971429,1
82,{'randomforestclassifier': RandomForestClassif...,0.961905,2
45,{'randomforestclassifier': RandomForestClassif...,0.961905,2
103,{'randomforestclassifier': RandomForestClassif...,0.961905,2
309,{'randomforestclassifier': RandomForestClassif...,0.961905,2
...,...,...,...
195,{'randomforestclassifier': RandomForestClassif...,0.361905,319
105,{'randomforestclassifier': RandomForestClassif...,0.361905,319
15,{'randomforestclassifier': RandomForestClassif...,0.352381,322
87,{'randomforestclassifier': RandomForestClassif...,0.352381,322


In [27]:
print(best_model.best_estimator_)
print(best_model.best_params_)
print(best_model.best_score_) # train-set accuracy score

Pipeline(steps=[('randomforestclassifier',
                 RandomForestClassifier(max_depth=25, max_leaf_nodes=5,
                                        min_samples_leaf=5, n_estimators=10))])
{'randomforestclassifier': RandomForestClassifier(max_depth=25, max_leaf_nodes=5, min_samples_leaf=5,
                       n_estimators=10), 'randomforestclassifier__max_depth': 25, 'randomforestclassifier__max_leaf_nodes': 5, 'randomforestclassifier__min_samples_leaf': 5, 'randomforestclassifier__n_estimators': 10}
0.9714285714285715


In [28]:
# test-set accuracy score
best_model.score(X_test,y_test)

0.9777777777777777