# Pipeline Examples

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### loading dataset:

In [2]:
iris_df = load_iris(as_frame=True)

In [3]:
# print(iris_df.DESCR)

In [4]:
iris_df.data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


### train test split:

In [5]:
X_train, X_test, y_train, y_test = train_test_split(iris_df.data, iris_df.target, test_size=0.3, random_state=0)

### forming the pipeline:

sequentially is applied the list of transforms ('scalar', 'pca') and the final estimator ('lr_classifier')

In [6]:
pipeline_lr = Pipeline([
    ('scalar', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('lr_classifier', LogisticRegression(random_state=0))], verbose=True)

### fitting the model with train dataset:

In [7]:
pipeline_lr.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 3) Processing scalar, total=   0.0s
[Pipeline] ............... (step 2 of 3) Processing pca, total=   0.1s
[Pipeline] ..... (step 3 of 3) Processing lr_classifier, total=   0.0s


Pipeline(steps=[('scalar', StandardScaler()), ('pca', PCA(n_components=2)),
                ('lr_classifier', LogisticRegression(random_state=0))],
         verbose=True)

### predicting the labels on test dataset:

In [8]:
y_predicted = pipeline_lr.predict(X_test)
y_predicted

array([2, 1, 0, 2, 0, 2, 0, 2, 2, 1, 2, 1, 1, 2, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0])

### let's evaluate the accuracy of our predictions:

In [9]:
from sklearn.metrics import accuracy_score

In [10]:
accuracy_score(y_true=y_test, y_pred=y_predicted)

0.8666666666666667

### the same can be done with .score method:

In [11]:
pipeline_lr.score(X_test, y_test)

0.8666666666666667

### if probabilities are required:

In [12]:
pipeline_lr.predict_proba(X_test)

array([[5.71391047e-04, 2.25691282e-01, 7.73737327e-01],
       [9.07005837e-03, 7.99234760e-01, 1.91695182e-01],
       [9.95432635e-01, 4.56703504e-03, 3.30225956e-07],
       [1.70793933e-05, 2.98519860e-02, 9.70130935e-01],
       [9.75084659e-01, 2.49123389e-02, 3.00255799e-06],
       [1.95994249e-04, 8.83237178e-02, 9.11480288e-01],
       [9.80767896e-01, 1.92300790e-02, 2.02526315e-06],
       [6.67505151e-03, 4.88820889e-01, 5.04504060e-01],
       [2.06817723e-03, 3.62669049e-01, 6.35262774e-01],
       [2.55670606e-02, 8.08133529e-01, 1.66299410e-01],
       [1.58955598e-03, 3.93389407e-01, 6.05021037e-01],
       [2.11273201e-02, 6.80066876e-01, 2.98805804e-01],
       [1.59451657e-02, 7.45674877e-01, 2.38379958e-01],
       [3.91022037e-03, 4.79549225e-01, 5.16540555e-01],
       [1.37490141e-02, 6.97185910e-01, 2.89065076e-01],
       [9.90255511e-01, 9.74395514e-03, 5.33528196e-07],
       [1.59065474e-02, 7.24572325e-01, 2.59521127e-01],
       [2.85595743e-02, 8.67483

### let's create pipelines with different classifiers and establish which one is better:

In [13]:
# linear regression pipeline
pipeline_lr = Pipeline([('scalar1',StandardScaler()),
                        ('pca1',PCA(n_components=2)),
                        ('lr_classifier', LogisticRegression(random_state=0))])

# decision tree pipeline
pipeline_dt = Pipeline([('scalar2',StandardScaler()),
                        ('pca2',PCA(n_components=2)),
                        ('dt_classifier', DecisionTreeClassifier())])

# random forest pipeline
pipeline_rf = Pipeline([('scalar3',StandardScaler()),
                        ('pca3',PCA(n_components=2)),
                        ('rf_classifier', RandomForestClassifier())])

# let's make the list of pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf]

# dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

# fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# establishing accuracy of each pipeline/model:
for i, model in enumerate(pipelines):
    print(f'{pipe_dict[i]} Test Accuracy: {round(model.score(X_test, y_test), 2)}')

# establishing pipeline/model with best accuracy:
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test) > best_accuracy:
        best_accuracy = model.score(X_test, y_test)
        best_pipeline = model
        best_classifier = i
print(f'\nClassifier with best accuracy: {pipe_dict[best_classifier]}')

Logistic Regression Test Accuracy: 0.87
Decision Tree Test Accuracy: 0.91
RandomForest Test Accuracy: 0.91

Classifier with best accuracy: Decision Tree


# Pipelines Perform Hyperparameter Tuning Using Grid SearchCV

In [16]:
import numpy as np
from sklearn.model_selection import GridSearchCV

In [17]:
# Create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])

# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2', 'l1'],
                 "classifier__C": np.logspace(0, 4, 10)
                 },
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2'],
                 "classifier__C": np.logspace(0, 4, 10),
                 "classifier__solver":['newton-cg', 'saga', 'sag', 'liblinear'] ##This solvers don't allow L1 penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_depth":[5, 8, 15, 25, 30, None],
                 "classifier__min_samples_leaf":[1, 2, 5, 10, 15, 100],
                 "classifier__max_leaf_nodes": [2, 5, 10]}]

# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train, y_train)

 0.96190476        nan 0.97142857        nan 0.96190476        nan
 0.96190476        nan 0.95238095        nan 0.95238095        nan
 0.95238095        nan 0.95238095 0.98095238 0.98095238 0.94285714
 0.94285714 0.98095238 0.98095238 0.97142857 0.94285714 0.98095238
 0.98095238 0.97142857 0.96190476 0.98095238 0.98095238 0.98095238
 0.97142857 0.98095238 0.98095238 0.98095238 0.96190476 0.98095238
 0.98095238 0.97142857 0.96190476 0.98095238 0.98095238 0.97142857
 0.96190476 0.98095238 0.98095238 0.97142857 0.96190476 0.98095238
 0.98095238 0.97142857 0.96190476 0.98095238 0.98095238 0.97142857
 0.8952381  0.91428571 0.93333333 0.95238095 0.91428571 0.92380952
 0.88571429 0.9047619  0.93333333 0.8952381  0.9047619  0.91428571
 0.8952381  0.91428571 0.93333333 0.37142857 0.37142857 0.37142857
 0.94285714 0.94285714 0.94285714 0.93333333 0.95238095 0.94285714
 0.94285714 0.94285714 0.94285714 0.93333333 0.94285714 0.94285714
 0.95238095 0.95238095 0.95238095 0.36190476 0.37142857 0.3714

In [18]:
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

Pipeline(steps=[('classifier', LogisticRegression(solver='saga'))])
The mean accuracy of the model is: 0.9555555555555556


In [25]:
best_model.best_params_

{'classifier': LogisticRegression(solver='saga'),
 'classifier__C': 1.0,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}