# Pipelines Helper Example
With this class, elements of a scikit pipeline can be hot-swapped for grid search, along with their parameters.
1. It can hold regular Pipeline objects
2. No need to specify default parameters
3. Can be used for classifiers or other elements of a pipelines

In [1]:
# import libraries
from sklearn import datasets
from sklearn.ensemble import (
RandomForestClassifier,
AdaBoostClassifier,
GradientBoostingClassifier
)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

from pipelinehelper import PipelineHelper

In [2]:
data = datasets.load_iris()

In [3]:
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [4]:
data.data


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [5]:
data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [16]:
pipe = Pipeline([
    ('scaler', PipelineHelper([
        ('std', StandardScaler()),
        ('max', MaxAbsScaler()),
    ], optional=True)),
    ('classifier', PipelineHelper([
        ('svm', SVC()),
        ('rf', RandomForestClassifier()),
        ('ada', AdaBoostClassifier()),
        ('gb', GradientBoostingClassifier()),
        ('knn', KNeighborsClassifier()),
        ('xgb', XGBClassifier()),
        ('nb_pipe', Pipeline([
            ('scaler', MinMaxScaler()),
            ('nb', MultinomialNB()),
        ])),
    ])),
])

In [19]:
params = {
    'scaler__selected_model': pipe.named_steps['scaler'].generate(
        {
            'std__with_mean': [True, False],
            'std__with_std': [True, False],
            # no parameters for 'max' lead to use standard params
        }
    ),
        'classifier__selected_model': pipe.named_steps['classifier'].generate(
        {
            'svm__C': [0.1, 1.0],
            'svm__kernel': ['linear', 'rbf'],
            'rf__n_estimators': [10, 20, 50, 100, 150],
            'rf__max_features': ['auto', 'sqrt', 'log2'],
            'rf__min_samples_split': [2, 5, 10],
            'rf__bootstrap': [True, False],
            'ada__n_estimators': [10, 20, 40, 100],
            'ada__algorithm': ['SAMME', 'SAMME.R'],
            'gb__n_estimators': [10, 20, 50, 100],
            'gb__criterion': ['friedman_mse', 'mse', 'mae'],
            'gb__max_features': ['auto', 'sqrt', None],
            'knn__n_neighbors': [2, 3, 5, 7, 10],
            'knn__leaf_size': [1, 2, 3, 5],
            'knn__weights': ['uniform', 'distance'],
            'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'nb_pipe__nb__fit_prior': [True, False],
            'nb_pipe__nb__alpha': [0.1, 0.2],
                        'xgb__max_depth': [3, 6, 9, 12, 15],
            'xgb__min_child_weight': [1, 2, 4, 8],
            'xgb__subsample': [0.1, 0.2, 0.5, 0.8, 1],
            'xgb__tree_method':['gpu_hist', 'hist'],
        }
    ),

}

In [20]:
grid = GridSearchCV(pipe, param_grid=params, scoring='accuracy', verbose=1, n_jobs=-1)
grid.fit(data.data, data.target)

Fitting 5 folds for each of 3012 candidates, totalling 15060 fits


GridSearchCV(estimator=Pipeline(steps=[('scaler',
                                        PipelineHelper(available_models={'max': MaxAbsScaler(),
                                                                         'std': StandardScaler()},
                                                       optional=True)),
                                       ('classifier',
                                        PipelineHelper(available_models={'ada': AdaBoostClassifier(),
                                                                         'gb': GradientBoostingClassifier(),
                                                                         'knn': KNeighborsClassifier(),
                                                                         'nb_pipe': Pipeline(steps=[('scaler',
                                                                                                     MinMaxScaler()),
                                                                                     

In [21]:

print(grid.best_params_)
print(grid.best_score_)


{'classifier__selected_model': ('knn', {'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 10, 'weights': 'distance'}), 'scaler__selected_model': ('std', {'with_mean': True, 'with_std': False})}
0.9866666666666667


In [15]:
'regression__selected_model': pipe.named_steps['regressor'].generate()grid.best_estimator_

Pipeline(steps=[('scaler',
                 PipelineHelper(available_models={'max': MaxAbsScaler(),
                                                  'std': StandardScaler(with_std=False)},
                                optional=True,
                                selected_model=StandardScaler(with_std=False))),
                ('classifier',
                 PipelineHelper(available_models={'ada': AdaBoostClassifier(),
                                                  'gb': GradientBoostingClassifier(),
                                                  'knn': KNeighborsClassifier(leaf_size=1,
                                                                              n_neighbors=10,
                                                                              weights=...
                                                                       max_bin=None,
                                                                       max_cat_to_onehot=None,
                               