# Example - 1

In [3]:
from sklearn.pipeline import Pipeline
# feature scaling
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [5]:
steps=[("standardscaler",StandardScaler()),
       ("classifier",LogisticRegression())]

In [6]:
steps

[('standardscaler', StandardScaler()), ('classifier', LogisticRegression())]

In [8]:
pipe=Pipeline(steps)

In [9]:
# visualize pipeline 
from sklearn import set_config

In [10]:
set_config(display="diagram")

In [11]:
pipe

In [16]:
## create a dataset
from sklearn.datasets import make_classification
x,y  = make_classification(n_samples=1000)

In [17]:
x.shape

(1000, 20)

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.33)

In [21]:
pipe.fit(x_train,y_train)

In [22]:
y_pred=pipe.predict(x_test)

In [24]:
from sklearn.metrics import accuracy_score

In [27]:
Accuracy=accuracy_score(y_test,y_pred)

In [28]:
Accuracy

0.8393939393939394

# Example - 2

In [53]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [30]:
from sklearn.datasets import make_classification
x,y  = make_classification(n_samples=1000)

In [31]:
x.shape

(1000, 20)

In [43]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.33)

In [47]:
steps=[("scaling",StandardScaler()),
       ("PCA",PCA(n_components=3)),
       ("SVC",SVC())]

In [48]:
pipe2=Pipeline(steps)

In [49]:
pipe2

In [50]:
pipe2['scaling'].fit_transform(x_train) # we can only use scaling also from pipeline

array([[ 0.72290937,  0.97074155, -0.00533886, ..., -0.99447077,
        -1.37679551, -0.36539311],
       [-2.31559462, -0.50667867,  0.17928194, ..., -0.35498375,
        -0.80093679, -0.92124292],
       [-1.59575024,  0.68517273,  0.05343255, ..., -1.37853445,
        -1.027193  ,  2.18457331],
       ...,
       [-1.62146695,  0.4942925 , -1.12491813, ..., -0.04087034,
         1.00916978,  1.23487335],
       [ 0.23481529, -0.58708244,  0.6413339 , ..., -0.16516868,
        -0.51310423, -2.25082531],
       [ 1.40501788,  0.38840072,  2.01799943, ..., -0.14724995,
        -0.15884085,  1.38283919]])

In [51]:
pipe2.fit(x_train,y_train)

In [52]:
pipe2.predict(x_test)

array([1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,

# complex example of column transform

In [110]:
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

In [63]:
numeric_processor=Pipeline(
    steps=[("impoutation_mean",SimpleImputer(missing_values=np.nan,strategy="mean")),
           ("scaler",StandardScaler())])

In [65]:
numeric_processor

In [72]:
categorical_processor=Pipeline(
    steps=[("impoutation_mean",SimpleImputer(fill_value="missing",strategy="constant")),
           ("onehot",OneHotEncoder(handle_unknown="ignore"))])

In [73]:
categorical_processor

In [99]:
preprocessor=ColumnTransformer([
    ("categorical",categorical_processor,['Gender','City']),
     ("numerical",numeric_processor,['Age','Height'])])

In [100]:
preprocessor

In [101]:
final_pipe=make_pipeline(preprocessor,LogisticRegression())

In [102]:
final_pipe

# Hyperparameter tuning using Machine Learning Pipeline

In [118]:
import numpy as np
from sklearn .pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import seaborn as sns

In [119]:
df= sns.load_dataset('tips')

In [121]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [123]:
x=df.iloc[:,1:]
y=df["total_bill"]

In [124]:
from sklearn.model_selection import train_test_split

In [126]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [128]:
#pipelining
numeric_processor=Pipeline(
    steps=[('imputation_mean',SimpleImputer(missing_values=np.nan,strategy="mean")),
           ("scale",StandardScaler())])

In [132]:
numeric_processor

In [129]:
categorical_processor=Pipeline(
    steps=[('imputation_mean',SimpleImputer(fill_value="missing",strategy="constant")),
           ("onehot",OneHotEncoder(handle_unknown="ignore"))])

In [133]:
categorical_processor

In [134]:
preprocessor=ColumnTransformer([
    ("categorical",categorical_processor,['sex','smoker','day','time']),
    ("numerical",numeric_processor,['tip','size'])])

In [135]:
preprocessor

In [136]:
pipe=Pipeline(
    steps=[('preprocessor',preprocessor),("regressor",RandomForestRegressor())])

In [137]:
pipe

In [138]:
from sklearn import set_config

In [139]:
set_config(display='diagram')

In [140]:
pipe

In [141]:
pipe.fit(x_train,y_train)

In [142]:
pipe.predict(x_test)

array([29.7454    , 11.84748143, 10.94951   , 20.19308333, 18.01600167,
       19.5064    , 22.19985833, 31.4125    , 11.218     , 11.43403333,
       28.49062   , 27.15635833, 13.664175  , 12.88554524, 44.6479    ,
       16.5572    , 18.42205833, 10.97688524, 21.82984429, 23.07430833,
       22.819525  , 10.3326    , 15.37976333, 19.831     , 25.8445    ,
       14.42343833, 10.67094762, 19.3618    , 18.77423452, 20.90661667,
       17.47633619, 35.8489    , 28.95665   , 18.7786    , 18.7745    ,
       18.66163333, 21.93985   , 27.74622333, 21.28098333, 21.2014    ,
       26.79895   , 20.1972575 , 12.01078   , 18.77423452, 16.56186389,
       12.09898071, 17.0056    , 14.26872857, 16.724     ])

In [143]:
import warnings
warnings.filterwarnings('ignore')

In [152]:
## hyperparamter tuning
param={
    "regressor__n_estimators":[200,500],
    "regressor__max_features":["auto",'sqrt','log2'],
    "regressor__max_depth":[4,5,6,7,8]
}

In [153]:
grid_search=GridSearchCV(pipe,param_grid=param,n_jobs=1)

In [155]:
grid_search.fit(x_train,y_train)

In [157]:
grid_search.best_params_

{'regressor__max_depth': 5,
 'regressor__max_features': 'auto',
 'regressor__n_estimators': 200}

In [160]:
pipe=Pipeline(
    steps=[('preprocessor',preprocessor),("regressor",RandomForestRegressor(max_depth=5,max_features="auto",n_estimators=200))])

In [161]:
pipe.fit(x_train,y_train)

In [162]:
pipe.predict(x_test)

array([27.6166511 , 13.22844022, 12.44326927, 16.42345729, 17.136694  ,
       19.86981179, 20.53779467, 30.69741932, 13.54380086, 12.05014511,
       26.61343456, 22.94005727, 12.40199062, 13.47975207, 42.79547127,
       15.83156688, 18.80422553, 11.93509163, 21.86582544, 20.83487082,
       23.04373945, 11.03506086, 14.9050187 , 20.66510416, 27.47381247,
       15.8909649 , 11.501178  , 20.03316665, 19.33104921, 21.46159031,
       18.98027737, 33.79346527, 28.31008682, 19.78286179, 17.90381685,
       19.5692025 , 24.07256359, 27.00695939, 21.67699495, 20.08754069,
       23.58928333, 19.94742193, 12.91200947, 19.33104921, 16.13753021,
       12.70852273, 18.71175842, 12.80180722, 18.20284995])