# scikit-learn Pipeline: an end-to-end solution

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [4]:
# A Pipeline is described by a list of (name, estimator) pairs; the name is
# the key under which each step can be looked up later.
steps = [
    ("standard_scalar", StandardScaler()),
    ("classifier", LogisticRegression()),
]

In [7]:
# Assemble the pipeline from the step list; the bare name on the last line
# makes the notebook render it.
pipe = Pipeline(steps)
pipe

In [8]:
# Ask scikit-learn to render estimators as an HTML diagram in the notebook.
from sklearn import set_config

set_config(display="diagram")

In [21]:
# Create a synthetic classification dataset with 1000 samples.
# random_state pins the generated data so the split, the fitted model and the
# predictions in the following cells are reproducible after a kernel restart.
from sklearn.datasets import make_classification

x, y = make_classification(n_samples=1000, random_state=42)

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing; the fixed seed keeps the split stable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.33, random_state=42
)

In [24]:
# Fit every step of the pipeline on the training split.
pipe.fit(X=xtrain, y=ytrain)

In [25]:
# Run the test split through the fitted pipeline and show the predicted labels.
pipe.predict(X=xtest)

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,

# Example 2
- Pipeline with standard scaling, dimensionality reduction, and an estimator

In [32]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC 

In [36]:
# Three-stage pipeline: standardise -> keep 3 principal components -> SVC.
steps = [
    ("scaling", StandardScaler()),
    ("PCA", PCA(n_components=3)),
    ("SVC", SVC()),
]
pipe = Pipeline(steps=steps)

In [37]:
# Display the pipeline object so the notebook renders its steps.
pipe

In [40]:
# Shape of the data that reaches the final estimator: run every step except
# the last one (scaling, then PCA). Calling fit_transform on pipe['PCA'] alone
# would fit PCA on the raw, unscaled features and bypass the 'scaling' step.
pipe[:-1].fit_transform(xtrain).shape

(670, 3)

## A more complex example: column transformers

In [43]:
# handle the missing values 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder 
import numpy as np

In [44]:
# Numeric branch: replace NaNs with the column median, then standardise.
numeric_processor = Pipeline(
    steps=[
        ("imputation", SimpleImputer(strategy="median", missing_values=np.nan)),
        ("standard_scaling", StandardScaler()),
    ]
)

In [52]:
# Categorical branch: fill missing entries with the constant 'missing', then
# one-hot encode. No scaling step here: the values are category labels, not
# numbers. Categories not seen during fit are ignored at transform time.
categorical_processor = Pipeline(
    steps=[
        ("imputation", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoding", OneHotEncoder(handle_unknown="ignore")),
    ]
)
categorical_processor

In [53]:
# Route each group of columns through its own preprocessing pipeline.
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_processor, ["gender", "city"]),
        ("numeric_processor", numeric_processor, ["age", "height"]),
    ]
)

In [54]:
# Display the ColumnTransformer so the notebook renders its branches.
preprocessor

In [60]:
# Full model: column-wise preprocessing feeding a logistic regression.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LogisticRegression()),
    ]
)
pipe

In [59]:
# make_pipeline is a shorthand constructor: pass the estimators in order and
# it builds the Pipeline without the caller having to name the steps.
from sklearn.pipeline import make_pipeline

make_pipeline(
    preprocessor,
    LogisticRegression(),
)

# Hyperparameter tuning with GridSearchCV

In [69]:
import numpy as np
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

import seaborn as sns


In [64]:
# Load seaborn's 'tips' example dataset and preview the first five rows.
df = sns.load_dataset("tips")
df.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [65]:
# Count the missing values in each column.
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [67]:
# The regression target is total_bill; every other column is a feature.
y = df["total_bill"]
x = df.drop(columns=["total_bill"])

In [73]:
# 80/20 train/test split with a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=43
)

In [76]:
# pipeline method 1: put the transformers directly inside the ColumnTransformer
categorical_columns = ['sex', 'smoker', 'day', 'time']
numerical_columns = ['tip', 'size']
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' keeps predict() from raising when the data
        # being transformed contains a category that was absent from the
        # training split (such rows are encoded as all zeros for that column).
        ('categorical_varaibles', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        # Median-impute any missing numeric values.
        ('numerical_varaibles', SimpleImputer(strategy='median'), numerical_columns)
    ]
)
# Preprocessing followed by a random-forest regressor.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor())
    ]
)
pipe.fit(xtrain, ytrain)

In [77]:
# Predict the target for the held-out rows and display the predictions.
ypred = pipe.predict(X=xtest)
ypred

array([20.5252    , 18.1955    , 19.0226    , 30.5018    , 18.7554    ,
       30.1448    , 13.1852125 , 14.27854   , 20.463     , 20.8487    ,
       17.3837    , 10.71033286, 16.046     , 12.690745  , 14.64581786,
       18.7554    , 13.53021667, 27.7007    , 21.65088333, 16.83109286,
       18.14597333, 18.69453333, 10.58022143, 20.3152    , 23.14928333,
       20.0085    , 28.4326    , 31.5484    , 24.4707    , 23.93438333,
       12.70093643,  9.5092    , 14.35472667, 14.38629476, 23.7252    ,
       20.6394    , 30.5199    , 12.61203333, 21.95377   , 11.22783333,
       14.64635   , 30.9793    , 18.463     , 22.51414167, 21.5528    ,
       21.72984167,  8.87624667, 16.75812   , 21.26523571])

In [81]:
# pipeline method 2: a dedicated sub-pipeline per column type

# Numeric columns: median-impute NaNs, then standardise.
# (The step is named 'imputation_mean', but the strategy in use is the median.)
numerical_preprocessor = Pipeline(
    steps=[
        ('imputation_mean', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scalar', StandardScaler())
    ]
)

# Categorical columns: fill missing entries with the constant 'missing', then
# one-hot encode. The replacement value belongs in fill_value; missing_values
# is the placeholder the imputer searches for, so passing 'missing' there would
# leave real NaNs unhandled and the fill value at its default.
categorical_processor = Pipeline(
    steps=[
        ('imputation_constant', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encode', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# Apply each sub-pipeline to its own group of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical_columns', categorical_processor, categorical_columns),
        ('numerical_columns', numerical_preprocessor, numerical_columns),
    ]
)
preprocessor

In [82]:
# Chain the column-wise preprocessing with a random-forest regressor.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor()),
    ]
)
pipe

In [84]:
# Fit on the training split, then predict the held-out rows
# (fit returns the pipeline itself, so the two calls can be chained).
pipe.fit(xtrain, ytrain).predict(xtest)

array([20.50465   , 18.499125  , 18.3128    , 30.1771    , 18.742825  ,
       30.8627    , 13.06265   , 14.47968   , 20.4375    , 20.5734    ,
       18.35253333, 10.946235  , 15.54523333, 13.297485  , 15.084365  ,
       18.742825  , 13.28122738, 28.5916    , 21.28358333, 17.159675  ,
       17.2993    , 19.232875  , 10.63717667, 20.3879    , 23.11399167,
       19.52115   , 29.7927    , 32.3279    , 27.3051    , 23.50591667,
       12.63114583,  8.096     , 14.27446667, 13.99777619, 24.008     ,
       21.1623    , 31.8963    , 12.94460119, 22.43183333,  9.5177    ,
       15.55715   , 31.5652    , 18.7075    , 22.82655833, 21.5681    ,
       21.61318333,  8.92840667, 16.55181667, 22.3629    ])

# hyperparameter tuning 

In [91]:
# Search space for the random forest. Each key is '<step name>__<parameter>',
# so the 'model__' prefix targets the pipeline step named 'model'.
param_grid = dict(
    model__n_estimators=[200, 500],
    model__max_features=["sqrt", "log2"],
    model__max_depth=[4, 5, 6, 7, 8],
    model__criterion=["squared_error", "poisson", "friedman_mse", "absolute_error"],
)

# Exhaustive cross-validated search over param_grid, run in a single process.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=1,
)

grid_search.fit(xtrain, ytrain)

In [88]:
# Show the parameter combination that scored best in the grid search.
# NOTE(review): the recorded output for this cell comes from execution In [88],
# which predates the grid cell (In [91]); it has no 'model__criterion' entry,
# so it looks stale -- re-run to confirm.
grid_search.best_params_

{'model__max_depth': 6,
 'model__max_features': 'log2',
 'model__n_estimators': 500}

In [92]:
# Rebuild the pipeline and configure it with the winning grid-search settings.
# The best_params_ keys already carry the 'model__' prefix, so set_params
# routes them to the 'model' step. Reading them from grid_search (instead of
# hand-copying numbers) keeps this cell in sync with whatever the search
# found, including the tuned criterion.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor())
    ]
)
pipe.set_params(**grid_search.best_params_)
pipe.fit(xtrain, ytrain)
pipe.predict(xtest)

array([19.7777607 , 16.9027383 , 20.45073286, 28.21802103, 18.72238466,
       29.73678336, 13.19734034, 13.80708602, 17.64720336, 18.27897953,
       16.71639753, 11.95837158, 17.36509662, 15.18300847, 16.05212886,
       18.72238466, 13.97766088, 25.50451811, 21.2305907 , 19.02177037,
       19.05049153, 20.05921529, 13.22814907, 19.81052984, 23.18484185,
       20.25521174, 26.89986291, 29.88886282, 25.04035602, 21.71975096,
       12.72013244, 12.14831905, 12.56755336, 14.07801014, 25.49461876,
       20.21154969, 31.01201634, 13.37275099, 18.49534258, 12.98998301,
       17.07434714, 30.13105675, 18.00561551, 20.11372366, 20.95376411,
       21.33792697, 11.84983283, 17.21373807, 17.86858808])