# Pipline
`Pipeline` is a list of steps that are executed in order.this is sequence of data preprocessing steps that are chained together to automatically transform data before passing it to a machine learning model.This is allow to combine multiple data preprocessing steps into a single step that can be executed in a single line of code.
## Key components of Pipeline
1) Data preprocessing steps.
2) Model training
3) Model evaluation
4) Model pridiction
### Advantages of Pipeline
1) simpified workflow
2) Avoiding data leakage 
3) streamlined model deployment
4) Hyperparameter tuning

In [71]:
# import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [72]:
# load titanic dataset
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [73]:
# select features and target variable
x = df [['pclass', 'sex', 'age','fare', 'embarked']]
y = df['survived']

In [74]:
# split train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

In [75]:
# Define the column transformer for imputing missing values
numeric_features = ['age', 'fare']
categorical_features = ['pclass', 'sex', 'embarked']
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features), ('cat', categorical_transformer, categorical_features)])


In [76]:
# create pipeline with the preprocessor and  random forest classifier
# pipeline = Pipeline(steps=[('preprocessor', preprocessing), ('classifier', RandomForestClassifier(random_state=42))])
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', RandomForestClassifier(random_state=42))])

In [77]:
# fit the pipeline
pipeline.fit(x_train, y_train)
# make predictions
y_pred = pipeline.predict(x_test)
# calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8324022346368715


# Hyperparameter Tuning in pipeline

In [98]:
# import Dataset 
df2 = sns.load_dataset('titanic')
df2.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [99]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [100]:
# define x and y 
X = df2[['pclass', 'sex', 'age','fare', 'embarked']]
Y = df2['survived']

In [101]:
# split train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=4)

In [105]:
# create a pipeline 
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [106]:
# define the hypermeters to tune
hypermeters = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5, 10]
}


In [107]:
# perform grid search
grid_search = GridSearchCV(pipeline, hypermeters, cv=5)
grid_search.fit(X_train, Y_train)

In [108]:
# get the best model 
best_model = grid_search.best_estimator_

In [109]:
# make predictions
y_pred = best_model.predict(X_test)
# calculate accuracy
accuracy = accuracy_score(Y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8379888268156425


In [110]:
# print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

Best hyperparameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
