In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer


#load the  data
df=sns.load_dataset('titanic')

#select the features and the target data
X=df[['pclass','age','sex','fare','embarked']]
y=df['survived']

#split the data into y=tain and test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

#Define  the columntransformer for imputing missing values
numeric_features=['age','fare']
categorical_features=['pclass','embarked','sex']


numeric_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median'))
])

categorical_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder
     (handle_unknown='ignore'))
])

preprocessor=ColumnTransformer(
    transformers=[
        ('num',numeric_transformer,numeric_features),
        ('cat',categorical_transformer,categorical_features)
    ])

#create the pipeline with the preprocessor and Random forest classifier
Pipeline = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',RandomForestClassifier(random_state=42))
])

#fit the pipeline to the training data

Pipeline.fit(X_train,y_train)


#make predictions on the test data

y_pred=Pipeline.predict(X_test)


#evaluate the model
accuracy=accuracy_score(y_test,y_pred)
print('accuracy_score',accuracy)




accuracy_score 0.7821229050279329


## Code Explanation:

IN this example;we start by loading the titanic dataset from seaborn using sns.load_datset('titanic').WE then select the relevant features and target variables (survived) to train our model.Next,we split the data into training and test sets using train_test_split from scikit-learn.

The Pipeline is created using the pipleline class from sklearn.it consist of three steps

`Data preprocessing step:`The simpleimputer is used to handle missing values by replacing them with the most frequent value in each column

`Feature engineering step:`The onehotencoder is used to convert categorical variables (sex,embarked)as binary features.

`Model training step:`The RandomForestClassifier is used as Machine learning model for classification.

We then fit the pipeline on the training data using pipleline.fit(x_train,y_train).Afterward, We make predictions on the test data using pipeline.predict(`X_test`).

Finally, we calculate the accuracy score by comparing the predicted values(`y_pred,y_test`) with the actual values(`y_test`).




# Hyperparameter Tunning in Pipeline

In [3]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

#load the titanic dataset from seaborn

df=sns.load_dataset('titanic')

#select the feature and target  variable

X=df[['pclass','age','sex','fare','embarked']]
y=df['survived']

#split the data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

#create a pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore')),
    ('model',RandomForestClassifier(random_state=42))

])

#define the hyperparameters
hyperparameters={
    'model__n_estimators':[100,200,300],
    'model__max_depth':[None,5,10],
    'model__min_samples_split':[2,5,10]
}

#perform grid search cross validatiopn
grid_search= GridSearchCV(pipeline,hyperparameters,cv=5)
grid_search.fit(X_train,y_train)

#get the best model
best_model=grid_search.best_estimator_

#make pprediction on the test data  using the best model
y_pred=best_model.predict(X_test)

#calculate the accuracy score

accuracy=accuracy_score(y_test,y_pred)
print('accuracy_score',accuracy)

#print the best hyperparametrs
print('best hyperparameters',grid_search.best_params_)


accuracy_score 0.8324022346368715
best hyperparameters {'model__max_depth': None, 'model__min_samples_split': 5, 'model__n_estimators': 300}
