In [29]:
#import libraries
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

import warnings

warnings.filterwarnings('ignore')

An example of pipeline to process, trainn and predictions.

In [4]:
#load the data
df=sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


Step 1 & 2

In [9]:
#selct the features and target
X=df[['pclass','sex','age','fare','embarked','class']]
y=df['survived']

#split the data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

Getting data ready for handling missing values

In [10]:
#defining colums types for imputation
nuemeric_features=['age','fare']
categorical_features=['pclass','sex','embarked','class']

Step 3 Pipeline steps for data preprocessing

In [11]:
numeric_tranformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median'))
])

categorical_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

In [12]:
preprocessor=ColumnTransformer(
    transformers=[
        ('num',numeric_tranformer,nuemeric_features),
        ('cat',categorical_transformer,categorical_features)
    ]
)
#create a pipeline
pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',RandomForestClassifier())
])

Fit and Predictions

In [13]:
#fit the pipeline
pipeline.fit(X_train,y_train)

#make predictions
y_pred=pipeline.predict(X_test)

#evaluate the model
accuracy=accuracy_score(y_test,y_pred)

#print the accuracy
print('Accuracy:',accuracy)

Accuracy: 0.7877094972067039


**Explaination:**

In this example, we start by loading the Titanic dataset from Seaborn using sns.load_dataset('titanic'). We then select the relevant features and target variable (survived) to train our model. Next, we split the data into training and test sets using train_test_split from scikit-learn.

The pipeline is created using the Pipeline class from scikit-learn. It consists of three steps:

Data preprocessing step: The SimpleImputer is used to handle missing values by replacing them with the most frequent value in each column.

Feature encoding step: The OneHotEncoder is used to encode categorical variables (`sex and embarked`) as binary features.

Model training step: The RandomForestClassifier is used as the machine learning model for classification.

We then fit the pipeline on the training data using pipeline.fit(X_train, y_train). Afterward, we make predictions on the test data using pipeline.predict(`X_test`).

Finally, we calculate the accuracy score by comparing the predicted values (`y_pred`) with the actual values (`y_test`).

Note that you may need to install Seaborn (`pip install seaborn`) if it's not already installed in your environment.

----------------------------
# Hyperparamter tunning in pipeline

Hyperparameter tuning in a pipeline involves optimizing the hyperparameters of the different steps in the pipeline to find the best combination that maximizes the model's performance. Here's an example of hyperparameter tuning in a pipeline and selecting the best model on the Titanic dataset:

In [15]:
#load the data
titanic_data=sns.load_dataset('titanic')
titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [23]:
#se;ect features and targets
X=titanic_data[['pclass','sex','age','fare','embarked']]
y=titanic_data['survived']


#split the data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)



#create a pipeline
pipeline=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore')),
    ('classifier',RandomForestClassifier(random_state=42))
])

#defining hyperparameters
param_grid={
    
    'classifier__n_estimators':[100,200],
    'classifier__max_depth':[5,10],
    'classifier__min_samples_split':[2,5]
}

#fit the pipeline
grid_search=GridSearchCV(pipeline,param_grid=param_grid,cv=3)
grid_search.fit(X_train,y_train)

#get the best model
best_model=grid_search.best_estimator_

#make predictions
y_pred=best_model.predict(X_test)

#evaluate the model
accuracy=accuracy_score(y_test,y_pred)
print('Accuracy:',accuracy)

#print the best parameters
print('Best parameters:',grid_search.best_params_)


Accuracy: 0.7877094972067039
Best parameters: {'classifier__max_depth': 5, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}


# ** Add more models in the same code & Hyperparameters**

In [26]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [36]:
#select the features and target
X=df[['pclass','sex','age','fare','embarked']]
y=df['survived']

#split the data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

#models
models=[('Random Forest',RandomForestClassifier(random_state=42)),
        ('Gradient Boosting',GradientBoostingClassifier(random_state=42)),
        ('Support Vector Classifier',SVC(random_state=42)),
        ('Logistic Regression',LogisticRegression(random_state=42))
        ]
best_model=None
best_accuracy=0



In [45]:
#loop through the models
for name, model in models:
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])
    #defining hyperparameters
    param_grid={
    
    'classifier__n_estimators':[100,200],
    #'classifier__max_depth':[5,10],
    #'classifier__min_samples_split':[2,5]
    }

#fit the pipeline
    grid_search=GridSearchCV(pipeline,param_grid=param_grid,cv=3)
    grid_search.fit(X_train,y_train)

    #get the best model
    best_para=grid_search.best_estimator_

    #perform cross validation
    cv_scores=cross_val_score(pipeline,X_train,y_train,cv=5)

    #mean accuracy
    mean_accuracy=cv_scores.mean()

    #fit the pipeline
    pipeline.fit(X_train,y_train)
    y_pred=pipeline.predict(X_test)

    #accuracy score
    accuracy=accuracy_score(y_test,y_pred)

    #print the results
    print('Model:',name)
    print('CV Mean Accuracy:',mean_accuracy)
    print('Accuracy:',accuracy)
    print()

    if accuracy>best_accuracy:
        best_model=pipeline
        best_accuracy=accuracy

print(f'The best modelsis :',best_model)
print(f'Best parameters:',best_para)

ValueError: Invalid parameter 'classifier' for estimator Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [42]:
#loop through the models
for name, model in models:
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])

    #perform cross validation
    cv_scores=cross_val_score(pipeline,X_train,y_train,cv=5)

    #mean accuracy
    mean_accuracy=cv_scores.mean()

    #fit the pipeline
    pipeline.fit(X_train,y_train)
    y_pred=pipeline.predict(X_test)

    #accuracy score
    accuracy=accuracy_score(y_test,y_pred)

    #print the results
    print('Model:',name)
    print('CV Mean Accuracy:',mean_accuracy)
    print('Accuracy:',accuracy)
    print()

    if accuracy>best_accuracy:
        best_model=pipeline
        best_accuracy=accuracy

print(f'The best modelsis :',best_model)

Model: Random Forest
CV Mean Accuracy: 0.7991529597163399
Accuracy: 0.8379888268156425

Model: Gradient Boosting
CV Mean Accuracy: 0.8076135132473162
Accuracy: 0.7988826815642458

Model: Support Vector Classifier
CV Mean Accuracy: 0.8160248202501723
Accuracy: 0.8044692737430168

Model: Logistic Regression
CV Mean Accuracy: 0.7977839062346105
Accuracy: 0.8100558659217877

The best modelsis : RandomForestClassifier(random_state=42)
