# selecting best model in pipeline

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


In [2]:
titanic_data=sns.load_dataset('titanic')

In [3]:
X=titanic_data[['pclass','sex','age','fare','embarked']]
y=titanic_data['survived']

In [4]:
# Split the data into train and test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [5]:
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [6]:
# Create a list of models to evaluate
models=[
    ('XGboostclassifier',XGBClassifier(random_state=42)),
    ('GradientBoostingClassifier',GradientBoostingClassifier(random_state=42)),
    ('RandomForestClassifier',RandomForestClassifier(random_state=42))
]

In [7]:
best_model=None
best_score=0

In [8]:
# Iterate over the models and evaluate their performance
for name,model in models:
    #create a pipeline for each model
    pipeline=Pipeline([
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoder',OneHotEncoder(handle_unknown='ignore')),
        ('model',model)
    ])

In [9]:
# perform a cross-validation score
score=cross_val_score(pipeline,X_train,y_train,cv=5)

In [10]:
# calculate the mean accuracy
mean_accuracy=score.mean()*100

In [11]:
# Fit the pipeline on the training data
pipeline.fit(X_train,y_train)

In [12]:
#make predictions
y_pred=pipeline.predict(X_test)

In [13]:
accuracy=accuracy_score(y_test,y_pred)*100

In [14]:
# Print the performance metrics
print('model:',name)
print('accuracy:',accuracy)
print('cross validation score:',mean_accuracy)


model: RandomForestClassifier
accuracy: 83.79888268156425
cross validation score: 79.915295971634


In [23]:
# chexk if the current model has best accuracy
if mean_accuracy > best_score:
        best_accuracy=mean_accuracy    
        best_model=pipeline


In [24]:
print('best_model:',best_model)

best_model: RandomForestClassifier


# Add more models in the same code

In [25]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [27]:
# import data from titanic
df=sns.load_dataset('titanic')

In [None]:
# make numeric variable and categorical variables
num_variables=['age','fare']
cat_variables=['pclass','sex','embarked']

In [29]:
# split the data into X and y
X=df[['sex','age','fare','pclass','embarked']]
y=df['survived']

In [34]:
# create a list of models
model=[
    ('RandomForestClassifier',RandomForestClassifier(random_state=42)),
    ('GradientBoostingClassifier',GradientBoostingClassifier(random_state=42)),
    ('SVC',SVC(random_state=42)),
    ('LogisticRegression',LogisticRegression(random_state=42)),
    ('XGboostclassifier',XGBClassifier(random_state=42))
    
]

In [39]:
# Iterate over the models and evaluate their performance
for name, model in models:
# create a pipeline
    pipeline=Pipeline([
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoder',OneHotEncoder(handle_unknown='ignore')),
        ('model',model)
])

In [40]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [41]:
best_model = None
best_accuracy = 0.0

In [42]:
# perform the cross validation score
scores=cross_val_score(pipeline,X_train,y_train,cv=5)


In [None]:
# calcuale the mean accuracy
mean_accuracy=scores.mean()*100


In [43]:
# Fit the pipeline on the training data
pipeline.fit(X_train,y_train)
# make predictions
y_pred=pipeline.predict(X_test)
# calculate the accuracy
accuracy=accuracy_score(y_test,y_pred)*100

In [46]:
# check if the current model has the best accuracy
if mean_accuracy > best_accuracy:
    best_accuracy = mean_accuracy
    best_model = pipeline

In [47]:
# print the performance metrics
print('model:',name)
print('accuracy:',accuracy)
print('cross validation score:',mean_accuracy)

model: RandomForestClassifier
accuracy: 81.00558659217877
cross validation score: 79.915295971634


In [48]:
 # Check if the current model has the best accuracy
if accuracy > best_accuracy:
    best_accuracy = accuracy
    best_model = pipeline

# Retrieve the best model
print("Best Model:", best_model)

Best Model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
