# Best Model Selection

In [6]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score


#load the data
df=sns.load_dataset('titanic')

#select feature and target varaibles
X=df[['pclass','sex','fare','embarked']]
y=df['survived']

#splitb the data into train and test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

#create the list list of models to evaluate 
models=[
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XG Boosting', XGBClassifier(random_state=42))
]

best_model=None
best_accuracy=0.0

#iterate over the model and evaluate their performance
for name, model in models:
    #create thr pipeline for each model
    pipeline=Pipeline([
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model',model)
    ]) 

    #perform cross validation
    scores=cross_val_score(pipeline,X_train,y_train,cv=5)

    #calculate the mean accuracy
    mean_accuracy=scores.mean()

    #fit the pipeline on the training data
    pipeline.fit(X_train,y_train)

    #make predictions on the test data
    y_pred=pipeline.predict(X_test)

    #calculate the accuracy score
    accuracy=accuracy_score(y_test,y_pred)

    #print the performance metrics
    print('model :',name)
    print('cross-validation accuracy :', mean_accuracy)
    print('test accuraccy :',  accuracy)

    #check if the current model has the best accuracy

    if accuracy > best_accuracy:
        best_accuracy=accuracy
        best_model=pipeline

    #retrieve the best model
    print('best model', best_model)




model : Random Forest
cross-validation accuracy : 0.8132571653698415
test accuraccy : 0.8379888268156425
best model Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
model : Gradient Boosting
cross-validation accuracy : 0.8090022653402935
test accuraccy : 0.8156424581005587
best model Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
model : XG Boosting
cross-validation accuracy : 0.8019797104304146
test accuraccy : 0.7988826815642458
best model Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])


we intialize the best_model and best_accuracy variables to track the best performing model

During  the iteration over the models, after calculating the accuracy score for each model, we compare it with the current best_accuracy value.if the current model has a higher accuracy  and assign the pipeline object to best_model.


After the loop, we print the best model using print('best model', best_model).

By comparing the accuracy scores of different models within the  pipeline and  selecting the one with the highest accuracy, yous can retreive the best-performance model for thr given dataset

# Add more models in the same code

In [7]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score


#load the data
df=sns.load_dataset('titanic')

#select feature and target varaibles
X=df[['pclass','sex','fare','embarked']]
y=df['survived']

#splitb the data into train and test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

#create the list list of models to evaluate 
models=[
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('support vector machine',SVC(random_state=42)),
    ('logistic regression',LogisticRegression(random_state=42))
]

best_model=None
best_accuracy=0.0

#iterate over the model and evaluate their performance
for name, model in models:
    #create thr pipeline for each model
    pipeline=Pipeline([
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model',model)
    ]) 

    #perform cross validation
    scores=cross_val_score(pipeline,X_train,y_train,cv=5)

    #calculate the mean accuracy
    mean_accuracy=scores.mean()

    #fit the pipeline on the training data
    pipeline.fit(X_train,y_train)

    #make predictions on the test data
    y_pred=pipeline.predict(X_test)

    #calculate the accuracy score
    accuracy=accuracy_score(y_test,y_pred)

    #print the performance metrics
    print('model :',name)
    print('cross-validation accuracy :', mean_accuracy)
    print('test accuraccy :',  accuracy)

    #check if the current model has the best accuracy

    if accuracy > best_accuracy:
        best_accuracy=accuracy
        best_model=pipeline

    #retrieve the best model
    print('best model', best_model)




model : Random Forest
cross-validation accuracy : 0.8132571653698415
test accuraccy : 0.8379888268156425
best model Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
model : Gradient Boosting
cross-validation accuracy : 0.8090022653402935
test accuraccy : 0.8156424581005587
best model Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
model : support vector machine
cross-validation accuracy : 0.8188220230473752
test accuraccy : 0.7988826815642458
best model Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
model : logisti