# Selecting best model selecting, fitting, prediction and hyperparameter tunning

In [26]:
# hyper parameter tunning in Pipeline
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [27]:
# load data
titanic = sns.load_dataset('titanic')
# Select feature and target variable
X =  titanic[['pclass','sex','age','fare','embarked']]
y = titanic['survived']
# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
# create models 
models = [
    ("Ranndom Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("XG_Boost", XGBClassifier(random_state = 42)),
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("Support Vector Machine", SVC(random_state=42))
]
best_model = None
best_accuracy = 0.0
# iterate to select the best model
for name , model in models:
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])
    # perform Cross validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    # calculate mean accuracy
    mean_accuracy = scores.mean()
    # fit the pipeline
    pipeline.fit(X_train, y_train)
    # Make the prediction on the data
    y_pred = pipeline.predict(X_test)
    # calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    # print the peroformance 
    print("Model:", name)
    print("Cross_validation Accuracy:", mean_accuracy)
    print("Test Accuracy", accuracy)
    print()
    # check 
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline
    print("Best Model:", best_model)

Model: Ranndom Forest
Cross_validation Accuracy: 0.7991529597163399
Test Accuracy 0.8379888268156425

Best Model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
Model: Gradient Boosting
Cross_validation Accuracy: 0.8090121146459175
Test Accuracy 0.7988826815642458

Best Model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
Model: XG_Boost
Cross_validation Accuracy: 0.8034177090515119
Test Accuracy 0.7932960893854749

Best Model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
Model: Logistic Regression
Cross_vali