In [7]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# load dataset
df = sns.load_dataset('titanic')
df.drop('deck', axis=1, inplace=True)

# impute missing values in age using knn imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
df['age'] = imputer.fit_transform(df[['age']])

# impute missing values in embarked and embark town using simple imputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
df[['embarked', 'embark_town']] = imputer.fit_transform(df[['embarked', 'embark_town']])


# label encoding for categorical columns (since it's a simple encoding and easy to implement)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

for col in df.columns:
    df[col] = label_encoder.fit_transform(df[col])

# split data into X and y
X = df.drop(['survived'], axis=1)
y = df['survived']

# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data for Logistic Regression and SVM
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a list of models to evaluate
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Support Vector Machine', SVC(random_state=42)),
    ('Logistic Regression', LogisticRegression(max_iter=500, random_state=42))  # Increased max_iter
]

best_model = None
best_accuracy = 0.0

# Iterate over the models and evaluate their performance
for name, model in models:
    # For SVM and Logistic Regression, use scaled data
    if name in ['Support Vector Machine', 'Logistic Regression']:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Perform cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5)

    # Calculate mean accuracy
    mean_accuracy = scores.mean()
    
    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    
    # Print the performance metrics
    print("Model:", name)
    print("Cross-validation Accuracy:", mean_accuracy)
    print("Test Accuracy:", accuracy)
    print()
    
    # Check if the current model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Retrieve the best model
print("Best Model:", best_model)


Model: Random Forest
Cross-validation Accuracy: 1.0
Test Accuracy: 1.0

Model: Gradient Boosting
Cross-validation Accuracy: 1.0
Test Accuracy: 1.0

Model: Support Vector Machine
Cross-validation Accuracy: 0.6698808234019501
Test Accuracy: 0.994413407821229

Model: Logistic Regression
Cross-validation Accuracy: 1.0
Test Accuracy: 1.0

Best Model: RandomForestClassifier(random_state=42)
