✅ It Does the Following:
Loads the Titanic dataset.

Selects relevant features (Pclass, Sex, Age, etc.) and handles missing values.

Splits into training and testing sets.

Applies preprocessing (imputation, scaling, encoding) with ColumnTransformer.

Trains 7 different classification models (LogReg, k-NN, SVM, etc.).

Handles Naive Bayes' dense array requirement correctly.

Evaluates using Recall, F1-score, and ROC-AUC.

Prints a clean performance summary for each model.

In [66]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, roc_auc_score

# Load Titanic dataset
titanic = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")

# Select features and target
features = titanic[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
target = titanic['Survived']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Identify numeric and categorical features
numeric_features = features.select_dtypes(include=['int64', 'float64']).columns
categorical_features = features.select_dtypes(include=['object']).columns

# Pipelines for preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Models to evaluate
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "k-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Naive Bayes": GaussianNB()
}

# Evaluation function
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    if hasattr(model, "predict_proba"):
        roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    else:
        roc_auc = "N/A"
    return recall, f1, roc_auc

# Train, predict and evaluate each model
for name, model in models.items():
    if name == "Naive Bayes":
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
            ('model', model)
        ])
    else:
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
    
    pipeline.fit(X_train, y_train)
    recall, f1, roc_auc = evaluate_model(pipeline, X_test, y_test)
    
    print(f"🔍 {name} Performance:")
    print(f"Recall: {recall:.2f}")
    print(f"F1-score: {f1:.2f}")
    print(f"ROC-AUC: {roc_auc if roc_auc == 'N/A' else round(roc_auc, 2)}\n")


🔍 Logistic Regression Performance:
Recall: 0.74
F1-score: 0.76
ROC-AUC: 0.88

🔍 k-Nearest Neighbors Performance:
Recall: 0.72
F1-score: 0.75
ROC-AUC: 0.86

🔍 Support Vector Machine Performance:
Recall: 0.73
F1-score: 0.77
ROC-AUC: 0.84

🔍 Decision Tree Performance:
Recall: 0.73
F1-score: 0.75
ROC-AUC: 0.79

🔍 Random Forest Performance:
Recall: 0.73
F1-score: 0.76
ROC-AUC: 0.89

🔍 Gradient Boosting Performance:
Recall: 0.70
F1-score: 0.75
ROC-AUC: 0.88



AttributeError: 'numpy.ndarray' object has no attribute 'toarray'