In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from scipy.stats import loguniform
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [None]:
import kagglehub

# Download the Titanic dataset; `path` is the location returned by kagglehub
# and is listed as a directory in the next cell.
KAGGLE_DATASET = "yasserh/titanic-dataset"
path = kagglehub.dataset_download(KAGGLE_DATASET)
print(f"Dataset downloaded to: {path}")

In [None]:
import os
import shutil

# Show what was downloaded so the reader can see the available files.
files = os.listdir(path)
print(f"Files in the dataset directory: {files}")

# os.listdir() returns entries in arbitrary order: sort so the chosen CSV is
# deterministic, and fail with a clear message (instead of a bare IndexError)
# when the directory contains no CSV at all.
csv_candidates = sorted(file for file in files if file.endswith('.csv'))
if not csv_candidates:
    raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")
csv_file = csv_candidates[0]

DATA_DIR = '../data'

# Create the project data directory if it does not exist yet.
os.makedirs(DATA_DIR, exist_ok=True)

source_csv_path = os.path.join(path, csv_file)
print(f"Source CSV file path: {source_csv_path}\n")

target_csv_path = os.path.join(DATA_DIR, csv_file)
print(f"CSV file path: {target_csv_path}")

# Copy the CSV next to the notebook's data so later cells read from DATA_DIR.
shutil.copy(source_csv_path, target_csv_path)

In [None]:
import pandas as pd

# Load the copied CSV and take a first look at it.
data = pd.read_csv(target_csv_path)
print(data.head())
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
data.info()
# Missing-value count per column.
print(data.isnull().sum())

In [None]:
# Feature Engineering

def add_features(df):
    """Return a new frame with ``FamilySize`` and ``IsAlone`` columns added.

    ``FamilySize`` is ``SibSp + Parch + 1``; ``IsAlone`` is 1 where
    ``FamilySize`` equals 1 and 0 otherwise. The input frame is left untouched.
    """
    family_size = df['SibSp'] + df['Parch'] + 1
    is_alone = (family_size == 1).astype(int)
    # assign() works on a copy, so the caller's dataframe is not modified.
    return df.assign(FamilySize=family_size, IsAlone=is_alone)

# Features to drop
drop_features = [
    'PassengerId',
    'Name',
    'Ticket',
    'Cabin',
    'SibSp',
    'Parch',
]

In [None]:
def drop_unused_features(X):
    """Return ``X`` without the columns listed in ``drop_features``."""
    return X.drop(columns=drop_features)


# Feature-engineering stage: add the derived columns, then drop the unused ones.
# A named function is used for the drop step instead of a lambda because a
# FunctionTransformer wrapping a lambda cannot be pickled, which would prevent
# saving the fitted pipeline.
feature_eng = Pipeline(steps=[
    ('add_features', FunctionTransformer(add_features)),
    ('drop_features', FunctionTransformer(drop_unused_features)),
])

In [None]:
# Column groups routed to each preprocessing branch
numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'IsAlone']

# Numeric branch: fill gaps with the median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown="ignore" is set on the encoder)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(categorical_steps)




In [None]:
# Route each column group through its own preprocessing branch.
column_branches = [
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(
    transformers=column_branches,
    # Any column not named in a branch above is forwarded unchanged
    # rather than being dropped.
    remainder='passthrough',
)



In [None]:
# Full model: feature engineering -> preprocessing -> logistic regression.
# random_state is fixed so that solvers which shuffle the data (the
# hyperparameter search tries 'liblinear' and 'saga') give reproducible
# results, matching the seed used for the split and the CV folds.
pipeline = Pipeline(steps=[
    ('feature_eng', feature_eng),
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

In [None]:
# Search space for the 'classifier' step (step-name__parameter syntax):
# regularisation strength C, penalty type, and solver.
param_distributions = dict(
    classifier__C=loguniform(1e-3, 1e3),
    classifier__penalty=['l1', 'l2'],
    classifier__solver=['liblinear', 'saga'],
)

# Shuffled, seeded 5-fold stratified cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized hyperparameter search: 50 sampled candidates scored by accuracy
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter=50,
    scoring='accuracy',
    cv=skf,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

In [None]:
# Separate the target from the features
X = data.drop('Survived', axis=1)
y = data['Survived']

# Hold out 20% for testing. stratify=y keeps the class ratio the same in both
# splits, consistent with the StratifiedKFold used for cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Time the randomized search. perf_counter() is used because it is a
# monotonic clock meant for measuring elapsed intervals.
start_time = time.perf_counter()
random_search.fit(X_train, y_train)
end_time = time.perf_counter()

print(f"Randomized Search took {end_time - start_time:.2f} seconds")



In [None]:
# Score the best estimator found by the search on the held-out test split
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Best Hyperparameters:", random_search.best_params_)
print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)



In [None]:
# Heatmap of the test-set confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Reds', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix of Logistic Regression Classifier')
plt.show()

In [None]:
# Function to get feature names after ColumnTransformer
def get_feature_names(column_transformer):
    """Collect output feature names from a fitted ColumnTransformer-like object.

    Walks ``column_transformer.transformers_`` in order and concatenates the
    names each entry contributes:

    * ``'drop'`` entries contribute nothing.
    * ``'passthrough'`` entries (including a pass-through remainder)
      contribute their own column names, so the result stays aligned with
      the transformed matrix.
    * A ``Pipeline`` is represented by its last step.
    * Any other transformer is asked via ``get_feature_names_out`` when it
      has that method; otherwise its input column names are used.

    Integer column entries are treated as positions and mapped to names
    through ``feature_names_in_`` when the object exposes that attribute.

    Parameters:
        column_transformer: object exposing ``transformers_`` as an iterable
            of ``(name, transformer, columns)`` triples.

    Returns:
        list: output feature names in transformer order.
    """
    input_names = getattr(column_transformer, "feature_names_in_", None)

    def _is_position(col):
        # bool is a subclass of int, so exclude it explicitly.
        return isinstance(col, (int, np.integer)) and not isinstance(col, bool)

    def _resolve(columns):
        """Map positional column entries to names where that is possible."""
        if isinstance(columns, str):
            # A single column name must not be iterated character by character.
            return [columns]
        if input_names is None or isinstance(columns, slice):
            return columns
        return [input_names[col] if _is_position(col) else col for col in columns]

    output_features = []
    for _name, transformer, columns in column_transformer.transformers_:
        # Dropped columns never reach the output, so they have no names.
        if isinstance(transformer, str) and transformer == "drop":
            continue

        columns = _resolve(columns)

        # Remaining string specifier ('passthrough'): columns keep their names.
        if isinstance(transformer, str):
            output_features.extend(columns)
            continue

        if isinstance(transformer, Pipeline):
            transformer = transformer.steps[-1][1]  # Last step

        if hasattr(transformer, "get_feature_names_out"):
            names = transformer.get_feature_names_out(columns)
        else:
            names = columns

        output_features.extend(names)

    return output_features

# Look up the preprocessing step of the best model and read its output column names
best_preprocessor = best_model.named_steps['preprocessor']
feature_names = get_feature_names(best_preprocessor)

# Displaying feature importances

def plot_feature_importances(model, feature_names):
    """Bar-plot the classifier coefficients of a fitted pipeline.

    Reads the first row of ``coef_`` from the pipeline's ``'classifier'``
    step and plots the values sorted from largest to smallest (signed, not
    absolute), labelling each bar with the matching entry of
    ``feature_names``. If the classifier has no ``coef_`` attribute, a
    message is printed and nothing is plotted.

    Parameters:
        model: object whose ``named_steps['classifier']`` is the estimator.
        feature_names: sequence of names, one per coefficient, in the same
            order as the coefficients.

    Raises:
        ValueError: if the number of names differs from the number of
            coefficients (the bars could otherwise be silently mislabelled).
    """
    classifier = model.named_steps['classifier']
    if not hasattr(classifier, 'coef_'):
        print("The classifier does not have feature importances.")
        return

    importances = classifier.coef_[0]
    # Validate before drawing so a mismatch does not leave a half-built figure.
    if len(feature_names) != len(importances):
        raise ValueError(
            f"Got {len(feature_names)} feature names for "
            f"{len(importances)} coefficients."
        )

    # Descending order of the signed coefficient values.
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(12,6))
    plt.title("Feature Importances")
    plt.bar(range(len(importances)), importances[indices])
    plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
    plt.tight_layout()
    plt.show()

# Plot the coefficients of the best model against the preprocessed feature names
plot_feature_importances(model=best_model, feature_names=feature_names)