In [2]:
# Question 2: Feature Engineering & Hyperparameter Tuning on the Titanic Dataset

# Step 1: Load the Titanic dataset (assumes a file named titanic.csv is available).
# Step 2: Create features and handle missing values.
# Step 3: Train a pipeline using a Random Forest with GridSearchCV.
# Step 4: Evaluate the tuned model with cross-validation.

import re

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def load_and_process_titanic_data(file_path='titanic.csv', test_size=0.2, random_state=42, cv=5):
    """
    Loads, preprocesses, performs feature engineering, trains a Random Forest model
    with hyperparameter tuning using GridSearchCV, and evaluates the model
    on the Titanic dataset.  Includes comprehensive error handling.

    Args:
        file_path (str): Path to the CSV file containing the Titanic dataset.
        test_size (float): Proportion of the data to use for testing.
        random_state (int): Random seed for reproducibility.
        cv (int): Number of cross-validation folds.

    Returns:
        tuple: A tuple containing:
            - roc_auc (float): Mean ROC AUC score from cross-validation.
            - classification_report_str (str): Classification report on the test set.
            - confusion_matrix_array (np.ndarray): Confusion matrix on the test set.
            - best_model (Pipeline): The best trained pipeline.
    """
    try:
        # Step 1: Load the Titanic dataset
        print(f"Loading Titanic dataset: {file_path}")
        data = pd.read_csv(file_path)
        print("Dataset loaded successfully.")

        # Basic Exploration
        print("\n--- Dataset Information ---")
        data.info()
        print("\n--- First 5 rows of the dataset ---")
        print(data.head())

        # Check for missing values
        if data.isnull().sum().any():
            print("\nMissing values found in the dataset. Handling them with imputation.")
            print(data.isnull().sum())

        # Check for duplicates
        if data.duplicated().any():
            print("Duplicate rows found.  Removing them.")
            data = data.drop_duplicates().reset_index(drop=True)

        # Step 2: Feature Engineering and Data Cleaning
        print("\n--- Feature Engineering and Data Cleaning ---")

        # Function to extract title from Name
        def get_title(name):
            if pd.isna(name):
                return "Unknown"
            title_search = re.search(' ([A-Za-z]+)\.', name)
            if title_search:
                return title_search.group(1)
            return "Unknown"

        # Apply the function to create the Title feature
        data['Title'] = data['Name'].apply(get_title)

        # Grouping titles
        data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        data['Title'] = data['Title'].replace(['Ms', 'Mlle'], 'Miss')
        data['Title'] = data['Title'].replace('Mme', 'Mrs')

        # Create FamilySize
        data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

        # Create IsAlone
        data['IsAlone'] = np.where(data['FamilySize'] == 1, 1, 0)

        # Drop unnecessary columns
        data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

        # Separate features and target
        X = data.drop('Survived', axis=1)
        y = data['Survived']

        if X.empty or y.empty:
            raise ValueError("Features (X) or target (y) are empty after feature engineering.")

        # Split data *before* preprocessing
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)

        # Define categorical and numerical features *after* the split
        categorical_features = X_train.select_dtypes(include=['object']).columns
        numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns

        # Step 3: Create a pipeline with preprocessing and a Random Forest model
        print("\n--- Creating and training the pipeline ---")
        # Define the preprocessor
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                ]), numerical_features),
                ('cat', Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
                ]), categorical_features)
            ])

        # Define the pipeline
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', RandomForestClassifier(random_state=random_state))])

        # Define the parameter grid for GridSearchCV
        param_grid = {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }

        # Instantiate GridSearchCV
        grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='roc_auc', verbose=1, error_score='raise')  # Changed error_score

        # Train the model using GridSearchCV
        grid_search.fit(X_train, y_train)

        # Get the best model
        best_model = grid_search.best_estimator_
        print("\nBest parameters found:", grid_search.best_params_)

        # Step 4: Evaluate the tuned model
        print("\n--- Evaluating the tuned model ---")
        # Cross-validation on the *training* data
        cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='roc_auc')
        roc_auc = np.mean(cv_scores)  # Mean CV score
        print(f"Mean ROC AUC Score from {cv}-fold cross-validation (training data): {roc_auc:.4f}")

        # Evaluation on the test set
        y_pred = best_model.predict(X_test)
        y_pred_proba = best_model.predict_proba(X_test)[:, 1]
        test_roc_auc = roc_auc_score(y_test, y_pred_proba) # ROC AUC on test
        print(f"ROC AUC Score on the test set: {test_roc_auc:.4f}")

        classification_report_str = classification_report(y_test, y_pred)
        print("\nClassification Report on the test set:")
        print(classification_report_str)

        confusion_matrix_array = confusion_matrix(y_test, y_pred)
        print("\nConfusion Matrix on the test set:")
        print(confusion_matrix_array)

        return roc_auc, classification_report_str, confusion_matrix_array, best_model

    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please check the file path.")
        return None, None, None, None
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{file_path}' is empty.")
        return None, None, None, None
    except pd.errors.ParserError:
        print(f"Error: Failed to parse the file '{file_path}'. Ensure it's a valid CSV format.")
        return None, None, None, None
    except ValueError as ve:
        print(f"ValueError: {ve}")
        return None, None, None, None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None, None, None, None

if __name__ == "__main__":
    # Run the complete workflow against the CSV expected in the working directory.
    (
        roc_auc,
        classification_report_str,
        confusion_matrix_array,
        best_model,
    ) = load_and_process_titanic_data(file_path='titanic.csv')

    # A None score means the run failed; only a successful run is summarised.
    if roc_auc is not None:
        print(
            "\n--- Summary ---",
            f"Mean Cross-Validation ROC AUC Score: {roc_auc:.4f}",
            "\nTest Set Performance:",
            classification_report_str,
            confusion_matrix_array,
            sep="\n",
        )

Loading Titanic dataset: titanic.csv
Error: The file 'titanic.csv' was not found. Please check the file path.
