In [3]:
# Question 2: Feature Engineering & Hyperparameter Tuning on the Titanic Dataset

# Step 1: Load the Titanic dataset (Assume you have a file named titanic.csv ).
# Step 2: Create features and handle missing values.
# Step 3: Train a pipeline using a Random Forest with GridSearchCV.
# Step 4: Evaluate the tuned model with cross-validation.

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import numpy as np
import logging
import re  # Import the regular expression module
import time

# Configure logging for better readability and debugging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_data(file_path='titanic.csv'):
    """
    Loads the Titanic dataset from a CSV file.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame: The loaded dataset.
    """
    try:
        logging.info(f"Loading dataset from: {file_path}")
        data = pd.read_csv(file_path)
        logging.info("Dataset loaded successfully.")
        return data
    except FileNotFoundError:
        logging.error(f"Error: The file '{file_path}' was not found. Please check the file path.")
        raise
    except pd.errors.EmptyDataError:
        logging.error(f"Error: The file '{file_path}' is empty.")
        raise
    except pd.errors.ParserError:
        logging.error(f"Error: Failed to parse the file '{file_path}'. Ensure it's a valid CSV format.")
        raise
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        raise

def explore_data(data):
    """
    Explores the dataset by printing information, head, and basic statistics.

    Args:
        data (pd.DataFrame): The input dataset.
    """
    logging.info("\n--- Dataset Exploration ---")
    logging.info("Dataset Information:")
    data.info()
    logging.info("\nFirst 5 rows of the dataset:")
    logging.info(data.head())
    logging.info("\nBasic statistics of the dataset:")
    logging.info(data.describe())

    # Check for missing values
    logging.info("\nMissing values per column:")
    logging.info(data.isnull().sum())

    # Check for duplicate rows
    logging.info(f"\nNumber of duplicate rows: {data.duplicated().sum()}")

    # Class distribution for the target variable if it exists
    if 'Survived' in data.columns:
        logging.info("\nClass distribution for 'Survived':")
        logging.info(data['Survived'].value_counts(normalize=True))

def feature_engineering(data):
    """
    Performs feature engineering on the Titanic dataset.

    Args:
        data (pd.DataFrame): The input dataset.

    Returns:
        pd.DataFrame: The transformed dataset.
    """
    logging.info("\n--- Feature Engineering ---")

    # Extract title from Name
    def get_title(name):
        if pd.isna(name):
            return "Unknown"
        title_search = re.search(' ([A-Za-z]+)\.', name)
        if title_search:
            return title_search.group(1)
        return "Unknown"

    data['Title'] = data['Name'].apply(get_title)
    logging.info("Extracted titles from 'Name' column.")

    # Grouping titles
    data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    data['Title'] = data['Title'].replace(['Ms', 'Mlle'], 'Miss')
    data['Title'] = data['Title'].replace('Mme', 'Mrs')
    logging.info("Grouped titles.")

    # Create FamilySize
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    logging.info("Created 'FamilySize' feature.")

    # Create IsAlone
    data['IsAlone'] = np.where(data['FamilySize'] == 1, 1, 0)
    logging.info("Created 'IsAlone' feature.")

    # Drop unnecessary columns
    data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    logging.info("Dropped unnecessary columns: 'PassengerId', 'Name', 'Ticket', 'Cabin'.")
    return data

def preprocess_data(X_train, X_test):
    """
    Preprocesses the training and testing data using ColumnTransformer.

    Args:
        X_train (pd.DataFrame): Training features.
        X_test (pd.DataFrame): Testing features.

    Returns:
        tuple: Preprocessed X_train and X_test as numpy arrays, and lists of
               numerical and categorical feature names.
    """
    logging.info("\n--- Data Preprocessing ---")
    numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X_train.select_dtypes(include=['object']).columns

    logging.info(f"Numerical features: {numerical_features}")
    logging.info(f"Categorical features: {categorical_features}")

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]), numerical_features),
            ('cat', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))
            ]), categorical_features)
        ])

    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)
    logging.info("Data preprocessing completed.")
    return X_train_processed, X_test_processed, numerical_features, categorical_features

def train_model(X_train, y_train, param_grid, cv=5, random_state=42):
    """
    Trains a Random Forest model with hyperparameter tuning using GridSearchCV.

    Args:
        X_train (np.ndarray): Training features.
        y_train (pd.Series): Training target.
        param_grid (dict): Parameter grid for GridSearchCV.
        cv (int): Number of cross-validation folds.
        random_state (int): Random seed.

    Returns:
        GridSearchCV: The trained GridSearchCV object.
    """
    logging.info("\n--- Model Training ---")
    pipeline = Pipeline(steps=[('classifier', RandomForestClassifier(random_state=random_state))])

    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='roc_auc', verbose=1, error_score='raise')
    grid_search.fit(X_train, y_train)
    logging.info("Model training completed.")
    return grid_search

def evaluate_model(grid_search, X_test, y_test, cv=5):
    """
    Evaluates the trained model using cross-validation and on the test set.

    Args:
        grid_search (GridSearchCV): The trained GridSearchCV object.
        X_test (np.ndarray): Testing features.
        y_test (pd.Series): Testing target.
        cv (int): Number of cross-validation folds.

    Returns:
        tuple: Mean ROC AUC score from cross-validation, classification report,
               confusion matrix, and the best trained model.
    """
    logging.info("\n--- Model Evaluation ---")
    best_model = grid_search.best_estimator_
    logging.info(f"Best parameters found: {grid_search.best_params_}")

    cv_scores = cross_val_score(best_model, X_test, y_test, cv=cv, scoring='roc_auc') # Changed to use X_test
    mean_cv_roc_auc = np.mean(cv_scores)
    logging.info(f"Mean ROC AUC Score from {cv}-fold cross-validation (test set): {mean_cv_roc_auc:.4f}") # Changed to test set

    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    test_roc_auc = roc_auc_score(y_test, y_pred_proba)
    logging.info(f"ROC AUC Score on the test set: {test_roc_auc:.4f}")

    classification_report_str = classification_report(y_test, y_pred)
    logging.info("\nClassification Report on the test set:\n%s", classification_report_str)

    confusion_matrix_array = confusion_matrix(y_test, y_pred)
    logging.info("\nConfusion Matrix on the test set:\n%s", confusion_matrix_array)

    return mean_cv_roc_auc, classification_report_str, confusion_matrix_array, best_model

def main(file_path='titanic.csv', test_size=0.2, random_state=42, cv=5):
    """
    Main function to load, preprocess, train, and evaluate the Titanic dataset.

    Args:
        file_path (str): Path to the CSV file.
        test_size (float): Proportion of the data to use for testing.
        random_state (int): Random seed.
        cv (int): Number of cross-validation folds.
    """
    start_time = time.time()
    try:
        data = load_data(file_path)
        explore_data(data)  # Explore the data
        data = feature_engineering(data)

        # Separate features and target *after* feature engineering
        X = data.drop('Survived', axis=1)
        y = data['Survived']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
        X_train_processed, X_test_processed, _, _ = preprocess_data(X_train, X_test)

        param_grid = {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }

        grid_search = train_model(X_train_processed, y_train, param_grid, cv, random_state)
        mean_cv_roc_auc, classification_report_str, confusion_matrix_array, best_model = evaluate_model(grid_search, X_test_processed, y_test, cv)

        print("\n--- Summary ---")
        print(f"Mean Cross-Validation ROC AUC Score (Test Set): {mean_cv_roc_auc:.4f}") # Changed to test set.
        print("\nTest Set Performance:")
        print(classification_report_str)
        print(confusion_matrix_array)
        print("\nBest Model:\n", best_model)

    except Exception:  # Catch any exception that occurs in the previous steps
        logging.error("An error occurred during the process. Please check the logs for details.")
    finally:
        end_time = time.time()
        logging.info(f"Total execution time: {end_time - start_time:.2f} seconds")

if __name__ == "__main__":
    main(file_path='titanic.csv')


2025-05-07 04:41:37,150 - INFO - Loading dataset from: titanic.csv
2025-05-07 04:41:37,152 - ERROR - Error: The file 'titanic.csv' was not found. Please check the file path.
2025-05-07 04:41:37,152 - ERROR - An error occurred during the process. Please check the logs for details.
2025-05-07 04:41:37,153 - INFO - Total execution time: 0.00 seconds
