In [1]:
# Question 1: Load & Explore the Credit Card Fraud Detection Dataset

# Step 1: Load the dataset from a CSV (assume you have a file named creditcard.csv).
# Step 2: Split the data.
# Step 3: Train a Logistic Regression model.
# Step 4: Evaluate using ROC AUC score.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import numpy as np

def load_and_explore_fraud_data(file_path='creditcard.csv', test_size=0.3, random_state=42):
    """
    Load, explore, split, train and evaluate a credit card fraud dataset.

    The CSV must contain a 'Class' column, which is used as the target; all
    remaining columns are used as features. Progress, exploration output and
    evaluation results are printed to stdout.

    Errors are not propagated to the caller: every failure is reported with
    a printed message and signalled by a tuple of four ``None`` values.

    Args:
        file_path (str): The path to the CSV file containing the dataset.
        test_size (float): The proportion of the dataset to use for testing (0.0 to 1.0).
        random_state (int): Random seed for reproducibility.

    Returns:
        tuple: On success, a tuple containing:
            - roc_auc (float): The ROC AUC score on the test set.
            - classification_report_str (str): The classification report on the test set.
            - confusion_matrix_array (np.ndarray): The confusion matrix on the test set.
            - model (LogisticRegression): The trained Logistic Regression model.
        On any failure (missing/empty/unparseable file, missing 'Class'
        column, invalid data, or any other exception) the tuple is
        ``(None, None, None, None)``.
    """
    try:
        # Step 1: Load the dataset
        print(f"Loading dataset from: {file_path}")
        data = pd.read_csv(file_path)
        print("Dataset loaded successfully.")

        # Fail early with a descriptive message when the target column is
        # absent; otherwise the first data['Class'] access raises a KeyError
        # that is reported only as an opaque "unexpected error".
        if 'Class' not in data.columns:
            raise ValueError(
                f"Required target column 'Class' not found in '{file_path}'. "
                f"Available columns: {list(data.columns)}"
            )

        # Basic exploration
        print("\n--- Dataset Information ---")
        data.info()
        print("\n--- First 5 rows of the dataset ---")
        print(data.head())
        print("\n--- Class Distribution ---")
        print(data['Class'].value_counts(normalize=True))

        # Check for missing values (reported only; no imputation is done here)
        missing_per_column = data.isnull().sum()
        if missing_per_column.any():
            print("\nWarning: Missing values found in the dataset. Consider imputation.")
            print(missing_per_column)

        # Check for duplicate rows and drop them before splitting, so the
        # same row cannot end up in both the training and the testing set.
        duplicate_mask = data.duplicated()
        if duplicate_mask.any():
            print("\nWarning: Duplicate rows found in the dataset. Consider handling them.")
            print(f"Number of duplicate rows: {duplicate_mask.sum()}")
            data = data.drop_duplicates().reset_index(drop=True)
            print("Duplicate rows removed.")

        # Step 2: Split the data
        print("\n--- Splitting the data ---")
        X = data.drop('Class', axis=1)
        y = data['Class']

        if X.empty or y.empty:
            raise ValueError("Features (X) or target (y) are empty after dropping 'Class'.")

        # stratify=y keeps the class proportions the same in both subsets.
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=random_state,
            stratify=y,
        )
        print(f"Training set size: {len(X_train)}")
        print(f"Testing set size: {len(X_test)}")
        print(f"Class distribution in training set:\n{y_train.value_counts(normalize=True)}")
        print(f"Class distribution in testing set:\n{y_test.value_counts(normalize=True)}")

        # Step 3: Train a Logistic Regression model
        print("\n--- Training Logistic Regression model ---")
        model = LogisticRegression(
            solver='liblinear',
            random_state=random_state,
            class_weight='balanced',
        )
        model.fit(X_train, y_train)
        print("Model trained successfully.")

        # Step 4: Evaluate the model
        print("\n--- Evaluating the model ---")
        # ROC AUC is computed from the predicted probability of class 1,
        # not from the hard 0/1 predictions.
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        print(f"ROC AUC Score on the test set: {roc_auc:.4f}")

        y_pred = model.predict(X_test)
        classification_report_str = classification_report(y_test, y_pred)
        print("\nClassification Report on the test set:")
        print(classification_report_str)

        confusion_matrix_array = confusion_matrix(y_test, y_pred)
        print("\nConfusion Matrix on the test set:")
        print(confusion_matrix_array)

        return roc_auc, classification_report_str, confusion_matrix_array, model

    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please check the file path.")
        return None, None, None, None
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{file_path}' is empty.")
        return None, None, None, None
    except pd.errors.ParserError:
        print(f"Error: Failed to parse the file '{file_path}'. Ensure it's a valid CSV format.")
        return None, None, None, None
    except ValueError as ve:
        print(f"ValueError: {ve}")
        return None, None, None, None
    except Exception as e:
        # Top-level boundary of this script-style function: report and
        # signal failure through the None tuple rather than crashing.
        print(f"An unexpected error occurred: {e}")
        return None, None, None, None

if __name__ == "__main__":
    # Run the whole pipeline against the default dataset location.
    (
        roc_auc,
        classification_report_str,
        confusion_matrix_array,
        trained_model,
    ) = load_and_explore_fraud_data(file_path='creditcard.csv')

    # The function returns None values on failure (after printing the
    # reason), so only print the summary when a score was produced.
    if roc_auc is not None:
        print("\n--- Summary ---")
        print(f"Final ROC AUC Score: {roc_auc:.4f}")
        for heading, payload in (
            ("\nFinal Classification Report:", classification_report_str),
            ("\nFinal Confusion Matrix:", confusion_matrix_array),
        ):
            print(heading)
            print(payload)

Loading dataset from: creditcard.csv
Error: The file 'creditcard.csv' was not found. Please check the file path.
