In [1]:
# Question 1: Load & Explore the Credit Card Fraud Detection Dataset

# Step 1: Load the dataset from a CSV (assume you have a file named creditcard.csv).
# Step 2: Split the data.
# Step 3: Train a Logistic Regression model.
# Step 4: Evaluate using ROC AUC score.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import logging
import time

# Root logger setup: show INFO and above, each record prefixed with a
# timestamp and its severity level so pipeline steps are easy to follow.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def load_and_evaluate_fraud_data(file_path='creditcard.csv', test_size=0.3, random_state=42):
    """
    Load a fraud dataset, train a Logistic Regression model, and evaluate it.

    The CSV is read with pandas, the 'Class' column is used as the target and
    every other column as a feature. The data is split with stratification on
    the target, a Logistic Regression model (liblinear solver,
    class_weight='balanced') is fitted on the training part, and the test part
    is scored with ROC AUC, a classification report and a confusion matrix.
    Progress and results are written through the root logger.

    Args:
        file_path (str): The path to the CSV file containing the dataset.
        test_size (float): The proportion of the dataset to use for testing.
        random_state (int): Random seed for reproducibility.

    Returns:
        tuple: ``(roc_auc, classification_report_str)`` on success, or
        ``(None, None)`` if the file is missing, empty, unparsable, lacks the
        'Class' column, or any other error occurs. Failures are logged rather
        than raised.
    """
    target_column = 'Class'
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    try:
        logging.info("Step 1: Loading dataset from %s", file_path)
        data = pd.read_csv(file_path)
        logging.info("Dataset loaded successfully. Shape: %s", data.shape)

        # Fail early with a precise message instead of letting the drop()
        # below raise a KeyError that would be reported as "unexpected".
        if target_column not in data.columns:
            logging.error(
                "Error: The file '%s' is missing the required target column '%s'.",
                file_path,
                target_column,
            )
            return None, None

        logging.info("Step 2: Splitting the data into features (X) and target (y)")
        X = data.drop(target_column, axis=1)
        y = data[target_column]
        logging.info("Features (X) shape: %s, Target (y) shape: %s", X.shape, y.shape)

        logging.info(
            "Splitting data into training and testing sets "
            "(test_size=%s, random_state=%s, stratify=y)",
            test_size,
            random_state,
        )
        # stratify=y keeps the class proportions the same in both splits.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        logging.info("Training set size: %s, Testing set size: %s", len(X_train), len(X_test))
        logging.info("Class distribution in training set:\n%s", y_train.value_counts(normalize=True))
        logging.info("Class distribution in testing set:\n%s", y_test.value_counts(normalize=True))

        logging.info("Step 3: Training a Logistic Regression model")
        # Handling class imbalance by setting class_weight='balanced'
        model = LogisticRegression(
            solver='liblinear', random_state=random_state, class_weight='balanced'
        )
        model.fit(X_train, y_train)
        logging.info("Logistic Regression model trained successfully.")

        logging.info("Step 4: Evaluating the model using ROC AUC score")
        # Column 1 of predict_proba is taken as the positive-class score.
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        logging.info("ROC AUC Score on the test set: %.4f", roc_auc)

        y_pred = model.predict(X_test)
        classification_report_str = classification_report(y_test, y_pred)
        logging.info("\nClassification Report on the test set:\n%s", classification_report_str)

        confusion = confusion_matrix(y_test, y_pred)
        logging.info("\nConfusion Matrix on the test set:\n%s", confusion)

        logging.info("Total execution time: %.2f seconds", time.perf_counter() - start_time)

        return roc_auc, classification_report_str

    except FileNotFoundError:
        logging.error("Error: The file '%s' was not found. Please check the file path.", file_path)
        return None, None
    except pd.errors.EmptyDataError:
        logging.error("Error: The file '%s' is empty.", file_path)
        return None, None
    except pd.errors.ParserError:
        logging.error(
            "Error: Failed to parse the file '%s'. Ensure it's a valid CSV format.", file_path
        )
        return None, None
    except ValueError as ve:
        logging.error("ValueError: %s", ve)
        return None, None
    except Exception as e:
        # Last-resort boundary: logging.exception keeps the traceback so an
        # unforeseen failure can still be diagnosed from the log.
        logging.exception("An unexpected error occurred: %s", e)
        return None, None

if __name__ == "__main__":
    # Run the whole pipeline against the default dataset path.
    roc_auc, classification_report_str = load_and_evaluate_fraud_data(file_path='creditcard.csv')

    # (None, None) signals a failure that the function has already logged,
    # so the summary is printed only when a score was produced.
    if roc_auc is not None:
        print("\n--- Evaluation Summary ---")
        print(f"Final ROC AUC Score: {roc_auc:.4f}")
        # The f-prefix is required: without it the placeholder text
        # "{classification_report_str}" is printed literally instead of the report.
        print(f"\nFinal Classification Report:\n{classification_report_str}")

2025-05-07 04:34:29,950 - INFO - Step 1: Loading dataset from creditcard.csv
2025-05-07 04:34:29,951 - ERROR - Error: The file 'creditcard.csv' was not found. Please check the file path.
