In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                            confusion_matrix, classification_report, roc_auc_score,
                            roc_curve, precision_recall_curve, average_precision_score)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Set plotting style: apply a matplotlib style sheet to every figure created
# below, then make 'viridis' the default seaborn colour palette.
# NOTE(review): the 'seaborn-v0_8-*' style-sheet names presumably require a
# recent matplotlib release -- confirm against the installed version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')


In [None]:
# Create a sample dataset for demonstration (similar to the Credit Card Fraud Detection dataset)
np.random.seed(42)
n_samples = 10000

# Create feature columns (V1-V28 plus Time and Amount)
cols = ['Time'] + [f'V{i}' for i in range(1, 29)] + ['Amount']

# Derive the feature count from the column list so the two can never disagree.
n_features = len(cols)

# Generate feature data (standard-normal noise for every column)
X = np.random.randn(n_samples, n_features)

# Generate target variable (fraud=1, normal=0) with imbalance (0.2% fraud).
# Labels are stored as integers: they are class ids, and float labels would
# print as "20.0" / "0.0" / "1.0" in every downstream count and report.
fraud_ratio = 0.002
n_fraud = int(n_samples * fraud_ratio)
y = np.zeros(n_samples, dtype=int)
fraud_indices = np.random.choice(range(n_samples), size=n_fraud, replace=False)
y[fraud_indices] = 1

# Create DataFrame
df = pd.DataFrame(X, columns=cols)
df['Class'] = y

# Make Time and Amount more realistic (these overwrite the noise drawn above;
# the draw order is kept so the seeded random stream stays the same)
df['Time'] = np.random.uniform(0, 172800, n_samples)  # Time in seconds (2 days)
df['Amount'] = np.exp(np.random.normal(3, 1, n_samples))  # Log-normal distribution for amounts

# Display info about the dataset
print(f"Dataset shape: {df.shape}")
print(f"Number of fraudulent transactions: {df['Class'].sum()}")
print(f"Fraud percentage: {df['Class'].mean() * 100:.3f}%")


In [None]:
# Define feature engineering function
def add_features(df):
    """Return a copy of ``df`` with engineered features appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Time`` (seconds), ``Amount``, ``V1``,
        ``V2`` and ``V3``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these extra columns:
        ``Hour`` (hour of day, 0-23), ``Hour_sin`` / ``Hour_cos`` (cyclical
        encoding of ``Hour``), ``Amount_log`` (``log1p`` of ``Amount``) and
        the pairwise products ``V1_V2``, ``V1_V3``, ``V2_V3``.
    """
    df = df.copy()

    # Create time-based features.
    # Time can cover more than one day, so the elapsed-hour count is wrapped
    # to hour-of-day; otherwise 'Hour' would run past 23 and disagree with the
    # 24-hour period used by the cyclical encoding below.
    df['Hour'] = (df['Time'] // 3600) % 24
    hour_angle = 2 * np.pi * df['Hour'] / 24
    df['Hour_sin'] = np.sin(hour_angle)  # Cyclical encoding
    df['Hour_cos'] = np.cos(hour_angle)  # Cyclical encoding

    # Log transform for Amount (common for financial data)
    df['Amount_log'] = np.log1p(df['Amount'])

    # Create interaction features between selected V features
    df['V1_V2'] = df['V1'] * df['V2']
    df['V1_V3'] = df['V1'] * df['V3']
    df['V2_V3'] = df['V2'] * df['V3']

    return df

# Define a custom transformer for feature engineering
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``add_features`` to its input."""

    def fit(self, X, y=None):
        """Learn nothing from the data and return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the engineered columns produced by ``add_features``."""
        engineered = add_features(X)
        return engineered


In [None]:
# Separate the label column from the features
y = df['Class']
X = df.drop(columns='Class')

# Hold out 25% of the rows, stratified on the label so both splits keep the
# same fraud ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

print(f"Training set: {len(X_train)} samples, {y_train.sum()} frauds ({y_train.mean() * 100:.3f}%)")
print(f"Test set: {len(X_test)} samples, {y_test.sum()} frauds ({y_test.mean() * 100:.3f}%)")

# Define column types. These lists are the single source of truth for which
# columns each branch of the ColumnTransformer below receives.
amount_columns = ['Amount']
time_columns = ['Time']
v_columns = [col for col in X.columns if col.startswith('V')]
hour_columns = ['Hour', 'Hour_sin', 'Hour_cos']
amount_derived_columns = ['Amount_log']
interaction_columns = ['V1_V2', 'V1_V3', 'V2_V3']


def _impute_then_scale(scaler):
    """Build a two-step pipeline: median imputation followed by ``scaler``."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', scaler),
    ])


# Create preprocessing pipeline: engineer features first, then impute and
# scale each column group. The output column order follows the order of the
# transformers listed here.
preprocessor = Pipeline(steps=[
    ('feature_engineer', FeatureEngineer()),
    ('column_transformer', ColumnTransformer(
        transformers=[
            ('amount', _impute_then_scale(RobustScaler()),
             amount_columns + amount_derived_columns),
            ('time', _impute_then_scale(StandardScaler()), time_columns),
            ('hour', _impute_then_scale(StandardScaler()), hour_columns),
            ('v_features', _impute_then_scale(StandardScaler()),
             v_columns + interaction_columns),
        ]
    ))
])


In [None]:
# Function to evaluate a model with various metrics
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """
    Evaluate a fitted binary classifier on the test set; print and plot its metrics.

    Parameters:
    -----------
    model : scikit-learn model
        The trained model to evaluate. Must provide ``predict``; if it also
        provides ``predict_proba``, ROC / precision-recall metrics and curves
        are produced as well.
    X_train : array-like
        Training features (not used by the evaluation; kept so existing
        callers keep working)
    X_test : array-like
        Test features
    y_train : array-like
        Training labels (not used by the evaluation; kept so existing
        callers keep working)
    y_test : array-like
        Test labels, encoded as 0 (normal) and 1 (fraud)
    model_name : str
        Name of the model for reporting

    Returns:
    --------
    dict
        Dictionary with the keys 'model_name', 'accuracy', 'precision',
        'recall', 'f1', 'roc_auc' and 'pr_auc'. The last two are None when
        the model has no ``predict_proba``.
    """
    # Make predictions
    y_pred = model.predict(X_test)

    # For ROC and precision-recall curves, we need probabilities.
    # Only a missing predict_proba is tolerated; any error raised while
    # computing the probabilities is allowed to surface instead of being hidden.
    predict_proba = getattr(model, 'predict_proba', None)
    if callable(predict_proba):
        y_pred_proba = predict_proba(X_test)[:, 1]
    else:
        y_pred_proba = None

    # Calculate metrics. With a rare positive class a model may predict no
    # frauds at all; zero_division=0 reports 0 for the undefined ratio
    # explicitly instead of relying on the warn-and-return-0 default.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # ROC AUC and PR AUC (average precision)
    if y_pred_proba is not None:
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        pr_auc = average_precision_score(y_test, y_pred_proba)
    else:
        roc_auc = None
        pr_auc = None

    # Print results
    print(f"=== {model_name} Evaluation ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    if roc_auc is not None:
        print(f"ROC AUC: {roc_auc:.4f}")
        print(f"PR AUC: {pr_auc:.4f}")

    print("\nConfusion Matrix:")
    # Pin both classes so the matrix is always 2x2 and lines up with the
    # 'Normal' / 'Fraud' tick labels, even if one class is absent.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    print(cm)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    # Visualize confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Fraud'],
                yticklabels=['Normal', 'Fraud'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

    # Plot ROC curve if probabilities are available
    if y_pred_proba is not None:
        plt.figure(figsize=(8, 6))
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})')
        plt.plot([0, 1], [0, 1], 'k--', label='Random')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {model_name}')
        plt.legend()
        plt.show()

        # Plot Precision-Recall curve
        plt.figure(figsize=(8, 6))
        precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_proba)
        plt.plot(recall_curve, precision_curve, label=f'PR Curve (AUC = {pr_auc:.4f})')
        # Add a line for the baseline (percentage of positive class)
        positive_rate = y_test.mean()
        plt.axhline(y=positive_rate, color='r', linestyle='--', label=f'Baseline ({positive_rate:.4f})')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'Precision-Recall Curve - {model_name}')
        plt.legend()
        plt.show()

    # Return metrics dictionary for later comparison
    metrics_dict = {
        'model_name': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc
    }

    return metrics_dict


In [None]:
# Fit the preprocessing pipeline on the training split only, then reuse the
# fitted pipeline on the test split
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Oversample the processed training data with SMOTE (the test split is left as is)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)


def _print_class_distribution(header, labels):
    """Print ``header`` followed by the count and share of each class in ``labels``."""
    print(header)
    print(f"Class 0 (Normal): {(labels == 0).sum()} ({(1 - labels.mean()) * 100:.2f}%)")
    print(f"Class 1 (Fraud): {(labels == 1).sum()} ({labels.mean() * 100:.2f}%)")


# Class balance before and after resampling
_print_class_distribution("Class distribution before SMOTE:", y_train)
_print_class_distribution("\nClass distribution after SMOTE:", y_train_resampled)

# Shapes of every matrix produced in this cell
print(
    f"\nProcessed training data shape: {X_train_processed.shape}",
    f"Resampled training data shape: {X_train_resampled.shape}",
    f"Processed test data shape: {X_test_processed.shape}",
    sep="\n",
)


In [None]:
# Fit a logistic regression classifier on the resampled training data
lr_model = LogisticRegression(
    C=1.0,
    class_weight=None,
    max_iter=1000,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Score it on the processed (not resampled) test split
lr_metrics = evaluate_model(
    model=lr_model,
    X_train=X_train_resampled,
    X_test=X_test_processed,
    y_train=y_train_resampled,
    y_test=y_test,
    model_name="Logistic Regression",
)


In [None]:
# Fit a depth-limited decision tree on the resampled training data
dt_model = DecisionTreeClassifier(
    max_depth=5,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Score it on the processed (not resampled) test split
dt_metrics = evaluate_model(
    model=dt_model,
    X_train=X_train_resampled,
    X_test=X_test_processed,
    y_train=y_train_resampled,
    y_test=y_test,
    model_name="Decision Tree",
)


In [None]:
# Fit a 100-tree random forest on the resampled training data
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Score it on the processed (not resampled) test split
rf_metrics = evaluate_model(
    model=rf_model,
    X_train=X_train_resampled,
    X_test=X_test_processed,
    y_train=y_train_resampled,
    y_test=y_test,
    model_name="Random Forest",
)


In [None]:
# Fit a linear-kernel SVM on the resampled training data.
# Note: SVM training can be slow on large datasets; the linear kernel is used here.
# probability=True is set so evaluate_model can call predict_proba on it.
svm_model = SVC(
    kernel='linear',
    C=1.0,
    probability=True,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Score it on the processed (not resampled) test split
svm_metrics = evaluate_model(
    model=svm_model,
    X_train=X_train_resampled,
    X_test=X_test_processed,
    y_train=y_train_resampled,
    y_test=y_test,
    model_name="Support Vector Machine",
)


In [None]:
# One row per model, built from the dictionaries returned by evaluate_model
metric_rows = [lr_metrics, dt_metrics, rf_metrics, svm_metrics]
all_metrics = pd.DataFrame(metric_rows)

# Comparison table: models as the index, one column per score
print("Model Performance Comparison:")
score_columns = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'pr_auc']
display_metrics = all_metrics.set_index('model_name')[score_columns]
display_metrics


In [None]:
# Grouped bar charts comparing the models: first the threshold-based scores
# (accuracy, precision, recall, F1), then the ranking scores (ROC AUC, PR AUC)
model_names = all_metrics['model_name']
index = np.arange(len(model_names))

chart_specs = (
    # (title, bar width, {metric column: legend label})
    ('Model Performance Comparison', 0.2,
     {'accuracy': 'Accuracy', 'precision': 'Precision', 'recall': 'Recall', 'f1': 'F1'}),
    ('ROC AUC and PR AUC Comparison', 0.35,
     {'roc_auc': 'ROC AUC', 'pr_auc': 'PR AUC'}),
)

for chart_title, bar_width, legend_labels in chart_specs:
    metrics_to_plot = list(legend_labels)
    fig, ax = plt.subplots(figsize=(12, 6))

    # One bar series per metric, each shifted right by one bar width
    for i, metric in enumerate(metrics_to_plot):
        ax.bar(index + i*bar_width, all_metrics[metric], bar_width,
               label=legend_labels[metric])

    ax.set_xlabel('Model')
    ax.set_ylabel('Score')
    ax.set_title(chart_title)
    # Centre each tick under its group of bars
    ax.set_xticks(index + bar_width * (len(metrics_to_plot) - 1) / 2)
    ax.set_xticklabels(model_names)
    ax.legend()
    ax.set_ylim(0, 1.0)
    fig.tight_layout()
    plt.show()


In [None]:
# Get feature names after preprocessing.
# The ColumnTransformer emits its outputs in the order its transformers are
# listed (amount, time, hour, v_features), so the names are rebuilt by hand in
# that same order.
feature_names = (
    ['Amount', 'Amount_log']              # Amount features
    + ['Time']                            # Time feature
    + ['Hour', 'Hour_sin', 'Hour_cos']    # Hour features
    + list(v_columns)                     # V features
    + ['V1_V2', 'V1_V3', 'V2_V3']         # Interaction features
)

# Extract feature importance from Random Forest (most interpretable from our models)
if hasattr(rf_model, 'feature_importances_'):
    importances = rf_model.feature_importances_

    # Fail loudly on a mismatch: slicing the name list to fit would silently
    # attach the wrong names to the importances.
    if len(feature_names) != len(importances):
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but the model "
            f"reports {len(importances)} importances; update the name list to "
            "match the preprocessing pipeline."
        )

    # Create a DataFrame for better visualization, most important first
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    top_features = importance_df.head(15)

    # Plot top 15 features
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=top_features)
    plt.title('Top 15 Features by Importance (Random Forest)')
    plt.tight_layout()
    plt.show()

    # Print top 15 features
    print("Top 15 most important features:")
    print(top_features)


In [None]:
# For logistic regression, we can look at the coefficients
if hasattr(lr_model, 'coef_'):
    coef = lr_model.coef_[0]

    # Fail loudly on a mismatch: slicing the name list to fit would silently
    # attach the wrong names to the coefficients.
    if len(feature_names) != len(coef):
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but the model "
            f"has {len(coef)} coefficients; update the name list to match the "
            "preprocessing pipeline."
        )

    # Create a DataFrame for better visualization
    coef_df = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': coef
    })

    # Sort by absolute coefficient value
    coef_df['Abs_Coefficient'] = np.abs(coef_df['Coefficient'])
    coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)

    # Take the top 15 once and reuse it for both the plot and the printout
    top_coefs = coef_df.head(15)

    # Plot top 15 features: red bars for negative coefficients, green otherwise
    bar_colors = ['red' if x < 0 else 'green' for x in top_coefs['Coefficient']]
    plt.figure(figsize=(12, 8))
    plt.barh(y=top_coefs['Feature'], width=top_coefs['Coefficient'], color=bar_colors)
    plt.title('Top 15 Features by Coefficient Magnitude (Logistic Regression)')
    plt.xlabel('Coefficient Value')
    plt.ylabel('Feature')
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Print top 15 features
    print("Top 15 features by coefficient magnitude:")
    print(top_coefs[['Feature', 'Coefficient']])
