In [1]:
# Cell 1: Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('seaborn-v0_8')

In [2]:
# Cell 2: Load Dataset
print("Loading credit card fraud dataset...")
# Load the dataset from OpenML
# Note: This is the first load and might take some time
cc_fraud = fetch_openml(name='credit-card-fraud', version=1, as_frame=True)

# Extract features and target
X = cc_fraud.data
y = cc_fraud.target.astype(int)

print(f"Dataset shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")
print(f"Target distribution:")
print(y.value_counts())
print(f"Fraud percentage: {y.mean():.4%}")

Loading credit card fraud dataset...


OpenMLError: Dataset credit-card-fraud with version 1 not found.

In [None]:
# Cell 3: Data Exploration
# Create a DataFrame for easier exploration
df = pd.DataFrame(X)
df['Class'] = y

# Display basic statistics
print("Basic statistics of the dataset:")
display(df.describe())

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values per column:")
print(missing_values[missing_values > 0] if any(missing_values > 0) else "No missing values found")

# Visualize class distribution
plt.figure(figsize=(10, 6))
sns.countplot(x='Class', data=df)
plt.title('Credit Card Transactions - Class Distribution')
plt.yscale('log')  # Using log scale for better visualization
plt.xlabel('Class (0: Normal, 1: Fraud)')
plt.ylabel('Count (log scale)')
plt.grid(True, alpha=0.3)
plt.show()

# Add count labels on top of bars
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='Class', data=df)
plt.title('Credit Card Transactions - Class Distribution')
plt.xlabel('Class (0: Normal, 1: Fraud)')
plt.ylabel('Count')

# Add count and percentage labels
total = len(df['Class'])
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2.,
            height + 0.1,
            f'{height}\n({height/total:.4%})',
            ha="center", fontsize=12)
plt.show()

In [None]:
# Cell 4: Data Visualization
# Select a subset of features for visualization
selected_features = ['V1', 'V2', 'V3', 'V4', 'V5', 'Amount']

# Pairplot for selected features
plt.figure(figsize=(12, 10))
sns.pairplot(df[selected_features + ['Class']], hue='Class', diag_kind='hist')
plt.suptitle('Pairplot of Selected Features', y=1.02, fontsize=16)
plt.show()

# Distribution of Amount by class
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(df[df['Class'] == 0]['Amount'], bins=50, kde=True, color='blue')
plt.title('Amount Distribution - Normal Transactions')
plt.xlabel('Amount')
plt.xlim([0, 500])  # Focus on common transaction amounts

plt.subplot(1, 2, 2)
sns.histplot(df[df['Class'] == 1]['Amount'], bins=50, kde=True, color='red')
plt.title('Amount Distribution - Fraudulent Transactions')
plt.xlabel('Amount')
plt.xlim([0, 500])

plt.tight_layout()
plt.show()

# Correlation matrix for selected features
plt.figure(figsize=(12, 10))
correlation_matrix = df[selected_features + ['Class']].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Selected Features', fontsize=16)
plt.show()

In [None]:
# Cell 5: Data Preprocessing
# Split into features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print("Data split summary:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")
print(f"Fraud ratio in training set: {y_train.mean():.4%}")
print(f"Fraud ratio in testing set: {y_test.mean():.4%}")

In [None]:
# Cell 6: Handle Class Imbalance
# Let's explore different approaches to handle class imbalance

# 1. Original imbalanced data
X_train_orig = X_train
y_train_orig = y_train

# 2. Random Undersampling
undersampler = RandomUnderSampler(sampling_strategy=0.1, random_state=42)  # Keep 10% of majority class
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)

# 3. SMOTE Oversampling
smote = SMOTE(sampling_strategy=0.1, random_state=42)  # Create synthetic samples until minority class is 10% of majority
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Display class distribution for each approach
print("Class distribution in different sampling approaches:")
print(f"Original - Fraud ratio: {y_train_orig.mean():.4%}, Total samples: {len(y_train_orig)}")
print(f"Undersampling - Fraud ratio: {y_train_under.mean():.4%}, Total samples: {len(y_train_under)}")
print(f"SMOTE - Fraud ratio: {y_train_smote.mean():.4%}, Total samples: {len(y_train_smote)}")

# Visualize class distribution after sampling
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
sns.countplot(x=y_train_orig)
plt.title('Original Data')
plt.yscale('log')
plt.xlabel('Class')

plt.subplot(1, 3, 2)
sns.countplot(x=y_train_under)
plt.title('After Undersampling')
plt.xlabel('Class')

plt.subplot(1, 3, 3)
sns.countplot(x=y_train_smote)
plt.title('After SMOTE')
plt.xlabel('Class')

plt.tight_layout()
plt.show()

In [None]:
# Cell 7: Train Logistic Regression Models
# We'll train multiple models with different approaches to imbalanced data

# Original data model
model_orig = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
model_orig.fit(X_train_orig, y_train_orig)

# Undersampling model
model_under = LogisticRegression(max_iter=1000, random_state=42)
model_under.fit(X_train_under, y_train_under)

# SMOTE model
model_smote = LogisticRegression(max_iter=1000, random_state=42)
model_smote.fit(X_train_smote, y_train_smote)

print("Models trained successfully!")

# Check cross-validation scores
print("\nCross-validation results (F1 Score):")
cv_scores_orig = cross_val_score(model_orig, X_train_orig, y_train_orig, cv=5, scoring='f1')
cv_scores_under = cross_val_score(model_under, X_train_under, y_train_under, cv=5, scoring='f1')
cv_scores_smote = cross_val_score(model_smote, X_train_smote, y_train_smote, cv=5, scoring='f1')

print(f"Original data model: {cv_scores_orig.mean():.4f} (±{cv_scores_orig.std():.4f})")
print(f"Undersampling model: {cv_scores_under.mean():.4f} (±{cv_scores_under.std():.4f})")
print(f"SMOTE model: {cv_scores_smote.mean():.4f} (±{cv_scores_smote.std():.4f})")

In [None]:
# Cell 8: Model Evaluation
# Make predictions
y_pred_orig = model_orig.predict(X_test)
y_pred_under = model_under.predict(X_test)
y_pred_smote = model_smote.predict(X_test)

# Get probabilities for ROC curve
y_prob_orig = model_orig.predict_proba(X_test)[:, 1]
y_prob_under = model_under.predict_proba(X_test)[:, 1]
y_prob_smote = model_smote.predict_proba(X_test)[:, 1]

# Create a function to display evaluation metrics
def evaluate_model(y_true, y_pred, y_prob, model_name):
    # Basic metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"=== {model_name} ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))
    
    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Fraud'],
                yticklabels=['Normal', 'Fraud'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()
    
    return accuracy, precision, recall, f1

# Evaluate all models
print("\nEvaluation Metrics:")
metrics_orig = evaluate_model(y_test, y_pred_orig, y_prob_orig, "Original Model (with class_weight='balanced')")
metrics_under = evaluate_model(y_test, y_pred_under, y_prob_under, "Undersampling Model")
metrics_smote = evaluate_model(y_test, y_pred_smote, y_prob_smote, "SMOTE Model")

In [None]:
# Cell 9: ROC and Precision-Recall Curves
# ROC Curve
plt.figure(figsize=(12, 5))

# ROC curve subplot
plt.subplot(1, 2, 1)
# Original model
fpr_orig, tpr_orig, _ = roc_curve(y_test, y_prob_orig)
roc_auc_orig = auc(fpr_orig, tpr_orig)
plt.plot(fpr_orig, tpr_orig, label=f'Original (AUC = {roc_auc_orig:.4f})')

# Undersampling model
fpr_under, tpr_under, _ = roc_curve(y_test, y_prob_under)
roc_auc_under = auc(fpr_under, tpr_under)
plt.plot(fpr_under, tpr_under, label=f'Undersampling (AUC = {roc_auc_under:.4f})')

# SMOTE model
fpr_smote, tpr_smote, _ = roc_curve(y_test, y_prob_smote)
roc_auc_smote = auc(fpr_smote, tpr_smote)
plt.plot(fpr_smote, tpr_smote, label=f'SMOTE (AUC = {roc_auc_smote:.4f})')

# Add diagonal line
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.grid(alpha=0.3)

# Precision-Recall curve subplot
plt.subplot(1, 2, 2)
# Original model
precision_orig, recall_orig, _ = precision_recall_curve(y_test, y_prob_orig)
pr_auc_orig = auc(recall_orig, precision_orig)
plt.plot(recall_orig, precision_orig, label=f'Original (AUC = {pr_auc_orig:.4f})')

# Undersampling model
precision_under, recall_under, _ = precision_recall_curve(y_test, y_prob_under)
pr_auc_under = auc(recall_under, precision_under)
plt.plot(recall_under, precision_under, label=f'Undersampling (AUC = {pr_auc_under:.4f})')

# SMOTE model
precision_smote, recall_smote, _ = precision_recall_curve(y_test, y_prob_smote)
pr_auc_smote = auc(recall_smote, precision_smote)
plt.plot(recall_smote, precision_smote, label=f'SMOTE (AUC = {pr_auc_smote:.4f})')

# Add baseline
plt.axhline(y=y_test.mean(), color='r', linestyle='--', 
            label=f'Baseline ({y_test.mean():.4f})')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='best')
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Cell 10: Feature Importance Analysis
# Analyze which features are most predictive of fraud

# Get feature importance from coefficients (for SMOTE model)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model_smote.coef_[0]
})
feature_importance['Abs_Coefficient'] = abs(feature_importance['Coefficient'])
feature_importance = feature_importance.sort_values('Abs_Coefficient', ascending=False)

# Display top 15 features
print("Top 15 most important features:")
display(feature_importance.head(15))

# Visualize top 10 features
plt.figure(figsize=(12, 8))
top_features = feature_importance.head(10)
sns.barplot(x='Coefficient', y='Feature', data=top_features, palette='viridis')
plt.title('Top 10 Most Important Features for Fraud Detection')
plt.xlabel('Coefficient Value (impact on log-odds)')
plt.ylabel('Feature')
plt.grid(True, axis='x')
plt.show()

# Show feature distributions for top 3 features
top3_features = feature_importance['Feature'].head(3).tolist()

plt.figure(figsize=(15, 5))
for i, feature in enumerate(top3_features):
    plt.subplot(1, 3, i+1)
    sns.kdeplot(df[df['Class'] == 0][feature], label='Normal')
    sns.kdeplot(df[df['Class'] == 1][feature], label='Fraud')
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Cell 11: Threshold Analysis
# Let's analyze how different probability thresholds affect model performance
# We'll use the SMOTE model as it had good overall performance

thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]
threshold_metrics = []

plt.figure(figsize=(15, 10))

for i, threshold in enumerate(thresholds):
    # Make predictions with the current threshold
    y_pred_threshold = (y_prob_smote >= threshold).astype(int)
    
    # Calculate metrics
    precision = precision_score(y_test, y_pred_threshold)
    recall = recall_score(y_test, y_pred_threshold)
    f1 = f1_score(y_test, y_pred_threshold)
    
    # Store metrics
    threshold_metrics.append({
        'Threshold': threshold,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })
    
    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred_threshold)
    plt.subplot(2, 3, i+1)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
               xticklabels=['Normal', 'Fraud'],
               yticklabels=['Normal', 'Fraud'])
    plt.title(f'Threshold: {threshold}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

plt.tight_layout()
plt.show()

# Display metrics table
threshold_df = pd.DataFrame(threshold_metrics)
display(threshold_df)

# Plot metrics vs threshold
plt.figure(figsize=(10, 6))
plt.plot(threshold_df['Threshold'], threshold_df['Precision'], 'bo-', label='Precision')
plt.plot(threshold_df['Threshold'], threshold_df['Recall'], 'ro-', label='Recall')
plt.plot(threshold_df['Threshold'], threshold_df['F1 Score'], 'go-', label='F1 Score')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Performance Metrics vs. Threshold')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Cell 12: Final Model Selection and Conclusion
# Compare all models

model_comparison = pd.DataFrame({
    'Model': ['Original (balanced)', 'Undersampling', 'SMOTE'],
    'Accuracy': [metrics_orig[0], metrics_under[0], metrics_smote[0]],
    'Precision': [metrics_orig[1], metrics_under[1], metrics_smote[1]],
    'Recall': [metrics_orig[2], metrics_under[2], metrics_smote[2]],
    'F1 Score': [metrics_orig[3], metrics_under[3], metrics_smote[3]],
    'ROC AUC': [roc_auc_orig, roc_auc_under, roc_auc_smote],
    'PR AUC': [pr_auc_orig, pr_auc_under, pr_auc_smote]
})

display(model_comparison)

# Visualize comparison
plt.figure(figsize=(14, 8))
metrics = ['Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC']

for i, metric in enumerate(metrics):
    plt.subplot(2, 3, i+1)
    sns.barplot(x='Model', y=metric, data=model_comparison)
    plt.title(metric)
    plt.ylim(0, 1)
    for j, val in enumerate(model_comparison[metric]):
        plt.text(j, val + 0.01, f'{val:.4f}', ha='center')

plt.tight_layout()
plt.show()

# Final conclusion
print("\n=== CONCLUSION ===")
print("Based on the evaluation metrics, here are the findings:")

# Identify best model by F1 score
best_model_idx = model_comparison['F1 Score'].idxmax()
best_model = model_comparison.loc[best_model_idx, 'Model']
best_f1 = model_comparison.loc[best_model_idx, 'F1 Score']
best_precision = model_comparison.loc[best_model_idx, 'Precision']
best_recall = model_comparison.loc[best_model_idx, 'Recall']

print(f"The best performing model is: {best_model}")
print(f"F1 Score: {best_f1:.4f}")
print(f"Precision: {best_precision:.4f} (How many predicted frauds are actually fraud)")
print(f"Recall: {best_recall:.4f} (What percentage of actual frauds were detected)")
print("\nRecommendation: For fraud detection, high recall is usually more important than high precision.")
print("This is because the cost of missing a fraudulent transaction (false negative) is typically higher")
print("than the cost of investigating a legitimate transaction (false positive).")