In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve, auc, roc_curve, roc_auc_score, f1_score, recall_score, precision_score
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
import time

# Seed NumPy's global RNG so the synthetic fallback below is reproducible.
np.random.seed(42)

# 1. DATA LOADING AND EXPLORATION
print("1. LOADING AND EXPLORING THE DATASET")
print("-" * 50)

# Try the real CSV first; if the file cannot be opened, build a small
# synthetic stand-in so the rest of the script still runs end to end.
print("Loading dataset...")
try:
    df = pd.read_csv('/content/creditcard.csv')
except OSError:
    # Only "file missing / unreadable" triggers the fallback. Parse errors,
    # KeyboardInterrupt and programming errors propagate instead of being
    # silently replaced by synthetic data.
    print("Dataset not found. Creating a synthetic dataset for demonstration.")
    n_samples = 10000
    n_features = 30

    # Standard-normal features (drawn first so the RNG sequence is stable).
    synthetic_features = np.random.randn(n_samples, n_features)

    # Imbalanced integer labels: 0.5% of the rows are marked as fraud (1).
    n_fraud = int(0.005 * n_samples)
    synthetic_labels = np.zeros(n_samples, dtype=int)
    fraud_indices = np.random.choice(n_samples, size=n_fraud, replace=False)
    synthetic_labels[fraud_indices] = 1

    # Assemble the frame: V1..Vn, then Amount, Time and the Class target.
    feature_names = [f'V{i}' for i in range(1, n_features + 1)]
    df = pd.DataFrame(synthetic_features, columns=feature_names)
    df['Amount'] = np.abs(np.random.randn(n_samples) * 500 + 100)  # Transaction amount
    df['Time'] = np.random.randint(0, 172800, size=n_samples)  # Transaction time in seconds
    df['Class'] = synthetic_labels  # Target variable

    print(f"Created synthetic dataset with {n_samples} samples and {n_fraud} fraudulent transactions.")
else:
    print("Dataset loaded successfully.")

# Display basic information about the dataset
print("\nDataset Information:")
print(f"Shape: {df.shape}")
print(f"Features: {df.columns.tolist()}")
print("\nSample data:")
print(df.head())

# Total number of missing cells across the whole frame
print("\nMissing values:")
print(df.isnull().sum().sum())

# Class balance: raw counts plus the share of rows labelled 1
print("\nClass distribution:")
print(df['Class'].value_counts())
print(f"Fraud percentage: {df['Class'].mean() * 100:.4f}%")

# 2. EXPLORATORY DATA ANALYSIS
print("\n2. EXPLORATORY DATA ANALYSIS")
print("-" * 50)

# Bar chart of how many rows fall into each class.
class_fig, class_ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Class', data=df, ax=class_ax)
class_ax.set_title('Class Distribution (0: Normal, 1: Fraud)')
class_fig.savefig('class_distribution.png')
plt.close(class_fig)
print("Class distribution plot saved.")

# Side-by-side histograms of the transaction amount for each class,
# both clipped to the 0-500 range on the x axis.
amount_fig, (normal_ax, fraud_ax) = plt.subplots(1, 2, figsize=(12, 5))

normal_amounts = df.loc[df['Class'] == 0, 'Amount']
sns.histplot(normal_amounts, bins=50, kde=True, ax=normal_ax)
normal_ax.set_title('Transaction Amount - Normal')
normal_ax.set_xlim([0, 500])

fraud_amounts = df.loc[df['Class'] == 1, 'Amount']
sns.histplot(fraud_amounts, bins=50, kde=True, color='red', ax=fraud_ax)
fraud_ax.set_title('Transaction Amount - Fraud')
fraud_ax.set_xlim([0, 500])

amount_fig.tight_layout()
amount_fig.savefig('amount_distribution.png')
plt.close(amount_fig)
print("Transaction amount distribution plot saved.")

# Heatmap of pairwise correlations; the upper triangle (diagonal included)
# is masked because it mirrors the lower one.
corr_matrix = df.corr()
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
corr_fig, corr_ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corr_matrix, mask=mask, annot=False, cmap='coolwarm', linewidths=0.5, ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
corr_fig.tight_layout()
corr_fig.savefig('correlation_matrix.png')
plt.close(corr_fig)
print("Correlation matrix plot saved.")


# 3. DATA PREPROCESSING
print("\n3. DATA PREPROCESSING")
print("-" * 50)

# Drop rows without a label first, then separate features and target once
# (stratified splitting cannot handle NaN labels).
df = df.dropna(subset=['Class'])
X = df.drop('Class', axis=1)
y = df['Class']

# Stratified 80/20 split so both parts keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Own the split frames before writing scaled columns back into them.
X_train = X_train.copy()
X_test = X_test.copy()

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# Standardise every feature except 'Time' (when that column exists).
# The scaler is fitted on the training split only and then applied to the
# test split. Writing the result back column-wise keeps each frame's row
# index, so X_train / X_test stay aligned with y_train / y_test.
scaler = StandardScaler()
features_to_scale = [col for col in X_train.columns if col != 'Time']
X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

print("Features scaled successfully.")

# Oversample the minority class with SMOTE - on the training split only, so
# the test split keeps its real class distribution.
print("\nHandling class imbalance with SMOTE...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"Original training set shape: {X_train.shape}")
print(f"Resampled training set shape: {X_train_resampled.shape}")
print(f"Class distribution after SMOTE: {pd.Series(y_train_resampled).value_counts().to_dict()}")

# 4. MODEL BUILDING AND EVALUATION
print("\n4. MODEL BUILDING AND EVALUATION")
print("-" * 50)


def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model``, score it on the test data and save its ROC / PR curves.

    The model is fitted in place on ``X_train`` / ``y_train`` (the fit is
    timed), then evaluated on ``X_test`` / ``y_test`` using both hard
    predictions and the probability of class 1 (column 1 of
    ``predict_proba``). All metrics and the confusion matrix are printed,
    and a two-panel figure is written to ``<model_name>_curves.png``
    (lower-cased, spaces replaced by underscores).

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Evaluation features.
        y_test: Evaluation labels.
        model_name: Display name used in printed output, plot titles and
            the output file name.

    Returns:
        dict with keys ``'Model'``, ``'Accuracy'``, ``'Precision'``,
        ``'Recall'``, ``'F1 Score'``, ``'AUC-ROC'``, ``'PR AUC'`` and
        ``'Training Time'`` (seconds).
    """
    print(f"\nEvaluating {model_name}...")

    # Time only the fit itself.
    fit_started = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - fit_started

    hard_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Curve coordinates are reused for both the area numbers and the plots.
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, positive_scores)

    # Key order matters: it becomes the column order of the comparison table.
    results = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, hard_labels),
        'Precision': precision_score(y_test, hard_labels),
        'Recall': recall_score(y_test, hard_labels),
        'F1 Score': f1_score(y_test, hard_labels),
        'AUC-ROC': auc(fpr, tpr),
        'PR AUC': auc(recall_curve, precision_curve),
        'Training Time': training_time,
    }

    print(f"Training time: {training_time:.2f} seconds")
    for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC', 'PR AUC'):
        print(f"{metric_name}: {results[metric_name]:.4f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, hard_labels))

    # ROC curve on top, precision-recall curve underneath.
    fig, (roc_ax, pr_ax) = plt.subplots(2, 1, figsize=(10, 8))

    roc_ax.plot(fpr, tpr, label=f"AUC = {results['AUC-ROC']:.4f}")
    roc_ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(f'ROC Curve - {model_name}')
    roc_ax.legend(loc='lower right')

    pr_ax.plot(recall_curve, precision_curve, label=f"PR AUC = {results['PR AUC']:.4f}")
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title(f'Precision-Recall Curve - {model_name}')
    pr_ax.legend(loc='lower left')

    fig.tight_layout()
    fig.savefig(model_name.lower().replace(" ", "_") + '_curves.png')
    plt.close(fig)

    print(f"{model_name} evaluation plots saved.")

    return results

# 4.1 Logistic Regression
print("\n4.1 LOGISTIC REGRESSION")
print("-" * 50)

# Initialize and evaluate Logistic Regression model
log_reg = LogisticRegression(C=1.0, class_weight='balanced', max_iter=1000, random_state=42)
log_reg_metrics = evaluate_model(log_reg, X_train_resampled, y_train_resampled, X_test, y_test, "Logistic Regression")

# 4.2 Random Forest
print("\n4.2 RANDOM FOREST")
print("-" * 50)

# Initialize and evaluate Random Forest model
rf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10,
                           class_weight='balanced', random_state=42, n_jobs=-1)
rf_metrics = evaluate_model(rf, X_train_resampled, y_train_resampled, X_test, y_test, "Random Forest")

# 4.3 XGBoost
print("\n4.3 XGBOOST")
print("-" * 50)

# scale_pos_weight should reflect the negative/positive ratio of the data the
# model is actually fitted on. That data has already been oversampled, so the
# ratio is derived from the resampled labels rather than hard-coded; a fixed
# value such as 99 would stack a second, much larger correction on top of the
# oversampling.
n_negative = int((y_train_resampled == 0).sum())
n_positive = int((y_train_resampled == 1).sum())
xgb_scale_pos_weight = n_negative / n_positive if n_positive else 1.0

# Initialize and evaluate XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5,
                              scale_pos_weight=xgb_scale_pos_weight,
                              objective='binary:logistic', random_state=42, n_jobs=-1)
xgb_metrics = evaluate_model(xgb_model, X_train_resampled, y_train_resampled, X_test, y_test, "XGBoost")

# 5. MODEL COMPARISON
print("\n5. MODEL COMPARISON")
print("-" * 50)

# One row per model, one column per metric returned by evaluate_model.
models_metrics = [log_reg_metrics, rf_metrics, xgb_metrics]
metrics_df = pd.DataFrame(models_metrics)
print("\nModel Comparison:")
print(metrics_df)

# Grouped bar chart of the score-type metrics (training time is left out
# because it is not on a 0-1 scale).
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC', 'PR AUC']
metrics_df_plot = metrics_df.set_index('Model')[metrics_to_plot]

# DataFrame.plot creates its own figure when given `figsize`, so no separate
# plt.figure() call is made; doing so would leave an empty, never-closed
# figure behind. All further styling goes through the returned Axes.
comparison_ax = metrics_df_plot.plot(kind='bar', rot=0, figsize=(15, 8))
comparison_ax.set_title('Model Performance Comparison')
comparison_ax.set_ylabel('Score')
comparison_ax.set_ylim([0, 1])
comparison_ax.legend(loc='lower right')
comparison_fig = comparison_ax.get_figure()
comparison_fig.tight_layout()
comparison_fig.savefig('model_comparison.png')
plt.close(comparison_fig)
print("Model comparison plot saved.")

# Feature Importance Analysis
print("\n6. FEATURE IMPORTANCE ANALYSIS")
print("-" * 50)


def _rank_importances(fitted_model):
    """Return a Feature/Importance table for ``fitted_model``, highest first."""
    table = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': fitted_model.feature_importances_
    })
    return table.sort_values('Importance', ascending=False)


def _save_importance_chart(ranking, title, filename):
    """Draw the 15 top rows of ``ranking`` as a horizontal bar chart and save it."""
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=ranking.head(15))
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()


# Random Forest Feature Importance
feature_importance = _rank_importances(rf)
print("Top 10 Important Features (Random Forest):")
print(feature_importance.head(10))
_save_importance_chart(feature_importance, 'Feature Importance (Random Forest)', 'rf_feature_importance.png')
print("Random Forest feature importance plot saved.")

# XGBoost Feature Importance
xgb_importance = _rank_importances(xgb_model)
print("\nTop 10 Important Features (XGBoost):")
print(xgb_importance.head(10))
_save_importance_chart(xgb_importance, 'Feature Importance (XGBoost)', 'xgb_feature_importance.png')
print("XGBoost feature importance plot saved.")

# 7. THRESHOLD OPTIMIZATION
print("\n7. THRESHOLD OPTIMIZATION")
print("-" * 50)

# Probability of class 1 from the XGBoost model for every test row.
# NOTE(review): the threshold is chosen on the same test split it is then
# reported on, so the "optimal" scores printed below are optimistic.
y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Candidate cut-offs from 0.10 upwards in steps of 0.05 (0.9 is the
# exclusive stop value of the range).
thresholds = np.arange(0.1, 0.9, 0.05)

# One hard-label vector per candidate, then the three metrics for each.
labels_per_threshold = [(y_pred_proba_xgb >= cutoff).astype(int) for cutoff in thresholds]
f1_scores = [f1_score(y_test, labels) for labels in labels_per_threshold]
precision_scores = [precision_score(y_test, labels) for labels in labels_per_threshold]
recall_scores = [recall_score(y_test, labels) for labels in labels_per_threshold]

# All three metric curves against the threshold on a single axis.
threshold_fig, threshold_ax = plt.subplots(figsize=(10, 6))
for curve_label, curve_values in (
    ('F1 Score', f1_scores),
    ('Precision', precision_scores),
    ('Recall', recall_scores),
):
    threshold_ax.plot(thresholds, curve_values, label=curve_label)
threshold_ax.set_xlabel('Threshold')
threshold_ax.set_ylabel('Score')
threshold_ax.set_title('Metrics vs. Classification Threshold')
threshold_ax.legend()
threshold_ax.grid(True)
threshold_fig.savefig('threshold_optimization.png')
plt.close(threshold_fig)
print("Threshold optimization plot saved.")

# The first threshold with the highest F1 wins (argmax keeps the earliest tie).
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold for F1 score: {optimal_threshold:.2f}")
print(f"Optimal F1 score: {f1_scores[optimal_idx]:.4f}")
print(f"Corresponding Precision: {precision_scores[optimal_idx]:.4f}")
print(f"Corresponding Recall: {recall_scores[optimal_idx]:.4f}")

# Full per-class report at the F1-optimal threshold.
y_pred_optimal = (y_pred_proba_xgb >= optimal_threshold).astype(int)
print("\nMetrics with optimal threshold:")
print(classification_report(y_test, y_pred_optimal))

# 8. COST-SENSITIVE EVALUATION
print("\n8. COST-SENSITIVE EVALUATION")
print("-" * 50)

# Define cost matrix (example values)
# False Negative (missing fraud) is much more costly than False Positive
cost_fn = 100  # Cost of missing a fraud
cost_fp = 10   # Cost of falsely flagging a normal transaction


def calculate_cost(y_true, y_pred, cost_fn=100, cost_fp=10):
    """Return the total misclassification cost of binary predictions.

    False negatives (true 1, predicted 0) and false positives (true 0,
    predicted 1) are counted directly, so the result is well defined even
    when only one class occurs in the inputs - a confusion matrix with
    inferred labels collapses to 1x1 in that case and cannot be indexed
    at ``[1][0]``.

    Args:
        y_true: Array-like of ground-truth labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1), same length.
        cost_fn: Cost charged for each false negative.
        cost_fp: Cost charged for each false positive.

    Returns:
        ``fn * cost_fn + fp * cost_fp``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {actual.size} and {predicted.size}"
        )

    fn = int(np.count_nonzero((actual == 1) & (predicted == 0)))  # False Negatives
    fp = int(np.count_nonzero((actual == 0) & (predicted == 1)))  # False Positives
    return fn * cost_fn + fp * cost_fp

# Total business cost of the XGBoost predictions at every candidate threshold.
cost_per_threshold = [
    calculate_cost(y_test, (y_pred_proba_xgb >= cutoff).astype(int), cost_fn, cost_fp)
    for cutoff in thresholds
]

# Cost curve over the threshold grid.
cost_fig, cost_ax = plt.subplots(figsize=(10, 6))
cost_ax.plot(thresholds, cost_per_threshold)
cost_ax.set_xlabel('Threshold')
cost_ax.set_ylabel('Total Cost')
cost_ax.set_title('Cost vs. Classification Threshold')
cost_ax.grid(True)
cost_fig.savefig('cost_threshold.png')
plt.close(cost_fig)
print("Cost threshold plot saved.")

# Cheapest threshold (argmin keeps the earliest one on ties).
min_cost_idx = np.argmin(cost_per_threshold)
cost_optimal_threshold = thresholds[min_cost_idx]
print(f"Cost-optimal threshold: {cost_optimal_threshold:.2f}")
print(f"Minimum cost: {cost_per_threshold[min_cost_idx]:.2f}")

# Report how the model behaves at that cost-optimal operating point.
y_pred_cost_optimal = (y_pred_proba_xgb >= cost_optimal_threshold).astype(int)
print("\nMetrics with cost-optimal threshold:")
print(classification_report(y_test, y_pred_cost_optimal))
print("Confusion Matrix with cost-optimal threshold:")
print(confusion_matrix(y_test, y_pred_cost_optimal))

# 9. FINAL MODEL SELECTION
print("\n9. FINAL MODEL SELECTION")
print("-" * 50)

# Choose the best model based on comprehensive evaluation
print("Based on various metrics and considerations, we recommend:")

# Find the model with the highest F1 score
best_model_f1 = metrics_df.loc[metrics_df['F1 Score'].idxmax()]
print(f"Best model by F1 Score: {best_model_f1['Model']} with F1 = {best_model_f1['F1 Score']:.4f}")

# Find the model with the highest PR AUC
best_model_pr_auc = metrics_df.loc[metrics_df['PR AUC'].idxmax()]
print(f"Best model by PR AUC: {best_model_pr_auc['Model']} with PR AUC = {best_model_pr_auc['PR AUC']:.4f}")

# Find the model with the best balance of metrics: min-max normalise each
# metric across the models, then average the normalised values per model.
metrics_to_normalize = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC', 'PR AUC']
normalized_df = metrics_df.copy()
for metric in metrics_to_normalize:
    max_val = normalized_df[metric].max()
    min_val = normalized_df[metric].min()
    if max_val != min_val:
        normalized_df[metric] = (normalized_df[metric] - min_val) / (max_val - min_val)
    else:
        # All models tie on this metric; give each full credit instead of
        # dividing by zero.
        normalized_df[metric] = 1.0

# Add a balanced score (mean of normalized metrics)
normalized_df['Balanced Score'] = normalized_df[metrics_to_normalize].mean(axis=1)
best_balanced_model = normalized_df.loc[normalized_df['Balanced Score'].idxmax()]
print(f"Best overall balanced model: {best_balanced_model['Model']} with Balanced Score = {best_balanced_model['Balanced Score']:.4f}")

# The earlier cost/threshold search used XGBoost probabilities only. A
# threshold is only meaningful for the model whose scores it was tuned on,
# so redo the search for whichever model is being recommended.
fitted_models = {
    'Logistic Regression': log_reg,
    'Random Forest': rf,
    'XGBoost': xgb_model,
}
recommended_name = best_balanced_model['Model']
recommended_proba = fitted_models[recommended_name].predict_proba(X_test)[:, 1]
recommended_costs = [
    calculate_cost(y_test, (recommended_proba >= cutoff).astype(int), cost_fn, cost_fp)
    for cutoff in thresholds
]
recommended_threshold = thresholds[int(np.argmin(recommended_costs))]

# Final recommendation
print("\nFINAL RECOMMENDATION:")
print(f"We recommend using {recommended_name} with a threshold of {recommended_threshold:.2f} optimized for minimizing business costs.")
print("This model provides the best balance of precision and recall while minimizing the financial impact of fraud.")

# 10. DEPLOYMENT CONSIDERATIONS
print("\n10. DEPLOYMENT CONSIDERATIONS")
print("-" * 50)

print("""
Deployment Recommendations:
1. Model Serialization: Save the final model using joblib or pickle for deployment.
2. Real-time Inferencing: Consider implementing an API for real-time fraud detection.
3. Monitoring: Establish metrics to monitor model performance in production.
4. Retraining Strategy: Set up periodic retraining to adapt to new fraud patterns.
5. Explainability: Implement SHAP or LIME for transaction-level explanations.
6. Alerting System: Set up an alerting system for high-confidence fraud predictions.
7. Fallback Mechanisms: Implement fallback rules for when the model is uncertain.
""")

# Save the final model (XGBoost in this example)
import joblib
joblib.dump(xgb_model, 'xgb_fraud_detection_model.pkl')
print("Final model saved as 'xgb_fraud_detection_model.pkl'")

# Persist the fitted scaler as well: the sample API printed below loads
# 'scaler.pkl' to preprocess requests the same way the training data was.
joblib.dump(scaler, 'scaler.pkl')
print("Fitted scaler saved as 'scaler.pkl'")

print("\nCREDIT CARD FRAUD DETECTION PROJECT COMPLETED!")
print("-" * 50)

# Sample code for model deployment (for demonstration). The snippet is only
# printed, never executed by this script.
print("""
# Sample code for model deployment (REST API using Flask)
'''
from flask import Flask, request, jsonify
import joblib
import numpy as np
import pandas as pd

app = Flask(__name__)

# Load the model
model = joblib.load('xgb_fraud_detection_model.pkl')
scaler = joblib.load('scaler.pkl')  # You would need to save this separately

@app.route('/predict', methods=['POST'])
def predict():
    # Get data from POST request
    data = request.json

    # Preprocess the data (same as in training)
    df = pd.DataFrame([data])

    # Scale the features
    features_to_scale = [col for col in df.columns if col != 'Time']
    df[features_to_scale] = scaler.transform(df[features_to_scale])

    # Make prediction
    probability = model.predict_proba(df)[0][1]
    is_fraud = probability >= 0.3  # Use the cost-optimal threshold

    # Return the result
    return jsonify({
        'probability': float(probability),
        'is_fraud': bool(is_fraud),
        'threshold': 0.3
    })

if __name__ == '__main__':
    app.run(debug=True)
'''
""")

1. LOADING AND EXPLORING THE DATASET
--------------------------------------------------
Loading dataset...
Dataset loaded successfully.

Dataset Information:
Shape: (107046, 31)
Features: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']

Sample data:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.3

<Figure size 1500x1000 with 0 Axes>