# Credit Card Fraud Detection
## Complete Analysis and Model Training

This notebook provides a comprehensive analysis of credit card fraud detection using machine learning.

### Table of Contents:
1. [Setup & Dependencies](#setup)
2. [Data Extraction](#extraction)
3. [Exploratory Data Analysis](#eda)
4. [Model Training](#training)

## 1. Setup & Dependencies Installation <a id='setup'></a>

Installing all required libraries for data analysis and machine learning.

In [None]:
# Install required packages
!pip install -q --upgrade pip
!pip install -q pandas numpy matplotlib seaborn scikit-learn imbalanced-learn xgboost joblib

In [None]:
# Import all necessary libraries
import os
import warnings
import zipfile

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

# Machine Learning Libraries
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    fbeta_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Notebook-wide display settings
warnings.simplefilter('ignore')  # every warning category is silenced from here on
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('All libraries imported successfully!')

## 2. Data Extraction <a id='extraction'></a>

Download and extract the credit card fraud dataset.

In [None]:
# Create data directory if it doesn't exist
data_dir = '../data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    print(f'Created directory: {data_dir}')

# Download the dataset
dataset_url = 'https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud'
!curl -L -o {data_dir}/creditcardfraud.zip {dataset_url}

In [None]:
# Unpack the downloaded archive into the data directory
zip_path = f'{data_dir}/creditcardfraud.zip'
extract_path = data_dir

if not os.path.exists(zip_path):
    # Nothing to unpack; only report it, later cells are not stopped here
    print('Error: Zip file not found!')
else:
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=extract_path)
    print('Dataset extracted successfully!')

    # List what the data directory contains after extraction
    print(f'Files in {data_dir}:')
    for entry in os.listdir(data_dir):
        print(f'  - {entry}')

In [None]:
# Read the extracted CSV into the main dataframe and preview the first rows
csv_path = f'{data_dir}/creditcard.csv'
df = pd.read_csv(csv_path)

print('Dataset loaded successfully!')
print(f'Dataset shape: {df.shape}')
df.head(5)

### 2.1 Data Splitting for Simulation and Training <a id='splitting'></a>

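Splitting the dataset into Training, Testing, and Simulation sets.
Two simulation sets are held out first for fog node simulation; each contains 20 fraudulent and 2,000 normal transactions. The remaining rows are then split 80/20 into stratified training and testing sets.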

In [None]:
# Hold out two simulation sets, then split the remaining rows into stratified
# train / test sets. Every subset is written to data_dir as a CSV file.
#
# NOTE: the last statement rebinds `df` to the train+test rows, so running this
# cell a second time without re-running the load cell carves the simulation
# nodes out of the already-reduced data.
N_SIM_FRAUD = 20        # fraud rows reserved per simulation node
N_SIM_NORMAL = 2000     # normal rows reserved per simulation node
SIM_NODE_SEEDS = [(1, 42), (2, 43)]  # (node number, random_state)
SPLIT_SEED = 42         # random_state for the final shuffle and the split
TEST_FRACTION = 0.2     # share of the remaining rows used for testing


def carve_simulation_node(fraud_pool, normal_pool, seed):
    """Sample one simulation node and remove its rows from both pools.

    Parameters
    ----------
    fraud_pool, normal_pool : pd.DataFrame
        Rows still available for sampling, one frame per class.
    seed : int
        random_state used for both samples and for shuffling the node.

    Returns
    -------
    tuple
        (node, fraud_pool, normal_pool): the shuffled node with a fresh
        0..n-1 index, and the two pools with the sampled rows dropped by
        index label.
    """
    picked_fraud = fraud_pool.sample(n=N_SIM_FRAUD, random_state=seed)
    picked_normal = normal_pool.sample(n=N_SIM_NORMAL, random_state=seed)
    node = (
        pd.concat([picked_fraud, picked_normal])
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )
    return node, fraud_pool.drop(picked_fraud.index), normal_pool.drop(picked_normal.index)


# Row count before anything is removed, used for the sanity check below
n_rows_before_split = len(df)

# Separate fraud and normal transactions
fraud_df = df[df['Class'] == 1].copy()
normal_df = df[df['Class'] == 0].copy()

print(f'Total Fraud Cases: {len(fraud_df)}')
print(f'Total Normal Cases: {len(normal_df)}')
print("=" * 50)

# --- Simulation nodes ---
# Each node is sampled from what the previous node left in the pools, so the
# nodes never share a row with each other or with the train/test data.
simulation_nodes = {}
for node_id, node_seed in SIM_NODE_SEEDS:
    node, fraud_df, normal_df = carve_simulation_node(fraud_df, normal_df, node_seed)
    simulation_nodes[node_id] = node

    node_fraud = node['Class'].sum()
    print(f"Node {node_id} Fraud: {node_fraud}")
    print(f"Node {node_id} Normal: {len(node) - node_fraud}")

    node.to_csv(f'{data_dir}/simulation_node_{node_id}.csv', index=False)
    print(f'Simulation Node {node_id} saved: {node.shape}')
    print("=" * 50)

# Keep the per-node frames available under their individual names
simulation_node_1 = simulation_nodes[1]
simulation_node_2 = simulation_nodes[2]

# --- Train/Test Split ---
remaining_df = (
    pd.concat([fraud_df, normal_df])
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)
train_df, test_df = train_test_split(
    remaining_df,
    test_size=TEST_FRACTION,
    stratify=remaining_df['Class'],
    random_state=SPLIT_SEED,
)

# Sanity check: the four subsets together must account for every input row
n_rows_after_split = (
    sum(len(node) for node in simulation_nodes.values()) + len(train_df) + len(test_df)
)
assert n_rows_after_split == n_rows_before_split, 'split lost or duplicated rows'

# Print train distributions
print(f"Train Fraud: {train_df['Class'].sum()}")
print(f"Train Normal: {len(train_df) - train_df['Class'].sum()}")

# Save Train file
train_df.to_csv(f'{data_dir}/train_data.csv', index=False)
print(f'Train Data saved: {train_df.shape}')
print("=" * 50)

# Print test distributions
print(f"Test Fraud: {test_df['Class'].sum()}")
print(f"Test Normal: {len(test_df) - test_df['Class'].sum()}")

# Save Test file
test_df.to_csv(f'{data_dir}/test_data.csv', index=False)
print(f'Test Data saved: {test_df.shape}')
print("=" * 50)

# Updated df for EDA: the simulation rows are excluded from here on
df = remaining_df
print(f'Updated main dataframe shape (Train + Test): {df.shape}')

## 3. Exploratory Data Analysis (EDA) <a id='eda'></a>

Comprehensive analysis of the dataset to understand patterns and characteristics.

### 3.1 Basic Dataset Information

In [None]:
# High-level facts about the working dataframe
n_rows, n_cols = len(df), len(df.columns)
memory_mb = df.memory_usage(deep=True).sum() / 1024**2  # bytes -> MB

print('Dataset Information:')
print('=' * 50)
print(f'Number of transactions: {n_rows:,}')
print(f'Number of features: {n_cols}')
print(f'\nColumn names:\n{df.columns.tolist()}')
print(f'\nData types:\n{df.dtypes}')
print(f'\nMemory usage: {memory_mb:.2f} MB')

In [None]:
# Data-quality pass: nulls per column, duplicated rows, labels present
nulls_per_column = df.isnull().sum()
n_duplicates = df.duplicated().sum()
labels_present = df['Class'].unique()

print('Data Quality Check:')
print('=' * 50)
print(f'Missing values per column:\n{nulls_per_column}')
print(f'\nTotal missing values: {nulls_per_column.sum()}')
print(f'\nDuplicate rows: {n_duplicates}')
print(f'\nUnique classes: {labels_present}')

In [None]:
# Descriptive statistics of every column, rendered as the cell output
print('Statistical Summary:', '=' * 50, sep='\n')
df.describe()

### 3.2 Class Distribution Analysis

In [None]:
# Absolute and relative frequency of each class label
fraud_count = df['Class'].value_counts()
fraud_percentage = df['Class'].value_counts(normalize=True) * 100

# Both series are indexed by the label value (0 = normal, 1 = fraud)
n_normal, n_fraud = fraud_count[0], fraud_count[1]
pct_normal, pct_fraud = fraud_percentage[0], fraud_percentage[1]

print('Class Distribution:')
print('=' * 50)
print(f'Normal transactions (0): {n_normal:,} ({pct_normal:.4f}%)')
print(f'Fraudulent transactions (1): {n_fraud:,} ({pct_fraud:.4f}%)')
print(f'\nFraud rate: {pct_fraud:.4f}%')
print(f'Imbalance ratio: 1:{n_normal / n_fraud:.0f}')

In [None]:
# Two views of the class balance: per-class counts and a pie chart
class_colors = ['#3498db', '#e74c3c']
title_style = {'fontsize': 14, 'fontweight': 'bold'}

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: number of rows per class
sns.countplot(data=df, x='Class', palette=class_colors, ax=ax1)
ax1.set_title('Fraud vs Normal Transaction Count', **title_style)
ax1.set_xlabel('Class (0 = Normal, 1 = Fraud)', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax1.set_yscale('log')  # log scale so the fraud bar is easier to see

# Right panel: share of each class, with the second wedge pulled out
ax2.pie(
    fraud_count,
    explode=(0, 0.1),
    labels=['Normal', 'Fraud'],
    colors=class_colors,
    autopct='%1.4f%%',
    shadow=True,
    startangle=90,
)
ax2.set_title('Class Distribution', **title_style)

plt.tight_layout()
plt.show()

### 3.3 Feature Analysis

In [None]:
# Transaction time: overall histogram (top) and per-class overlay (bottom)
heading_style = {'fontsize': 14, 'fontweight': 'bold'}
fig, axes = plt.subplots(2, 1, figsize=(14, 8))
overall_ax, by_class_ax = axes

# Top panel: all transactions together
overall_ax.hist(df['Time'], bins=50, color='#3498db', edgecolor='black')
overall_ax.set_title('Distribution of Transaction Times', **heading_style)
overall_ax.set_xlabel('Time (seconds)', fontsize=12)
overall_ax.set_ylabel('Frequency', fontsize=12)

# Bottom panel: one semi-transparent histogram per class on the same axes
for class_value, class_label, class_color in (
    (0, 'Normal', '#3498db'),
    (1, 'Fraud', '#e74c3c'),
):
    class_times = df.loc[df['Class'] == class_value, 'Time']
    class_times.hist(bins=50, alpha=0.5, label=class_label, ax=by_class_ax, color=class_color)
by_class_ax.set_title('Transaction Time Distribution by Class', **heading_style)
by_class_ax.set_xlabel('Time (seconds)', fontsize=12)
by_class_ax.set_ylabel('Frequency', fontsize=12)
by_class_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Amount feature analysis: summary statistics per class, then a box plot and
# an overlaid histogram of the transaction amounts.
print('Amount Statistics by Class:')
print('=' * 50)
print('Normal transactions:')
print(df[df['Class'] == 0]['Amount'].describe())
print('\nFraudulent transactions:')
print(df[df['Class'] == 1]['Amount'].describe())

# Visualize amount distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot
sns.boxplot(x='Class', y='Amount', data=df, ax=axes[0], palette=['#3498db', '#e74c3c'])
axes[0].set_title('Amount Distribution by Class', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Class (0 = Normal, 1 = Fraud)', fontsize=12)
axes[0].set_ylabel('Transaction Amount', fontsize=12)

# Histogram
# Both classes use the same explicit bin edges over the displayed window.
# Letting each call pick 50 bins over its own full range of amounts gives the
# two histograms different bin widths, and only the few bins that fall inside
# the clipped x-range would be visible. Amounts above the window are left out
# of this panel; the box plot on the left still shows them.
AMOUNT_VIEW_MAX = 500
amount_bins = np.linspace(0, AMOUNT_VIEW_MAX, 51)  # 50 equal-width bins
for class_value, class_label, class_color in [
    (0, 'Normal', '#3498db'),
    (1, 'Fraud', '#e74c3c'),
]:
    df.loc[df['Class'] == class_value, 'Amount'].hist(
        bins=amount_bins, alpha=0.5, label=class_label, ax=axes[1], color=class_color
    )
axes[1].set_title('Amount Distribution Comparison', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Amount', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].set_xlim([0, AMOUNT_VIEW_MAX])  # Limit for better visibility
axes[1].legend()

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations for a subset of columns: V1..V10, Amount and Class
corr_features = [f'V{i}' for i in range(1, 11)] + ['Amount', 'Class']
correlation_matrix = df[corr_features].corr()

# Annotated heatmap, colour scale centred on zero
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    center=0,
    annot=True,
    fmt='.2f',
    square=True,
    linewidths=1,
)
plt.title('Correlation Matrix (Selected Features)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 4. Model Training <a id='training'></a>

Training and evaluating machine learning models for fraud detection.

### 4.1 Data Preparation

In [None]:
# Read the train / test files back from disk
train_df = pd.read_csv(f'{data_dir}/train_data.csv')
test_df = pd.read_csv(f'{data_dir}/test_data.csv')

# 'Class' is the target; every other column is used as a feature
X_train, y_train = train_df.drop(columns='Class'), train_df['Class']
X_test, y_test = test_df.drop(columns='Class'), test_df['Class']

print('Data Preparation (Loaded from Split Files):')
print('=' * 50)
print(f'Training set: {len(X_train):,} samples')
print(f'Testing set: {len(X_test):,} samples')
print('\nClass distribution in training set:')
print(y_train.value_counts())
print('\nClass distribution in testing set:')
print(y_test.value_counts())

In [None]:
# Standardise the features: the scaler is fitted on the training features only
# and the same fitted scaler is then applied to both sets
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('Features scaled successfully using StandardScaler')

### 4.2 Handling Class Imbalance with SMOTE

In [None]:
# Oversample the training data with SMOTE (the test data is left untouched)
print('Applying SMOTE...')
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# Report the per-class counts of the resampled training labels
balanced_counts = pd.Series(y_train_balanced).value_counts()
n_balanced = len(X_train_balanced)

print('\nClass distribution after SMOTE:')
print('=' * 50)
print(balanced_counts)
print(f'\nBalanced training set size: {n_balanced:,}')

### 4.3 Model Training

In [None]:
# Fit a logistic regression on the SMOTE-balanced training data
print('Training Logistic Regression...')
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_balanced, y_train_balanced
)
print('Logistic Regression trained')

In [None]:
# Fit a 100-tree random forest on the SMOTE-balanced training data
print('Training Random Forest...')
rf_model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,  # n_jobs=-1: use all available processors
    random_state=42,
).fit(X_train_balanced, y_train_balanced)
print('Random Forest trained')

In [None]:
# Train XGBoost on the SMOTE-balanced training data.
# `use_label_encoder` is intentionally not passed: the option is deprecated and
# current XGBoost releases no longer use it (they only warn about it). The
# labels here are already 0/1 integers.
print('Training XGBoost...')
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(X_train_balanced, y_train_balanced)
print('XGBoost trained')

### 4.4 Model Evaluation

In [None]:
# Function to evaluate model performance
def evaluate_model(model, model_name, X_test, y_test):
    """Score a fitted binary classifier on held-out data and print a report.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    model_name : str
        Label used in the printed header.
    X_test : array-like
        Features to predict on, prepared the same way as the training data.
    y_test : array-like
        True labels, 0 = normal and 1 = fraud.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1``, ``f2``,
        ``roc_auc``, ``cm`` (2x2 confusion matrix, rows = true class),
        ``y_pred`` and ``y_pred_proba``. ``roc_auc`` is NaN when ``y_test``
        contains a single class, because the score is undefined in that case.
    """
    # Fixing the label set keeps the confusion matrix 2x2 and the report
    # aligned with the class names even when a class is missing from the data.
    class_labels = [0, 1]
    class_names = ['Normal', 'Fraud']

    print(f'\n{model_name} Performance:')
    print('=' * 60)

    # Make predictions
    y_pred = model.predict(X_test)
    # Column 1 is taken as the fraud probability; this assumes the model's
    # classes are ordered [0, 1] - TODO confirm for any new model type.
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    # zero_division=0: a metric with an empty denominator (e.g. no predicted
    # fraud) is reported as 0 without relying on a warning being raised.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    f2 = fbeta_score(y_test, y_pred, beta=2, zero_division=0)  # F2 Score favors recall
    if len(np.unique(y_test)) > 1:
        roc_auc = roc_auc_score(y_test, y_pred_proba)
    else:
        # roc_auc_score raises when only one class is present in y_test
        roc_auc = float('nan')

    # Print metrics
    print(f'Accuracy:  {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall:    {recall:.4f}')
    print(f'F1-Score:  {f1:.4f}')
    print(f'F2-Score:  {f2:.4f}')
    print(f'ROC-AUC:   {roc_auc:.4f}')

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    print('\nConfusion Matrix:')
    print(cm)

    # Classification Report
    print('\nClassification Report:')
    print(classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=class_names,
        zero_division=0,
    ))

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall,
            'f1': f1, 'f2': f2, 'roc_auc': roc_auc, 'cm': cm,
            'y_pred': y_pred, 'y_pred_proba': y_pred_proba}

In [None]:
# Score each fitted model on the scaled test set, in a fixed order
lr_results, rf_results, xgb_results = [
    evaluate_model(fitted_model, display_name, X_test_scaled, y_test)
    for fitted_model, display_name in (
        (lr_model, 'Logistic Regression'),
        (rf_model, 'Random Forest'),
        (xgb_model, 'XGBoost'),
    )
]

In [None]:
# Compare models: one table row per model, a grouped bar chart of the metrics
# and the ROC curves of all models on shared axes.
model_results = [
    ('Logistic Regression', lr_results),
    ('Random Forest', rf_results),
    ('XGBoost', xgb_results),
]
# Table column name -> key in the dict returned by evaluate_model
metric_keys = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1',
    'F2-Score': 'f2',
    'ROC-AUC': 'roc_auc',
}

comparison_df = pd.DataFrame({
    'Model': [name for name, _ in model_results],
    **{
        column: [results[key] for _, results in model_results]
        for column, key in metric_keys.items()
    },
})

print('\nModel Comparison:')
print('=' * 80)
print(comparison_df.to_string(index=False))

# Plot comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Bar plot for all metrics
comparison_df.set_index('Model')[list(metric_keys)].plot(
    kind='bar', ax=axes[0], rot=45
)
axes[0].set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Score', fontsize=12)
axes[0].legend(loc='lower right')
# Show the whole score range: with a window starting at 0.8, any metric below
# 0.8 (for example a low precision) would be drawn outside the visible area.
axes[0].set_ylim([0.0, 1.05])

# ROC Curves
for name, results in model_results:
    fpr, tpr, _ = roc_curve(y_test, results['y_pred_proba'])
    axes[1].plot(fpr, tpr, label=f'{name} (AUC = {results["roc_auc"]:.4f})')

# Diagonal = reference line of a classifier that guesses at random
axes[1].plot([0, 1], [0, 1], 'k--', label='Random Classifier')
axes[1].set_xlabel('False Positive Rate', fontsize=12)
axes[1].set_ylabel('True Positive Rate', fontsize=12)
axes[1].set_title('ROC Curves', fontsize=14, fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Plot confusion matrices, one panel per model.
# The ticks carry the class names so each panel can be read on its own;
# label 0 is 'Normal' and label 1 is 'Fraud', rows are the true class.
class_names = ['Normal', 'Fraud']
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (name, results) in zip(axes, [('Logistic Regression', lr_results),
                                      ('Random Forest', rf_results),
                                      ('XGBoost', xgb_results)]):
    sns.heatmap(
        results['cm'],
        annot=True,
        fmt='d',  # cells are integer counts
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title(f'{name}\nConfusion Matrix', fontsize=12, fontweight='bold')
    ax.set_ylabel('True Label', fontsize=10)
    ax.set_xlabel('Predicted Label', fontsize=10)

plt.tight_layout()
plt.show()

### 4.5 Save Best Model

In [None]:
# Determine the best model by F2-Score and save it together with the scaler.
# F2 weights recall higher than precision, which is crucial for fraud detection.
# NOTE(review): the scores in comparison_df come from the test set, so the
# winner is selected on the same data its scores are reported on; those scores
# are therefore optimistic estimates.
best_model_idx = comparison_df['F2-Score'].idxmax()
best_model_name = comparison_df.loc[best_model_idx, 'Model']

# Look the estimator up by its display name instead of by row position, so the
# choice stays correct if comparison_df is ever sorted, filtered or re-indexed.
trained_models = {
    'Logistic Regression': lr_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
}
best_model = trained_models[best_model_name]

print(f'Best Model: {best_model_name}')
print(f'F2-Score: {comparison_df.loc[best_model_idx, "F2-Score"]:.4f}')
print(f'Recall: {comparison_df.loc[best_model_idx, "Recall"]:.4f}')
print(f'F1-Score: {comparison_df.loc[best_model_idx, "F1-Score"]:.4f}')

# Save the best model and scaler
model_dir = '../models'
if not os.path.isdir(model_dir):
    os.makedirs(model_dir, exist_ok=True)
    print(f'Created directory: {model_dir}')

# From here on best_model_name holds the file-name form, e.g. 'random_forest'
best_model_name = best_model_name.replace(' ', '_').lower()
model_path = f'{model_dir}/best_model_{best_model_name}.pkl'
scaler_path = f'{model_dir}/scaler.pkl'

# The scaler is stored alongside the model: the model was trained on scaled
# features, so new data has to go through the same fitted scaler first.
joblib.dump(best_model, model_path)
joblib.dump(scaler, scaler_path)

print(f'\nBest model saved as: {model_path}')
print(f'Scaler saved as: {scaler_path}')