<a href="https://colab.research.google.com/github/NathanUsw/BankFraud/blob/main/BankFraudDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so later cells can read files from it.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping, Callback
import gc
from tensorflow.keras import backend as K

In [None]:
class ClearMemoryCallback(Callback):
    """Keras callback that triggers Python garbage collection after each epoch.

    Intended to keep host memory from creeping up over a long `fit()` run.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Run a garbage-collection pass once the epoch has finished.

        Args:
            epoch: Index of the epoch that just ended (unused).
            logs: Metric dictionary supplied by Keras (unused).
        """
        # Deliberately NOT calling K.clear_session() here: it resets Keras'
        # global state and is meant to be used between models, not while
        # fit() is still running on a live model and optimizer.
        gc.collect()

In [None]:
# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/ML_Projects/Combined_Dataset.csv')

# Name of the binary target column; fail early with a readable message if absent
target_col = 'fraud_bool'
if target_col not in data.columns:
    raise KeyError(
        f"Target column '{target_col}' not found. Available columns: {list(data.columns)}"
    )

# Categorical columns this notebook expects to encode
categorical_cols = ['payment_type', 'source', 'device_os', 'employment_status', 'housing_status']

# Encode the expected columns that actually exist, plus any other non-numeric
# feature column, so a schema difference does not abort the cell with a KeyError
# and no string column reaches the network.
missing_cols = [col for col in categorical_cols if col not in data.columns]
if missing_cols:
    print(f"Warning: expected categorical columns not found, skipping: {missing_cols}")

non_numeric_cols = (
    data.drop(columns=[target_col])
        .select_dtypes(exclude=['number', 'bool'])
        .columns
)
cols_to_encode = [col for col in categorical_cols if col in data.columns]
cols_to_encode += [col for col in non_numeric_cols if col not in cols_to_encode]

# One fitted encoder per column, so the integer codes can be mapped back later
label_encoders = {}
label_encoder = LabelEncoder()
for col in cols_to_encode:
    column = data[col]
    if not pd.api.types.is_numeric_dtype(column):
        # Cast to str so NaN / mixed-type values encode as an ordinary category
        column = column.astype(str)
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(column)
    label_encoders[col] = label_encoder

# Fill remaining missing values with the column mean (numeric columns only)
data = data.fillna(data.mean(numeric_only=True))

# Define the feature set (X) and the target (y)
X = data.drop(columns=[target_col])  # Features (all except the target variable)
y = data[target_col]                 # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training matrix for the autoencoder: only the non-fraud rows of the training
# split, so that fraud shows up as a high reconstruction error at prediction time.
X_train_res = X_train.loc[y_train == 0].to_numpy(dtype='float32')
if X_train_res.shape[0] == 0:
    raise ValueError("No non-fraud rows in the training split; cannot train the autoencoder.")

# NOTE(review): features are fed to the network unscaled; with an MSE loss the
# largest-magnitude columns will dominate. Consider standardising (fit on
# X_train_res only) — left unchanged here so downstream cells keep working.

# Build the autoencoder model
input_dim = X_train_res.shape[1]

autoencoder = tf.keras.models.Sequential([
    tf.keras.Input(shape=(input_dim,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),   # Bottleneck
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(input_dim, activation='linear')  # Reconstruct the input
])

# Compile the autoencoder model
autoencoder.compile(optimizer='adam', loss='mse')  # Mean Squared Error as reconstruction loss

KeyError: 'payment_type'

In [None]:
# Train the autoencoder to reconstruct its own input.
# X_train_res is expected to be defined by an earlier cell (the training feature matrix).
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
clear_memory = ClearMemoryCallback()

history = autoencoder.fit(X_train_res, X_train_res,
                          epochs=100,  # Set a high number and let early stopping decide
                          batch_size=16,
                          validation_split=0.2,
                          callbacks=[early_stopping, clear_memory],
                          verbose=1)

# Work on plain float arrays so the subtraction below is element-wise by
# position regardless of whether the inputs are DataFrames or ndarrays.
X_train_arr = np.asarray(X_train_res, dtype='float32')
X_test_arr = np.asarray(X_test, dtype='float32')

# Per-row reconstruction error (mean squared error across features)
X_train_pred = autoencoder.predict(X_train_arr, batch_size=1024)
train_reconstruction_error = np.mean(np.square(X_train_arr - X_train_pred), axis=1)

X_test_pred = autoencoder.predict(X_test_arr, batch_size=1024)
reconstruction_error = np.mean(np.square(X_test_arr - X_test_pred), axis=1)

# Set the fraud threshold from the TRAINING errors (95th percentile).
# Deriving it from the test errors would leak test data into the decision rule
# and would flag exactly 5% of test rows no matter what the model learned.
threshold = np.percentile(train_reconstruction_error, 95)

# Predict fraud if reconstruction error is above threshold
y_pred = (reconstruction_error > threshold).astype(int)

# Evaluate the model
print(f'Threshold for Fraud Detection: {threshold}')
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))


In [None]:
# Confusion matrix. Labels are pinned so the result is always 2x2, even when
# one class is absent from both y_test and y_pred (otherwise the [1, 1]
# lookups below would raise IndexError).
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Plot confusion matrix
class_names = ['Non-Fraud', 'Fraud']
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix for Fraud Detection')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Unpack the confusion-matrix cells (rows = actual, columns = predicted)
true_negatives, false_positives = conf_matrix[0, 0], conf_matrix[0, 1]
false_negatives, true_positives = conf_matrix[1, 0], conf_matrix[1, 1]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


# Fraud detection rate (Precision, Recall, and F1-score)
precision = _safe_ratio(true_positives, true_positives + false_positives)
recall = _safe_ratio(true_positives, true_positives + false_negatives)
f1_score = _safe_ratio(2 * precision * recall, precision + recall)

# Plot Fraud Detection Rates (Precision, Recall, F1)
metrics = ['Precision', 'Recall', 'F1-Score']
values = [precision, recall, f1_score]

plt.figure(figsize=(8, 6))
# Plain matplotlib bars coloured from the seaborn palette: avoids the seaborn
# deprecation of passing `palette` to barplot without a `hue` variable.
plt.bar(metrics, values, color=sns.color_palette('coolwarm', n_colors=len(metrics)))
plt.title('Fraud Detection Performance Metrics')
plt.ylim(0, 1.1)  # headroom so the label of a bar at 1.0 stays inside the axes
for i, v in enumerate(values):
    plt.text(i, v + 0.02, f"{v:.2f}", ha='center', color='black', fontsize=12)
plt.ylabel('Rate')
plt.xlabel('Metric')
plt.show()

# Plot reconstruction error distribution
plt.figure(figsize=(10, 6))
sns.histplot(reconstruction_error, bins=50, kde=True, color='blue')
plt.axvline(threshold, color='red', linestyle='--', label=f'Threshold = {threshold:.4f}')
plt.title('Reconstruction Error Distribution')
plt.xlabel('Reconstruction Error')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Plot training and validation loss over epochs
train_loss = history.history['loss']
val_loss = history.history['val_loss']

plt.figure(figsize=(8, 6))
sns.lineplot(x=range(1, len(train_loss) + 1), y=train_loss, label='Training Loss')
sns.lineplot(x=range(1, len(val_loss) + 1), y=val_loss, label='Validation Loss', color='red')
plt.title('Autoencoder Training Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
