# Autoencoder (AE) Implementation with Nature-Inspired Optimization
### MSC/DSA/134

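This notebook implements an autoencoder (AE) for fraud detection and tunes its architecture with a nature-inspired optimizer.

**Optimization goal:** find the architecture (number of encoder layers, number of decoder layers, latent size, units per layer, dropout rate) that maximizes the F1 score, which is used here as the measure of anomaly-detection performance.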

In [1]:
# import libraries and dependencies

from globals.pandas_functions import *
import globals.hyperparameter_optimizer as hyp_optimizer
import globals.torch_gpu_processing as torch_gpu_processing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import globals.ae_runner as ae_runner

In [2]:
# Load the feature matrices (the `*_scaled` files) and their labels, one split at a time.
data_base_path = "data/processed/null_value_option_1_with_validation_set/scaled_only"

# training split
X_train = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_x_train_scaled.csv")
y_train = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_y_train.csv")

# validation split
X_validation = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_x_validation_scaled.csv")
y_validation = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_y_validation.csv")

# test split
X_test = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_x_test_scaled.csv")
y_test = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_y_test.csv")

# Report the shape of each feature matrix.
for label, frame in (
    ("X_train:", X_train),
    ("X_validation:", X_validation),
    ("X_test:", X_test),
):
    print(label, frame.shape)

X_train: (354305, 26)
X_validation: (118102, 26)
X_test: (118102, 26)


In [4]:
# Check that the DirectML device is usable before training. The recorded
# output shows the helper printing the device name and a test-operation
# result; the cell displays the value it returns.
# NOTE(review): the helper lives in globals.torch_gpu_processing and is not
# visible here — confirm what a falsy return value means.
torch_gpu_processing.test_direct_ml_processing()

DirectML device: privateuseone:0
Test operation successful: [2. 4.]


True

In [3]:
# get a sample for optimization (Non-Fraud Only!)
sample_size = 50000  # upper bound on the number of non-fraud rows in the optimization sample
seed = 42  # passed as random_state to the sampling split so the sample is repeatable

def get_clean_sample(X, y, sample_size, random_state=42):
    """Return a sample made up only of rows labelled 0 (non-fraud).

    Args:
        X: Feature array, indexed row-wise by a boolean mask.
        y: Label array aligned with ``X``; rows where the flattened label
            equals 0 are kept.
        sample_size: Number of clean rows to return.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        A ``(X_sample, y_sample)`` pair. When ``sample_size`` is at least the
        number of clean rows, every clean row is returned in its original
        order; otherwise ``sample_size`` rows are drawn with
        ``train_test_split``.
    """
    non_fraud = y.ravel() == 0
    X_clean, y_clean = X[non_fraud], y[non_fraud]

    # Nothing to subsample when the request covers every clean row.
    if len(X_clean) <= sample_size:
        return X_clean, y_clean

    # train_test_split returns [X_kept, X_rest, y_kept, y_rest].
    parts = train_test_split(
        X_clean, y_clean, train_size=sample_size, random_state=random_state
    )
    return parts[0], parts[2]

# Draw the non-fraud optimization sample from the training split.
X_train_sample, y_train_sample = get_clean_sample(
    X=X_train.to_numpy(),
    y=y_train.to_numpy().ravel(),
    sample_size=sample_size,
    random_state=seed,
)

print(f"Optimization Sample Size (Non-Fraud Only): {len(X_train_sample)}")

Optimization Sample Size (Non-Fraud Only): 50000


In [5]:
# Settings for the hyperparameter search and for model training.
param_optimizer_algorithm = "PSO" # one of: FA, PSO, GWO
population = 15  # forwarded as `population` to ae_runner.run_optimization
iterations = 5  # forwarded as `iterations` to ae_runner.run_optimization
epochs_for_evaluation = 10  # NOTE(review): presumably the training epochs per candidate evaluation — confirm
batch_size = 1024  # mini-batch size; passed to the final training call
early_stopping = 3  # NOTE(review): not referenced by any other cell visible here; presumably a patience value — confirm

In [5]:
# Search for the best autoencoder architecture on the non-fraud training
# sample, scoring candidates against the validation split.
# Batch size and epoch budget are taken from the settings cell rather than
# repeated as literals, so changing the settings actually changes the search.
best_hp = ae_runner.run_optimization(
    X_train_sample,
    y_train_sample,
    X_validation,
    y_validation,
    algorithm=param_optimizer_algorithm,
    population=population,
    iterations=iterations,
    batch_size=batch_size,
    epochs=epochs_for_evaluation
)

Starting AE Optimization using PSO...
Settings: Pop=15, Iter=5, Batch=1024, Epochs=10
Optimizer using DEVICE: privateuseone:0
Stratified downsampling validation set to 5000 to preserve class distribution.
.......... [AUPRC: 0.0628 | F1: 0.1094 | ROC: 0.6248].......... [AUPRC: 0.0593 | F1: 0.1157 | ROC: 0.6497].......... [AUPRC: 0.0531 | F1: 0.0964 | ROC: 0.6168].......... [AUPRC: 0.0565 | F1: 0.0970 | ROC: 0.6391]........ [AUPRC: 0.0675 | F1: 0.1226 | ROC: 0.6859].......... [AUPRC: 0.0699 | F1: 0.1276 | ROC: 0.6940].......... [AUPRC: 0.0619 | F1: 0.1154 | ROC: 0.6422].......... [AUPRC: 0.0589 | F1: 0.1143 | ROC: 0.6383]......... [AUPRC: 0.0485 | F1: 0.0913 | ROC: 0.5893].......... [AUPRC: 0.0764 | F1: 0.1158 | ROC: 0.6748].......... [AUPRC: 0.0485 | F1: 0.1010 | ROC: 0.5640].......... [AUPRC: 0.0613 | F1: 0.1050 | ROC: 0.6376].......... [AUPRC: 0.0558 | F1: 0.1004 | ROC: 0.5747].......... [AUPRC: 0.0736 | F1: 0.1195 | ROC: 0.6905].......... [AUPRC: 0.0523 | F1: 0.1060 | ROC: 0.6019]...

In [6]:
# Display the architecture selected by the search (heading, then the dict).
print("Best Hyperparameters Found:", best_hp, sep="\n")

Best Hyperparameters Found:
{'n_encoder_layers': 1, 'n_decoder_layers': 3, 'latent_size': 8, 'encoder_units': [480], 'encoder_activations': ['relu'], 'decoder_units': [192, 288, 464], 'decoder_activations': ['relu', 'selu', 'elu'], 'dropout_rate': 0.05165571983155763, 'batch_norm': True}


In [3]:
# Temporary shortcut: hard-code the hyperparameters reported by the search so
# the final training can run without repeating the optimization.
best_hp = {
    'n_encoder_layers': 1,
    'n_decoder_layers': 3,
    'latent_size': 8,
    'encoder_units': [480],
    'encoder_activations': ['relu'],
    'decoder_units': [192, 288, 464],
    'decoder_activations': ['relu', 'selu', 'elu'],
    'dropout_rate': 0.05165571983155763,
    'batch_norm': True,
}

In [6]:
# ==========================================
# FINAL MODEL TRAINING
# ==========================================
# Train the selected architecture on the full training split, then report the
# metrics returned for the test split.

print("Using validation set for early stopping, test set for final evaluation only.")
model, metrics = torch_gpu_processing.train_final_ae_model(
    best_hp,
    X_train.to_numpy(),
    y_train.to_numpy(),
    X_validation.to_numpy(),  # Validation set for early stopping
    y_validation.to_numpy(),
    X_test.to_numpy(),  # Test set for final evaluation only
    y_test.to_numpy(),
    batch_size=batch_size,
    max_epochs=100  # Increased for better convergence
)


def _format_metric(value, spec):
    """Format a metric with `spec`, or return 'N/A' when it is missing.

    A float format spec such as '.4f' cannot be applied to the string 'N/A'
    (ValueError) or to None (TypeError), so the fallback is chosen before
    formatting instead of being passed through the format spec.
    """
    return "N/A" if value is None else format(value, spec)


print("\nFinal Test Set Metrics:")
print("=" * 60)
print("Primary Metrics (Optimal Threshold - matches optimization):")
print(f"  Optimal F1:       {_format_metric(metrics.get('optimal_f1'), '.4f')}")
print(f"  Optimal Precision: {_format_metric(metrics.get('optimal_precision'), '.4f')}")
print(f"  Optimal Recall:    {_format_metric(metrics.get('optimal_recall'), '.4f')}")
print(f"  Optimal Threshold: {_format_metric(metrics.get('optimal_threshold'), '.6f')}")

# The ranking metrics are optional: print them only when they were computed.
roc_auc = metrics.get('optimal_roc_auc')
if roc_auc is not None:
    print(f"  Optimal ROC AUC:  {roc_auc:.4f}")
auprc = metrics.get('optimal_auprc')
if auprc is not None:
    print(f"  Optimal AUPRC:    {auprc:.4f}")

Using validation set for early stopping, test set for final evaluation only.
FINAL AE TRAINING (Max Epochs: 100)
Training AE on privateuseone:0 (Noise: 0.1)...
Epoch 1/100: Train Loss=0.107284, Val Loss=0.035825
Epoch 6/100: Train Loss=0.030193, Val Loss=0.021271
Epoch 11/100: Train Loss=0.027511, Val Loss=0.030900
Epoch 16/100: Train Loss=0.024267, Val Loss=0.042683
Early stopping triggered.

Evaluating AE Classification Performance on test set...

Results @ Optimal Threshold (0.005103):
  F1 Score:  0.1050
  Precision: 0.0588
  Recall:    0.4887

Final Test Set Metrics:
Primary Metrics (Optimal Threshold - matches optimization):
  Optimal F1:       0.1050
  Optimal Precision: 0.0588
  Optimal Recall:    0.4887
  Optimal Threshold: 0.005103
  Optimal ROC AUC:  0.6497
  Optimal AUPRC:    0.0554
