# 03 – Modeling

Train various anomaly detection models and compare their performance.

## 3.1 – Imports and Constants

In [10]:
import numpy as np
import pandas as pd

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve

import joblib

# For the autoencoder
import tensorflow as tf
from tensorflow.keras import layers, models

# For reproducibility
RANDOM_SEED = 31
MODEL_DIR = 'sample/models/'
DATA_PATH = 'sample/processed_pca.pkl'

## 3.2 – Load Preprocessed Data

In [13]:
# This should load a tuple: (X_train, X_test, y_train, y_test)
X_train, X_test, y_train, y_test = joblib.load(DATA_PATH)

print(f"X_train shape: {X_train.shape}")
print(f"y_train distribution:\n{pd.Series(y_train).value_counts()}\n")

X_train shape: (2793, 16)
y_train distribution:
Class
0    2399
1     394
Name: count, dtype: int64



## 3.3 – Preparation: Filter “Normal” Transactions

In [15]:
# Both IsolationForest and OneClassSVM are unsupervised, so we train them only on “normal” (non-fraud) samples.
# In this dataset, class = 0 indicates normal transactions; class = 1 indicates fraud.

mask_normal = (y_train == 0)
X_train_normal = X_train[mask_normal]

print(f"Number of “normal” samples for training: {X_train_normal.shape[0]}")

Number of “normal” samples for training: 2399


## 3.4 – Isolation Forest

In [35]:
394/2399

0.16423509795748228

In [37]:
# Initialize Isolation Forest
iso_forest = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.16424,  # Approximate fraud rate
    random_state=RANDOM_SEED,
    n_jobs=-1
)

# Fit on X_train_normal only
iso_forest.fit(X_train_normal)

# Save the trained model to disk
joblib.dump(iso_forest, MODEL_DIR + 'isolation_forest.pkl')

print("Isolation Forest trained and saved.")

Isolation Forest trained and saved.


## 3.5 – One-Class SVM

In [41]:
# Initialize One-Class SVM
ocsvm = OneClassSVM(
    kernel='rbf',
    gamma='auto',
    nu=0.16424  # Approximate fraud rate
)

# Fit on X_train_normal only
ocsvm.fit(X_train_normal)

# Save the trained model to disk
joblib.dump(ocsvm, MODEL_DIR + 'ocsvm.pkl')

print("One-Class SVM trained and saved.")

One-Class SVM trained and saved.


## 3.6 – Autoencoder 

In [26]:
# Define a simple autoencoder architecture
input_dim = X_train_normal.shape[1]
encoding_dim = 16  # Tune as needed

input_layer = layers.Input(shape=(input_dim,))
encoded     = layers.Dense(encoding_dim, activation='relu')(input_layer)
decoded     = layers.Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = models.Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Train autoencoder on normal transactions only
history = autoencoder.fit(
    X_train_normal, X_train_normal,
    epochs=20,            # Adjust based on convergence
    batch_size=256,       # Adjust to fit your memory
    validation_split=0.1, # 10% for validation
    shuffle=True
)

# Save the entire model (architecture + weights)
autoencoder.save(MODEL_DIR + 'autoencoder.h5')

print("Autoencoder trained and saved.")

Epoch 1/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - loss: 2.0465 - val_loss: 1.7480
Epoch 2/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 1.9855 - val_loss: 1.7211
Epoch 3/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 2.0232 - val_loss: 1.6946
Epoch 4/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 1.8808 - val_loss: 1.6685
Epoch 5/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 1.9702 - val_loss: 1.6426
Epoch 6/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 1.8370 - val_loss: 1.6170
Epoch 7/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 1.9149 - val_loss: 1.5916
Epoch 8/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 1.7506 - val_loss: 1.5660
Epoch 9/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [



Autoencoder trained and saved.


## 3.8 – Evaluation Pipeline

### 3.8.a – Evaluate Isolation Forest on Test Set

In [39]:
y_pred_iso = iso_forest.predict(X_test)
# Convert to binary {0: normal, 1: fraud}
y_pred_iso_binary = np.where(y_pred_iso == -1, 1, 0)

print("Isolation Forest Classification Report:")
print(classification_report(y_test, y_pred_iso_binary))
print("ROC-AUC:", roc_auc_score(y_test, iso_forest.decision_function(X_test)))


Isolation Forest Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.84      0.90       601
           1       0.48      0.89      0.62        98

    accuracy                           0.85       699
   macro avg       0.73      0.86      0.76       699
weighted avg       0.91      0.85      0.86       699

ROC-AUC: 0.08643757003633398


### 3.8.b – Evaluate One-Class SVM on Test Set

In [43]:

y_pred_svm = ocsvm.predict(X_test)
y_pred_svm_binary = np.where(y_pred_svm == -1, 1, 0)

print("\nOne-Class SVM Classification Report:")
print(classification_report(y_test, y_pred_svm_binary))


One-Class SVM Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.84      0.91       601
           1       0.48      0.92      0.63        98

    accuracy                           0.85       699
   macro avg       0.73      0.88      0.77       699
weighted avg       0.91      0.85      0.87       699



# 3.8.d – Evaluate Autoencoder on Test Set

In [54]:
reconstructions = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)

# Choose threshold as 95th percentile of training MSE
reconstructions_train = autoencoder.predict(X_train_normal)
mse_train = np.mean(np.power(X_train_normal - reconstructions_train, 2), axis=1)
threshold = np.percentile(mse_train, 95)

y_pred_ae = (mse > threshold).astype(int)

print("Autoencoder Classification Report:")
print(classification_report(y_test, y_pred_ae))
print("ROC-AUC (approx with reconstruction error as score):", roc_auc_score(y_test, mse))

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Autoencoder Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.94      0.96       601
           1       0.71      0.86      0.77        98

    accuracy                           0.93       699
   macro avg       0.84      0.90      0.87       699
weighted avg       0.94      0.93      0.93       699

ROC-AUC (approx with reconstruction error as score): 0.9335461305986622
