<a href="https://colab.research.google.com/github/JayNguyen-123/Anomaly_Detection_Credit_Card_Transactions./blob/main/Anomaly_Detection_Credit_card_Transactions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Anomaly Detection
- Anomaly detection is a critical component of data analysis across various domains such as financial, cybersecurity, healthcare and more.
- Anomalies, often referred to as outliers or anomalies, are data points or observations that significantly deviate from the expected or normal behavior within a dataset. These deviations can be caused by various factors, such as errors in data collection, rare events, system malfunctions, or even intentional fraudulent activities.




In [1]:
!pip install pyod

import pandas as pd
import numpy as np
from pyod.models.xgbod import XGBOD
from pyod.models.pca import PCA
from pyod.models.iforest import IForest
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (precision_recall_curve, average_precision_score, roc_auc_score)

from xgboost import XGBClassifier

Collecting pyod
  Downloading pyod-2.0.5-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading pyod-2.0.5-py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.6/200.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyod
Successfully installed pyod-2.0.5


In [4]:
# Load data
df = pd.read_csv('/content/creditcard.csv')
X, y = df.drop(columns='Class').values, df['Class'].values

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Dataset shape: {X.shape}")
print(f"Fraud rate (%): {y.mean()*100:.4f}")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


Dataset shape: (284807, 30)
Fraud rate (%): 0.1727
Training set: 199364 samples
Test set: 85443 samples


In [5]:
def create_supervised_labels(y_train, supervision_ratio=0.01):
  """ Create supervised labels on supervision ratio."""

  fraud_indices = np.where(y_train == 1)[0]
  n_labeled_fraud = int(len(fraud_indices) * supervision_ratio)

  # Randomly select labeled samples
  labeled_fraud_idx = np.random.choice(fraud_indices,
                                       n_labeled_fraud, replace=False)

  # Create labels
  y_labels = np.zeros_like(y_train)
  y_labels[labeled_fraud_idx] = 1

  # Calculate how many true are in the "unlabeled" set
  unlabeled_fraud_count = len(fraud_indices) - n_labeled_fraud

  return y_labels, labeled_fraud_idx, unlabeled_fraud_count


In [6]:
def evaluate_model(model, X_test, y_test, model_name):
  """Evaluate a single model and return metrics."""

  # Get anomaly scores
  scores = model.decision_function(X_test)

  # Calculate metrics
  auc_pr = average_precision_score(y_test, scores)

  return {
      'model': model_name,
      'auc_pr': auc_pr,
      'scores': scores
  }

In [7]:
def create_model(model, X_test, y_test, model_name):
  """Evaluate a single model and return metrics."""
  # Get anomaly scores
  scores = model.decision_function(X_test)

  # Calculate metrics
  auc_pr = average_precision_score(y_test, scores)

  return {
      'model': model_name,
      'auc_pr': auc_pr,
      'scores': scores
  }

In [8]:
# Unsupervised Anomaly Detection
models = {
    'IsolationForest': IForest(random_state=42),
    'CBLOF': CBLOF(),
    'HBOS': HBOS(),
    'PCA': PCA(),
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train)
    result = evaluate_model(model, X_test, y_test, name)
    print(f"{name:20} - AUC-PR: {result['auc_pr']:.4f}")


Training IsolationForest...
IsolationForest      - AUC-PR: 0.1497
Training CBLOF...
CBLOF                - AUC-PR: 0.1521
Training HBOS...
HBOS                 - AUC-PR: 0.2488
Training PCA...
PCA                  - AUC-PR: 0.1411


- With zero hyperparameter tuning, none of the algorithms delivered very promising results, as their AUCPR values (~0.15–0.25) may fall short of the very high precision/recall often required in fraud-detection settings.
- However, we should note that, unlike AUC-ROC, which has a baseline value of 0.5, the baseline AUCPR depends on the prevalence of the positive class. For our current dataset, since only 0.17% of the samples are fraud, a naive classifier that guesses randomly would have an AUCPR ≈ 0.0017. In that sense, all detectors already outperform random guessing by a wide margin.


### XGBOD Approach
- XGBOD(Extreme Gradient Boosting Outliner Detection) is a semi-supervised framework designed for high-perfromance outliner detection.
- It combines the strengths of both supervised and unsupervised learning methods to enhance the detection of outliers.


In [9]:
supervision_ratios = [0.01, 0.02, 0.05, 0.1, 0.15, 0.2]

for ratio in supervision_ratios:
  # Create supervised labels
  y_labels, labeled_fraud_idx, unlabeled_fraud_count = create_supervised_labels(y_train, ratio)

  total_fraud = sum(y_train)
  labeled_fraud = sum(y_labels)

  print(f"Know frauds (labeled as 1): {labeled_fraud}")
  print(f"Hidden frauds in 'normal' data: {unlabeled_fraud_count}")
  print(f"Total samples treated as normal: {len(y_train) - labeled_fraud}")
  print(f"Fraud contamination in 'normal' set: {unlabeled_fraud_count / (len(y_train) - labeled_fraud) * 100:.3f}%")

  # Train XGBOD models
  xgbod = XGBOD(estimator_list=[PCA(), CBLOF(), IForest(), HBOS()],
                random_state=42,
                n_estimators=200, learning_rate=0.1,
                eval_metric='aucpr')

  xgbod.fit(X_train, y_labels)
  result = evaluate_model(xgbod, X_test, y_test, f"XGBOD_ratio_{ratio:.3f}")
  print(f"xgbod - AUC-PR: {result['auc_pr']:.4f}")


Know frauds (labeled as 1): 3
Hidden frauds in 'normal' data: 341
Total samples treated as normal: 199361
Fraud contamination in 'normal' set: 0.171%


Parameters: { "silent" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgbod - AUC-PR: 0.5164
Know frauds (labeled as 1): 6
Hidden frauds in 'normal' data: 338
Total samples treated as normal: 199358
Fraud contamination in 'normal' set: 0.170%


Parameters: { "silent" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgbod - AUC-PR: 0.4546
Know frauds (labeled as 1): 17
Hidden frauds in 'normal' data: 327
Total samples treated as normal: 199347
Fraud contamination in 'normal' set: 0.164%


Parameters: { "silent" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgbod - AUC-PR: 0.6606
Know frauds (labeled as 1): 34
Hidden frauds in 'normal' data: 310
Total samples treated as normal: 199330
Fraud contamination in 'normal' set: 0.156%


Parameters: { "silent" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgbod - AUC-PR: 0.6965
Know frauds (labeled as 1): 51
Hidden frauds in 'normal' data: 293
Total samples treated as normal: 199313
Fraud contamination in 'normal' set: 0.147%


Parameters: { "silent" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgbod - AUC-PR: 0.7161
Know frauds (labeled as 1): 68
Hidden frauds in 'normal' data: 276
Total samples treated as normal: 199296
Fraud contamination in 'normal' set: 0.138%


Parameters: { "silent" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgbod - AUC-PR: 0.7237


In [12]:
# Supervised Learning
for ratio in supervision_ratios:

  # Create supervised labels
  y_label, labeled_fraud_idx, unlabeled_fraud_count = create_supervised_labels(y_train, ratio)

  clf = XGBClassifier(n_estimators=200, random_state=42,
                      learning_rate=0.1, eval_metric='aucpr')
  clf.fit(X_train, y_label)

  y_pred_proba = clf.predict_proba(X_test)[:, 1]
  auc_pr = average_precision_score(y_test, y_pred_proba)
  print(f"XGBoost - AUC-PR: {auc_pr:.4f}")



XGBoost - AUC-PR: 0.4786
XGBoost - AUC-PR: 0.4202
XGBoost - AUC-PR: 0.6502
XGBoost - AUC-PR: 0.5564
XGBoost - AUC-PR: 0.7312
XGBoost - AUC-PR: 0.6318
