# This notebook is dedicated to the state-of-the art model implementation of the Research Project
### MSC/DSA/134

In [1]:
# import required packages
from sklearn.svm import SVC
import time
from globals.pandas_functions import *
import xgboost as xgb
from sklearn.metrics import accuracy_score
import joblib
import globals.data_utils as data_utils

In [2]:
# import evaluation functions
from globals.model_evaluations import (
    evaluate_accuracy, evaluate_precision, evaluate_recall, 
    evaluate_f1_score, evaluate_roc_auc
)
from sklearn.metrics import roc_auc_score, average_precision_score

In [3]:
# define model export path
fitted_models_base = "models/state_of_the_art/"

#### State-of-the-art Models to be implemented:
- SVM
- XGBoost

## SVM Model Implementation

In [2]:
# load preprocessed training and testing data
data_base_path = "data/processed/null_value_option_1/scaled_and_balanced"

X_train = pd.read_csv(f"{data_base_path}/pca_selected_features/unified_transaction_data_option1_x_train_pca.csv")
X_test = pd.read_csv(f"{data_base_path}/pca_selected_features/unified_transaction_data_option1_x_test_pca.csv")
y_train = pd.read_csv(f"{data_base_path}/unified_transaction_data_option1_y_train_balanced.csv")
y_test = pd.read_csv(f"{data_base_path}/unified_transaction_data_option1_y_test.csv")

In [4]:
# without PCA selected features
data_base_path = "data/processed/null_value_option_1_with_validation_set/scaled_and_balanced"


X_train = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_x_train_balanced.csv")
X_validation = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_x_validation_scaled.csv")
X_test = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_x_test_scaled.csv")

y_train = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_y_train_balanced.csv")
y_validation = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_y_validation.csv")
y_test = pd.read_csv(f"{data_base_path}/unified_transaction_data_option2_y_test.csv")

In [5]:
sample_size = 100000

In [6]:
dataset_dimension("X_train", X_train)
dataset_dimension("X_test", X_test)

X_train dataset dimension: (683822, 26)
X_test dataset dimension: (118102, 26)


In [7]:
# get sample for SVM training due to computational constraints
X_train_sample, y_train_sample = data_utils.get_stratified_sample(X_train, y_train, sample_size)

In [12]:
# view class distributions
data_utils.show_class_distribution(X_train, y_train.to_numpy().ravel(), "Class distribution of original dataset")
data_utils.show_class_distribution(X_train_sample, y_train_sample.to_numpy().ravel(), "Class distribution of SVM training sample dataset")


Class distribution of original dataset:
  Total samples: 683822
  Y df samples:  [0 0 0 ... 1 1 1]
  Class 0 (non-fraud): 341911 (50.00%)
  Class 1 (fraud): 341911 (50.00%)

Class distribution of SVM training sample dataset:
  Total samples: 100000
  Y df samples:  [0 0 1 ... 0 0 1]
  Class 0 (non-fraud): 50000 (50.00%)
  Class 1 (fraud): 50000 (50.00%)


model model parameters are inspired from the literature review <br>
set of model parameters are defined in this Google Sheet: https://docs.google.com/spreadsheets/d/17DAOxBz-xashyfk6qFNaAYGm2XN98F8GiNUozh2BRlc/edit?usp=sharing

In [13]:
# define and train SVM model
max_iter = 1000000
svm_model = SVC(kernel="linear", C=1.0, random_state=42, max_iter=max_iter, verbose=1, probability=True)


start_time = time.time()
svm_model.fit(X_train[:sample_size].to_numpy(), y_train[:sample_size].to_numpy().ravel())
end_tme = time.time()

[LibSVM]



In [14]:
# export svm
model_name = "svm_linear_model.joblib"
joblib.dump(svm_model, fitted_models_base + model_name)

['models/state_of_the_art/svm_linear_model.joblib']

In [15]:
y_pred = svm_model.predict(X_test.to_numpy())
y_pred_probability = svm_model.predict_proba(X_test.to_numpy())

In [None]:
svm_accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Test Accuracy: {svm_accuracy:.4f}")

## SVM Model Evaluation

In [17]:
# comprehensive SVM evaluation metrics
y_test_flat = y_test.to_numpy().ravel()
y_pred_proba_pos = y_pred_probability[:, 1]

print("="*50)
print("SVM MODEL EVALUATION")
print("="*50)

# basic metrics
svm_acc = evaluate_accuracy(y_test_flat, y_pred)
svm_prec = evaluate_precision(y_test_flat, y_pred, average="binary")
svm_rec = evaluate_recall(y_test_flat, y_pred, average="binary")
svm_f1 = evaluate_f1_score(y_test_flat, y_pred, average="binary")

# auc metrics
svm_auroc = roc_auc_score(y_test_flat, y_pred_proba_pos)
svm_auprc = average_precision_score(y_test_flat, y_pred_proba_pos)

print(f"AUROC: {svm_auroc:.4f}")
print(f"AUPRC: {svm_auprc:.4f}")

print(f"\nSummary Metrics:")
print(f"  Accuracy:  {svm_acc:.4f}")
print(f"  Precision: {svm_prec:.4f}")
print(f"  Recall:    {svm_rec:.4f}")
print(f"  F1 Score:  {svm_f1:.4f}")
print(f"  AUROC:     {svm_auroc:.4f}")
print(f"  AUPRC:     {svm_auprc:.4f}")

SVM MODEL EVALUATION
Accuracy: 0.9659 (96.59%)
Precision (binary): 0.9569
Recall (binary): 0.0269
F1-Score (binary): 0.0523
AUROC: 0.6838
AUPRC: 0.2699

Summary Metrics:
  Accuracy:  0.9659
  Precision: 0.9569
  Recall:    0.0269
  F1 Score:  0.0523
  AUROC:     0.6838
  AUPRC:     0.2699


## XGB Model Implementation

In [11]:
gpu_xgb_model = xgb.XGBClassifier(
    tree_method="hist",
    predictor="gpu_predictor",
    objective="binary:logistic",
    max_depth=7,
    learning_rate=0.001,
    n_estimators=520,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
)

In [12]:
start_time = time.time()
gpu_xgb_model.fit(X_train.to_numpy(), y_train.to_numpy().ravel(), verbose=50)
end_tme = time.time()

Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [30]:
print(f"XGB Training Time: {end_tme - start_time} seconds")

XGB Training Time: 27.330517053604126 seconds


In [19]:
# export xgboost model
model_name = "xgboost_model.joblib"
joblib.dump(gpu_xgb_model, fitted_models_base + model_name)

['models/state_of_the_art/xgboost_model.joblib']

In [13]:
# make predictions
y_pred = gpu_xgb_model.predict(X_test.to_numpy())
y_pred_probability = gpu_xgb_model.predict_proba(X_test.to_numpy())

In [32]:
print(y_pred[:5])
print(y_pred_probability[:5])

[0 0 1 0 1]
[[0.59790254 0.4020975 ]
 [0.5186999  0.4813001 ]
 [0.30991864 0.69008136]
 [0.5139211  0.48607892]
 [0.39326686 0.60673314]]


In [33]:
gpu_xgb_accuracy = accuracy_score(y_test, y_pred)
print(f"XGB Test Accuracy: {gpu_xgb_accuracy:.4f}")

XGB Test Accuracy: 0.7989


## XGBoost Model Evaluation

In [14]:
# comprehensive XGBoost evaluation metrics
y_test_flat = y_test.to_numpy().ravel()
y_pred_proba_pos = y_pred_probability[:, 1]

print("="*50)
print("XGBOOST MODEL EVALUATION")
print("="*50)

# basic metrics
xgb_acc = evaluate_accuracy(y_test_flat, y_pred)
xgb_prec = evaluate_precision(y_test_flat, y_pred, average="binary")
xgb_rec = evaluate_recall(y_test_flat, y_pred, average="binary")
xgb_f1 = evaluate_f1_score(y_test_flat, y_pred, average="binary")

# auc metrics
xgb_auroc = roc_auc_score(y_test_flat, y_pred_proba_pos)
xgb_auprc = average_precision_score(y_test_flat, y_pred_proba_pos)

print(f"AUROC: {xgb_auroc:.4f}")
print(f"AUPRC: {xgb_auprc:.4f}")

print(f"\nSummary Metrics:")
print(f"  Accuracy:  {xgb_acc:.4f}")
print(f"  Precision: {xgb_prec:.4f}")
print(f"  Recall:    {xgb_rec:.4f}")
print(f"  F1 Score:  {xgb_f1:.4f}")
print(f"  AUROC:     {xgb_auroc:.4f}")
print(f"  AUPRC:     {xgb_auprc:.4f}")

XGBOOST MODEL EVALUATION
Accuracy: 0.9348 (93.48%)
Precision (binary): 0.2744
Recall (binary): 0.5248
F1-Score (binary): 0.3604
AUROC: 0.8465
AUPRC: 0.4282

Summary Metrics:
  Accuracy:  0.9348
  Precision: 0.2744
  Recall:    0.5248
  F1 Score:  0.3604
  AUROC:     0.8465
  AUPRC:     0.4282
