# This notebook is dedicated to the state-of-the art model implementation of the Research Project
### MSC/DSA/134

In [1]:
# import required packages
from sklearn.svm import SVC
import time
import numpy as np
import pandas as pd
from globals.pandas_functions import *
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#### State-of-the-art Models to be implemented:
- SVM
- XGBoost

## SVM Model Implementation

In [2]:
# load preprocessed training and testing data
data_base_path = "data/processed/null_value_option_1/scaled_and_balanced"

X_train = pd.read_csv(f"{data_base_path}/pca_selected_features/unified_transaction_data_option1_x_train_pca.csv")
X_test = pd.read_csv(f"{data_base_path}/pca_selected_features/unified_transaction_data_option1_x_test_pca.csv")
y_train = pd.read_csv(f"{data_base_path}/unified_transaction_data_option1_y_train_balanced.csv")
y_test = pd.read_csv(f"{data_base_path}/unified_transaction_data_option1_y_test.csv")

In [9]:
sample_size = 100000

In [3]:
dataset_dimension("X_train", X_train)
dataset_dimension("X_test", X_test)

X_train dataset dimension: (911764, 11)
X_test dataset dimension: (118102, 11)


model model parameters are inspired from the literature review <br>
set of model parameters are defined in this Google Sheet: https://docs.google.com/spreadsheets/d/17DAOxBz-xashyfk6qFNaAYGm2XN98F8GiNUozh2BRlc/edit?usp=sharing

In [8]:
# define and train SVM model
max_iter = 1000000
svm_model = SVC(kernel="linear", C=1.0, random_state=42, max_iter=max_iter, verbose=1)


start_time = time.time()
svm_model.fit(X_train[:sample_size].to_numpy(), y_train[:sample_size].to_numpy().ravel())
end_tme = time.time()

NameError: name 'sample_size' is not defined

In [None]:
# TODO: Evaluate SVM model

## XGB Model Implementation

In [28]:
gpu_xgb_model = xgb.XGBClassifier(
    tree_method="hist",
    predictor="gpu_predictor",
    objective="binary:logistic",
    max_depth=7,
    learning_rate=0.001,
    n_estimators=520,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
)

In [29]:
start_time = time.time()
gpu_xgb_model.fit(X_train.to_numpy(), y_train.to_numpy().ravel(), verbose=50)
end_tme = time.time()

Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [30]:
print(f"XGB Training Time: {end_tme - start_time} seconds")

XGB Training Time: 27.330517053604126 seconds


In [31]:
# make predictions
y_pred = gpu_xgb_model.predict(X_test.to_numpy())
y_pred_probability = gpu_xgb_model.predict_proba(X_test.to_numpy())

In [32]:
print(y_pred[:5])
print(y_pred_probability[:5])

[0 0 1 0 1]
[[0.59790254 0.4020975 ]
 [0.5186999  0.4813001 ]
 [0.30991864 0.69008136]
 [0.5139211  0.48607892]
 [0.39326686 0.60673314]]


In [33]:
gpu_xgb_accuracy = accuracy_score(y_test, y_pred)
print(f"XGB Test Accuracy: {gpu_xgb_accuracy:.4f}")

XGB Test Accuracy: 0.7989
