In [1]:
import torch

from data import make_dataloader, make_dataset
from train_mlp import train, evaluate_model, calculate_metric
from models import MLP

## Load dataset

In [2]:
# Location of the credit-card CSV, relative to the notebook's working directory.
data_path = "./data/creditcard.csv"

# Build the train / validation / test datasets from the CSV.
# normalization and test_split_ratio are forwarded to data.make_dataset as-is.
train_dataset, val_dataset, test_dataset = make_dataset(
    data_path,
    normalization=True,
    test_split_ratio=0.2,
)

batch_size = 64

# One loader per split. The third positional argument is True only for the
# training split (presumably a shuffle flag -- confirm in data.make_dataloader).
train_dataloader = make_dataloader(batch_size, train_dataset, True)
val_dataloader = make_dataloader(batch_size, val_dataset, False)
test_dataloader = make_dataloader(batch_size, test_dataset, False)

## Baseline (without re-sampling)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# scikit-learn works on NumPy arrays, so pull the first two tensors
# (features, labels) out of every split.
X_train = train_dataset.tensors[0].numpy()
y_train = train_dataset.tensors[1].numpy()
X_val = val_dataset.tensors[0].numpy()
y_val = val_dataset.tensors[1].numpy()
X_test = test_dataset.tensors[0].numpy()
y_test = test_dataset.tensors[1].numpy()

# L2-regularised logistic regression on the original (un-resampled) training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
log_reg = LogisticRegression(
    penalty='l2',
    C=1.0,
    max_iter=100000,
    random_state=53,
).fit(X_train, y_train)

y_train_pred = log_reg.predict(X_train)
y_val_pred = log_reg.predict(X_val)
y_test_pred = log_reg.predict(X_test)

# Report per split. The first two arguments of calculate_metric are passed as
# None here, with labels and predictions given explicitly instead.
for header, y_true, y_hat in (
    ("Trainset\n", y_train, y_train_pred),
    ("Valset\n", y_val, y_val_pred),
    ("Testset\n", y_test, y_test_pred),
):
    print(header, calculate_metric(None, None, 'cls_report', y_true, y_hat))

### XGBoost

In [None]:
import xgboost as xgb

# Build the NumPy splits inside this cell instead of relying on X_train / y_train
# left behind by whichever Logistic Regression cell ran last. Those names are
# reassigned by the SMOTE and VAE sections, so without this the "baseline" model
# could silently be trained on re-sampled data after out-of-order execution.
X_train, y_train = train_dataset.tensors[0].numpy(), train_dataset.tensors[1].numpy()
X_val, y_val = val_dataset.tensors[0].numpy(), val_dataset.tensors[1].numpy()
X_test, y_test = test_dataset.tensors[0].numpy(), test_dataset.tensors[1].numpy()

# NOTE: `use_label_encoder` was dropped on purpose. It is an obsolete parameter
# in the XGBoost releases that accept `device=` and only produces an
# "unused parameter" warning there.
xgb_model = xgb.XGBClassifier(
    n_estimators=3000,            # Number of boosting rounds
    max_depth=6,                  # Maximum tree depth for base learners
    learning_rate=0.05,           # Boosting learning rate
    objective='binary:logistic',  # Binary classification
    eval_metric='aucpr',          # AUC-PR; no eval_set is passed to fit() below
    n_jobs=12,
    device="cuda",                # Requires a CUDA-enabled XGBoost build and a GPU
    verbosity=1,
)

xgb_model.fit(X_train, y_train)

y_train_pred = xgb_model.predict(X_train)
y_val_pred = xgb_model.predict(X_val)
y_test_pred = xgb_model.predict(X_test)

# The first two arguments of calculate_metric are passed as None here, with
# labels and predictions given explicitly instead.
print("Trainset\n", calculate_metric(None, None, 'cls_report', y_train, y_train_pred))
print("Valset\n", calculate_metric(None, None, 'cls_report', y_val, y_val_pred))
print("Testset\n", calculate_metric(None, None, 'cls_report', y_test, y_test_pred))

### MLP

In [None]:
# --- Model hyper-parameters ---
input_size = train_dataset.tensors[0].shape[1]  # feature count taken from the data
hidden_size = 256
num_layers = 4
dropout_rate = .3

# --- Optimisation hyper-parameters ---
lr = 3e-4
num_epochs = 20
seed = 53

exp_name = "baseline-mlp"

# train() only receives the seed after the model already exists, so seed the
# global torch RNG here to make the weight initialisation reproducible as well.
torch.manual_seed(seed)
model = MLP(input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout_rate=dropout_rate,)

# BCELoss expects probabilities in [0, 1]; this assumes MLP ends with a sigmoid
# -- TODO confirm in models.MLP.
train_loss_hist, train_acc_hist, val_acc_hist, model = train(
    model, train_dataloader, val_dataloader,
    num_epochs=num_epochs,
    optimizer=torch.optim.Adam(model.parameters(), lr=lr),
    criterion=torch.nn.BCELoss(),
    seed=seed,
    metric="f1",
    exp_name=exp_name,
)

# Evaluate the returned model on the held-out test loader using the
# 'cls_report' metric.
evaluate_model(exp_name, model, train_loss_hist, train_acc_hist, val_acc_hist, test_dataloader, "cls_report")

## With re-sampling (SMOTE)

In [14]:
from imbalance import augment_trainset

# Re-sample the training split with SMOTE. Per the recorded output of this cell,
# class 1.0 grows from 372 to 226013 examples, matching class 0.0.
a_smote_train_dataset = augment_trainset(
    train_dataset,
    method="smote",
)

# Loader over the augmented split, built with the same batch size and the same
# True flag as the original training loader.
a_smote_train_dataloader = make_dataloader(
    batch_size,
    a_smote_train_dataset,
    True,
)

original trainset distribution Counter({0.0: 226013, 1.0: 372})
after applying SMOTE: Counter({0.0: 226013, 1.0: 226013})


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Training arrays come from the SMOTE-augmented split; validation and test
# arrays come from the original, untouched splits.
X_train = a_smote_train_dataset.tensors[0].numpy()
y_train = a_smote_train_dataset.tensors[1].numpy()
X_val = val_dataset.tensors[0].numpy()
y_val = val_dataset.tensors[1].numpy()
X_test = test_dataset.tensors[0].numpy()
y_test = test_dataset.tensors[1].numpy()

# Same estimator settings as the baseline so only the training data differs.
# fit() returns the estimator itself, so construction and fitting can be chained.
log_reg = LogisticRegression(
    penalty='l2',
    C=1.0,
    max_iter=100000,
    random_state=53,
).fit(X_train, y_train)

y_train_pred = log_reg.predict(X_train)
y_val_pred = log_reg.predict(X_val)
y_test_pred = log_reg.predict(X_test)

# Note that the "Trainset" report is computed on the augmented training data.
for header, y_true, y_hat in (
    ("Trainset\n", y_train, y_train_pred),
    ("Valset\n", y_val, y_val_pred),
    ("Testset\n", y_test, y_test_pred),
):
    print(header, calculate_metric(None, None, 'cls_report', y_true, y_hat))

### XGBoost

In [None]:
import xgboost as xgb

# Build the NumPy splits inside this cell instead of relying on X_train / y_train
# left behind by whichever Logistic Regression cell ran last. Training uses the
# SMOTE-augmented split; validation and test use the original splits.
X_train, y_train = a_smote_train_dataset.tensors[0].numpy(), a_smote_train_dataset.tensors[1].numpy()
X_val, y_val = val_dataset.tensors[0].numpy(), val_dataset.tensors[1].numpy()
X_test, y_test = test_dataset.tensors[0].numpy(), test_dataset.tensors[1].numpy()

# NOTE: `use_label_encoder` was dropped on purpose. It is an obsolete parameter
# in the XGBoost releases that accept `device=` and only produces an
# "unused parameter" warning there.
xgb_model = xgb.XGBClassifier(
    n_estimators=3000,            # Number of boosting rounds
    max_depth=6,                  # Maximum tree depth for base learners
    learning_rate=0.05,           # Boosting learning rate
    objective='binary:logistic',  # Binary classification
    eval_metric='aucpr',          # AUC-PR; no eval_set is passed to fit() below
    n_jobs=12,
    device="cuda",                # Requires a CUDA-enabled XGBoost build and a GPU
    verbosity=1,
)

xgb_model.fit(X_train, y_train)

y_train_pred = xgb_model.predict(X_train)
y_val_pred = xgb_model.predict(X_val)
y_test_pred = xgb_model.predict(X_test)

# Note that the "Trainset" report is computed on the augmented training data.
print("Trainset\n", calculate_metric(None, None, 'cls_report', y_train, y_train_pred))
print("Valset\n", calculate_metric(None, None, 'cls_report', y_val, y_val_pred))
print("Testset\n", calculate_metric(None, None, 'cls_report', y_test, y_test_pred))

### MLP

In [None]:
# --- Model hyper-parameters ---
# Take the feature count from the data actually used for training rather than
# hard-coding 30, as the baseline MLP cell does.
input_size = a_smote_train_dataset.tensors[0].shape[1]
hidden_size = 256
num_layers = 4
dropout_rate = .3

# --- Optimisation hyper-parameters ---
lr = 3e-4
num_epochs = 20
seed = 53

exp_name = "smote+mlp"

# train() only receives the seed after the model already exists, so seed the
# global torch RNG here to make the weight initialisation reproducible as well.
torch.manual_seed(seed)
model = MLP(input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout_rate=dropout_rate,)

# exp_name is forwarded to train() exactly as in the baseline MLP cell. It was
# previously omitted here, so this run fell back to train()'s default
# (presumably the name used for saved artifacts -- confirm in train_mlp.train).
# BCELoss expects probabilities in [0, 1]; this assumes MLP ends with a sigmoid
# -- TODO confirm in models.MLP.
train_loss_hist, train_acc_hist, val_acc_hist, model = train(
    model, a_smote_train_dataloader, val_dataloader,
    num_epochs=num_epochs,
    optimizer=torch.optim.Adam(model.parameters(), lr=lr),
    criterion=torch.nn.BCELoss(),
    seed=seed,
    metric="f1",
    exp_name=exp_name,
)

# Evaluate the returned model on the original (non-augmented) test loader using
# the 'cls_report' metric.
evaluate_model(exp_name, model, train_loss_hist, train_acc_hist, val_acc_hist, test_dataloader, "cls_report")

## With re-sampling (VAE)

In [16]:
from imbalance import augment_trainset

# Re-sample the training split with the VAE method. Per the recorded output of
# this cell, 225641 fraud examples are sampled, taking class 1.0 from 372 to
# 226013 examples, matching class 0.0.
a_vae_train_dataset = augment_trainset(
    train_dataset,
    method="vae",
)

# Loader over the augmented split, built with the same batch size and the same
# True flag as the original training loader.
a_vae_train_dataloader = make_dataloader(
    batch_size,
    a_vae_train_dataset,
    True,
)

original trainset distribution Counter({0.0: 226013, 1.0: 372})
sampled 225641 # of fraud examples using VAE
after applying VAE: Counter({0.0: 226013, 1.0: 226013})


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Training arrays come from the VAE-augmented split; validation and test
# arrays come from the original, untouched splits.
X_train = a_vae_train_dataset.tensors[0].numpy()
y_train = a_vae_train_dataset.tensors[1].numpy()
X_val = val_dataset.tensors[0].numpy()
y_val = val_dataset.tensors[1].numpy()
X_test = test_dataset.tensors[0].numpy()
y_test = test_dataset.tensors[1].numpy()

# Same estimator settings as the baseline so only the training data differs.
# fit() returns the estimator itself, so construction and fitting can be chained.
log_reg = LogisticRegression(
    penalty='l2',
    C=1.0,
    max_iter=100000,
    random_state=53,
).fit(X_train, y_train)

y_train_pred = log_reg.predict(X_train)
y_val_pred = log_reg.predict(X_val)
y_test_pred = log_reg.predict(X_test)

# Note that the "Trainset" report is computed on the augmented training data.
for header, y_true, y_hat in (
    ("Trainset\n", y_train, y_train_pred),
    ("Valset\n", y_val, y_val_pred),
    ("Testset\n", y_test, y_test_pred),
):
    print(header, calculate_metric(None, None, 'cls_report', y_true, y_hat))

### XGBoost

In [None]:
import xgboost as xgb

# Build the NumPy splits inside this cell instead of relying on X_train / y_train
# left behind by whichever Logistic Regression cell ran last. Training uses the
# VAE-augmented split; validation and test use the original splits.
X_train, y_train = a_vae_train_dataset.tensors[0].numpy(), a_vae_train_dataset.tensors[1].numpy()
X_val, y_val = val_dataset.tensors[0].numpy(), val_dataset.tensors[1].numpy()
X_test, y_test = test_dataset.tensors[0].numpy(), test_dataset.tensors[1].numpy()

# NOTE: `use_label_encoder` was dropped on purpose. It is an obsolete parameter
# in the XGBoost releases that accept `device=` and only produces an
# "unused parameter" warning there.
xgb_model = xgb.XGBClassifier(
    n_estimators=3000,            # Number of boosting rounds
    max_depth=6,                  # Maximum tree depth for base learners
    learning_rate=0.05,           # Boosting learning rate
    objective='binary:logistic',  # Binary classification
    eval_metric='aucpr',          # AUC-PR; no eval_set is passed to fit() below
    n_jobs=12,
    device="cuda",                # Requires a CUDA-enabled XGBoost build and a GPU
    verbosity=1,
)

xgb_model.fit(X_train, y_train)

y_train_pred = xgb_model.predict(X_train)
y_val_pred = xgb_model.predict(X_val)
y_test_pred = xgb_model.predict(X_test)

# Note that the "Trainset" report is computed on the augmented training data.
print("Trainset\n", calculate_metric(None, None, 'cls_report', y_train, y_train_pred))
print("Valset\n", calculate_metric(None, None, 'cls_report', y_val, y_val_pred))
print("Testset\n", calculate_metric(None, None, 'cls_report', y_test, y_test_pred))

### MLP

In [None]:
# --- Model hyper-parameters ---
# Take the feature count from the data actually used for training rather than
# hard-coding 30, as the baseline MLP cell does.
input_size = a_vae_train_dataset.tensors[0].shape[1]
hidden_size = 256
num_layers = 4
dropout_rate = .3

# --- Optimisation hyper-parameters ---
lr = 3e-4
num_epochs = 20
seed = 53

exp_name = "vae+mlp"

# train() only receives the seed after the model already exists, so seed the
# global torch RNG here to make the weight initialisation reproducible as well.
torch.manual_seed(seed)
model = MLP(input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout_rate=dropout_rate,)

# exp_name is forwarded to train() exactly as in the baseline MLP cell. It was
# previously omitted here, so this run fell back to train()'s default
# (presumably the name used for saved artifacts -- confirm in train_mlp.train).
# BCELoss expects probabilities in [0, 1]; this assumes MLP ends with a sigmoid
# -- TODO confirm in models.MLP.
train_loss_hist, train_acc_hist, val_acc_hist, model = train(
    model, a_vae_train_dataloader, val_dataloader,
    num_epochs=num_epochs,
    optimizer=torch.optim.Adam(model.parameters(), lr=lr),
    criterion=torch.nn.BCELoss(),
    seed=seed,
    metric="f1",
    exp_name=exp_name,
)

# Evaluate the returned model on the original (non-augmented) test loader using
# the 'cls_report' metric.
evaluate_model(exp_name, model, train_loss_hist, train_acc_hist, val_acc_hist, test_dataloader, "cls_report")