In [1]:
import torch
import datetime
import time
import wandb

In [2]:
# Execute the shared helper script; %run loads the names it defines into this notebook's namespace.
%run ../shared_functions.py

In [3]:
# Execute the project-specific helper script; %run loads the names it defines into this notebook's namespace.
%run ../my_shared_functions.py

In [4]:
DIR_INPUT = '../../fraud-detection-handbook/simulated-data-transformed/data/'

BEGIN_DATE = "2018-06-11"
END_DATE = "2018-09-14"

print("Load  files")
%time transactions_df=read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE)
print("{0} transactions loaded, containing {1} fraudulent transactions".format(len(transactions_df),transactions_df.TX_FRAUD.sum()))

output_feature="TX_FRAUD"

input_features=['TX_AMOUNT','TX_DURING_WEEKEND', 'TX_DURING_NIGHT', 'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW', 'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW', 'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW', 'TERMINAL_ID_NB_TX_1DAY_WINDOW',
       'TERMINAL_ID_RISK_1DAY_WINDOW', 'TERMINAL_ID_NB_TX_7DAY_WINDOW',
       'TERMINAL_ID_RISK_7DAY_WINDOW', 'TERMINAL_ID_NB_TX_30DAY_WINDOW',
       'TERMINAL_ID_RISK_30DAY_WINDOW']

Load  files
CPU times: total: 453 ms
Wall time: 444 ms
919767 transactions loaded, containing 8195 fraudulent transactions


In [5]:
# Fix the seed through the shared seed_everything helper
SEED = 42
seed_everything(SEED)

# Period lengths in days (they are passed to datetime.timedelta(days=...) below)
delta_train = delta_delay = delta_test = 7
delta_valid = delta_test

start_date_training = datetime.datetime.strptime("2018-07-25", "%Y-%m-%d")

# NOTE(review): computed here but not used anywhere in this cell
start_date_training_with_valid = start_date_training - datetime.timedelta(days=delta_delay + delta_valid)

# The split is anchored (delta_train + delta_delay) days after start_date_training
train_df, valid_df = get_train_test_set(
    transactions_df,
    start_date_training + datetime.timedelta(days=delta_train + delta_delay),
    delta_train=delta_train,
    delta_delay=delta_delay,
    delta_test=delta_test,
)

train_df, valid_df = scaleData(train_df, valid_df, input_features)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Selected device is", DEVICE)

Selected device is cuda


In [7]:
# Convert the feature columns and the label column of both splits to float tensors
x_train, x_valid = (
    torch.FloatTensor(split_df[input_features].values) for split_df in (train_df, valid_df)
)
y_train, y_valid = (
    torch.FloatTensor(split_df[output_feature].values) for split_df in (train_df, valid_df)
)

# Wrap the tensors (moved to DEVICE) in the project's FraudDataset
training_set = FraudDataset(x_train.to(DEVICE), y_train.to(DEVICE))
valid_set = FraudDataset(x_valid.to(DEVICE), y_valid.to(DEVICE))

# Build the batch generators for both splits
training_generator, valid_generator = prepare_generators(
    training_set,
    valid_set,
    batch_size=64,
)

In [6]:
# Run configuration tracked by Weights & Biases.
# The split-related entries are derived from the variables that actually drive
# get_train_test_set, so the logged configuration cannot drift from what was run.
config_mlp = dict(
    dataset_id = 'fraud-detection-handbook-transformed',
    validation = 'train test split',
    seed = SEED,
    # Same date that is passed to get_train_test_set as the start of the split
    begin_date = (start_date_training
                  + datetime.timedelta(days=delta_train + delta_delay)).strftime('%Y-%m-%d'),
    delta_train = delta_train,
    delta_delay = delta_delay,
    delta_test = delta_test,
    # NOTE(review): must be kept equal to the batch_size given to prepare_generators
    batch_size=64,
    num_workers=0,
    hidden_size = 1000,
    optimizer='adam',
    lr=0.0001,
    early_stopping=True,
    early_stopping_patience=2,
    max_epochs=100,
    scale=True,
    criterion='bce'
)
wandb.init(project="mgr-anomaly-tsxai-project", config=config_mlp, tags=['mlp', 'imbalance-not-considered', 'constant_lr_0_0001'])
# From here on, read hyperparameters back from the wandb config object
config_mlp = wandb.config

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mchamera[0m ([33mmgr-anomaly-tsxai[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666687282, max=1.0…

In [9]:
# Build the model, optimizer and loss from the values stored in the wandb config
model = SimpleFraudMLP(len(input_features), config_mlp.hidden_size).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=config_mlp.lr)
criterion = torch.nn.BCELoss().to(DEVICE)

# Train; the helper returns the model, the training time and the loss histories
model, training_execution_time, train_losses_adam, valid_losses_adam = training_loop_and_saving_best_wandb(
    model, training_generator, valid_generator, optimizer, criterion, verbose=True,
    save_path='../models/DL/mlp_adam/simple_mlp_model_adam_lr_0_0001.pt')

wandb.log({'Training execution time': training_execution_time})
print(training_execution_time)

# Time one forward pass over the whole validation set.
start_time = time.time()
# no need to set model in eval mode since there are no BN, Dropout layers
# Inference only: disable autograd so no graph is built for the full validation batch
with torch.no_grad():
    predictions_test = model(x_valid.to(DEVICE))
# CUDA kernels are launched asynchronously; wait for them before stopping the clock
if DEVICE == "cuda":
    torch.cuda.synchronize()
prediction_execution_time = time.time() - start_time
wandb.log({'Prediction execution time': prediction_execution_time})
print(prediction_execution_time)

# Work on a copy so that adding the predictions column does not mutate valid_df
predictions_df = valid_df.copy()
predictions_df['predictions'] = predictions_test.cpu().numpy()

performance_df = performance_assessment_f1_included(predictions_df, top_k_list=[100])

# Log each evaluation metric from the first row of the performance table
for metric_name in ('AUC ROC', 'Average precision', 'F1 score', 'Card Precision@100'):
    wandb.log({metric_name: performance_df.loc[0, metric_name]})

# Upload the directory holding the saved model as a wandb artifact, then close the run
mlp_artifact = wandb.Artifact('mlp_adam_lr_0_0001', type='mlp', description='trained simple multilayer perceptron with 1 hidden layer and adam optimizer, lr=0.0001')
mlp_artifact.add_dir('../models/DL/mlp_adam')
wandb.log_artifact(mlp_artifact)
wandb.finish()


Epoch 0: train loss: 0.09847772872302721
valid loss: 0.04099373452839229
New best score: 0.04099373452839229

Epoch 1: train loss: 0.03530663097529425
valid loss: 0.02798019776022748
New best score: 0.02798019776022748

Epoch 2: train loss: 0.027268765705306477
valid loss: 0.023891059650882278
New best score: 0.023891059650882278

Epoch 3: train loss: 0.024753110152354076
valid loss: 0.022424466867896467
New best score: 0.022424466867896467

Epoch 4: train loss: 0.023589085577343464
valid loss: 0.02170663549571927
New best score: 0.02170663549571927

Epoch 5: train loss: 0.022813489372079377
valid loss: 0.021437506668709767
New best score: 0.021437506668709767

Epoch 6: train loss: 0.022218593811745922
valid loss: 0.021000493722998437
New best score: 0.021000493722998437

Epoch 7: train loss: 0.021828248174758973
valid loss: 0.020933409555015572
New best score: 0.020933409555015572

Epoch 8: train loss: 0.021417166018841683
valid loss: 0.021208055898202013
1  iterations since best sco

[34m[1mwandb[0m: Adding directory to artifact (.\..\models\DL\mlp_adam)... Done. 0.0s


0,1
AUC ROC,▁
Average precision,▁
Card Precision@100,▁
F1 score,▁
Prediction execution time,▁
Training execution time,▁
train loss,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val loss,█▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
AUC ROC,0.889
Average precision,0.661
Card Precision@100,0.303
F1 score,0.646
Prediction execution time,0.005
Training execution time,98.318
train loss,0.01866
val loss,0.01926
