In [1]:
import lightgbm as lgb
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
!pip install optuna

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import sys
sys.path.append('/content/drive/MyDrive/Freshman/UROP/Transformer')

import data_preprocessing

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0
Mounted at /content/drive


In [None]:
data_raw = pd.read_csv('/content/drive/MyDrive/Freshman/UROP/fraud_detection-main/credit_card_transactions.csv')

train_data_raw = data_raw.sample(frac=0.4, random_state=42)

test_data_raw = data_raw.drop(train_data_raw.index)
val_data_raw = train_data_raw.sample(frac=0.5, random_state=42)
train_data_raw = train_data_raw.drop(val_data_raw.index)

print(train_data_raw.shape)
print(test_data_raw.shape)
print(val_data_raw.shape)

(259335, 24)
(778005, 24)
(259335, 24)


Data Preprocessing

In [None]:
from data_preprocessing import data_preprocessing

train_data_processed, val_data_processed, test_data_processed = data_preprocessing(train_data_raw.sort_values(by = ["cc_num"])[:2000], val_data_raw.sort_values(by = ["cc_num"])[:2000], test_data_raw.sort_values(by = ["cc_num"])[:10000])

train_data_processed = train_data_processed[train_data_processed['is_fraud'] == 0]
val_data_processed = val_data_processed[val_data_processed['is_fraud'] == 0]

In [None]:
from sklearn.preprocessing import StandardScaler

train_data = train_data_processed.drop(['lat', 'long', 'city_pop', 'state', 'gender', 'Year'], axis=1)
val_data = val_data_processed.drop(['lat', 'long', 'city_pop', 'state', 'gender', 'Year'], axis=1)
test_data = test_data_processed.drop(['lat', 'long', 'city_pop', 'state', 'gender', 'Year'], axis=1)

for feature in ['amt', 'merch_lat', 'merch_long', 'LatLong_Dist']:
  train_data[feature] = StandardScaler().fit_transform(train_data[feature].values.reshape(-1, 1))
  val_data[feature] = StandardScaler().fit_transform(val_data[feature].values.reshape(-1, 1))
  test_data[feature] = StandardScaler().fit_transform(test_data[feature].values.reshape(-1, 1))

for feature in ['category', 'Hour', 'Minute', 'Second', 'dayOfWeek']:
  train_data[feature] += 1
  val_data[feature] += 1
  test_data[feature] += 1

column_order = ['cc_num', 'is_fraud', 'category', 'Month', 'Day', 'Hour', 'Minute', 'Second', 'dayOfWeek', 'amt', 'merch_lat', 'merch_long', 'LatLong_Dist']
train_data = train_data[column_order]
val_data = val_data[column_order]
test_data = test_data[column_order]

In [None]:
train_data.to_csv('/content/drive/MyDrive/train_data_transformer.csv', index=False)
val_data.to_csv('/content/drive/MyDrive/val_data_transformer.csv', index=False)
test_data.to_csv('/content/drive/MyDrive/test_data_transformer.csv', index=False)

In [None]:
train_data.head()

Unnamed: 0,cc_num,is_fraud,category,Month,Day,Hour,Minute,Second,dayOfWeek,amt,merch_lat,merch_long,LatLong_Dist
1017,60416207185,0,1,1,1,13,48,16,2,-0.109544,1.588864,-0.853192,1.782398
2726,60416207185,0,2,1,2,9,48,37,3,0.012037,1.340656,-0.812885,-1.863295
2907,60416207185,0,3,1,2,14,11,47,3,-0.077186,1.428932,-0.739323,-0.057116
10739,60416207185,0,4,1,7,13,59,20,1,0.210423,1.115036,-0.762164,1.316437
19025,60416207185,0,5,1,12,15,59,15,6,-0.001339,1.211158,-0.782055,-0.360465


In [None]:
val_data.head()

Unnamed: 0,cc_num,is_fraud,category,Month,Day,Hour,Minute,Second,dayOfWeek,amt,merch_lat,merch_long,LatLong_Dist
5467,60416207185,0,1,1,4,14,60,56,5,0.41433,1.455639,-0.774439,-0.617989
7473,60416207185,0,2,1,5,22,35,21,6,-0.39462,1.406294,-0.815467,-2.178093
8351,60416207185,0,3,1,6,11,26,50,7,0.308711,1.167343,-0.778663,0.888095
14749,60416207185,0,3,1,9,10,2,11,3,0.120127,1.552227,-0.823285,0.122275
21577,60416207185,0,2,1,13,23,55,42,7,-0.407678,1.601177,-0.832087,0.935319


Transformer Class

In [None]:
!pip install torch



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

class MultiHeadAttention(nn.Module):
  def __init__(self, d_model, num_heads, dropout=0.1):
    # the model dimension d_model must be divisible by num_heads
    super(MultiHeadAttention, self).__init__()
    self.d_model = d_model
    self.num_heads = num_heads
    self.d_k = d_model // num_heads # (d_model * num_features // num_heads)

    self.W_q = nn.Linear(d_model, d_model) # Query transformation
    self.W_k = nn.Linear(d_model, d_model) # Key transformation
    self.W_v = nn.Linear(d_model, d_model) # Value transformation
    self.W_o = nn.Linear(d_model, d_model) # Output transformation

    self.dropout = nn.Dropout(dropout)

  def scaled_dot_product_attention(self, Q, K, V, mask=None):
    # Calculate attention scores
    attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k) # (batch_size, num_heads, seq_length, seq_length)
    if mask is not None:
      attn_scores = attn_scores.masked_fill(mask == 0, -1e9) # (batch_size, num_heads, seq_length, seq_length)
    attn_probs = torch.softmax(attn_scores, dim=-1) # (batch_size, num_heads, seq_length, seq_length)
    attn_probs = self.dropout(attn_probs)
    output = torch.matmul(attn_probs, V) # (batch_size, num_heads, seq_length, d_k)
    return output

  def split_heads(self, x):
    # Reshape the input to have num_heads for multi-head attention
    batch_size, seq_length, d_model = x.size()
    return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

  def combine_heads(self, x):
    # Combine the multiple heads back to original shape
    batch_size, _, seq_length, d_k = x.size()
    return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

  def forward(self, Q, K, V, mask=None):
    Q = self.split_heads(self.W_q(Q)) # (batch_size, num_heads, seq_length, d_k)
    K = self.split_heads(self.W_k(K))
    V = self.split_heads(self.W_v(V))

    attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

    output = self.W_o(self.combine_heads(attn_output)) # (batch_size, seq_length, d_model*num_features)
    return output

In [None]:
class PositionWiseFeedForward(nn.Module):
  def __init__(self, d_model, d_ff, dropout=0.1):
    super(PositionWiseFeedForward, self).__init__()
    self.fc1 = nn.Linear(d_model, d_ff)
    self.fc2 = nn.Linear(d_ff, d_model)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    return self.fc2(self.dropout(self.relu(self.fc1(x))))

In [None]:
class PositionalEncoding(nn.Module):
  def __init__(self, max_seq_length, d_model):
    # could maybe include time features in positional encoding
    super(PositionalEncoding, self).__init__()

    pe = torch.zeros(max_seq_length, d_model) # tensor filled with 0's that will be populated with positional encodings
    position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)

    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, x):
    return x + self.pe[:, :x.size(1)]

In [None]:
class DecoderLayer(nn.Module):
  def __init__(self, d_model, num_heads, d_ff, dropout):
    super(DecoderLayer, self).__init__()
    self.self_attn = MultiHeadAttention(d_model, num_heads)
    self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)

  def forward(self, x, tgt_mask):
    attn_output = self.self_attn(x, x, x, tgt_mask)
    x = x + self.dropout1(attn_output)
    x = self.norm1(x)

    ff_output = self.feed_forward(x)
    x = x + self.dropout2(ff_output)
    x = self.norm2(x)

    return x

In [None]:
class Transformer(nn.Module):
  def __init__(self, categorical, numeric, num_layers, max_sequence_length, d_model=64, field_depth=1, num_heads=4, dim_feedforward=128, dropout=0.1):
    # d_model is the dimensionality of the input
    # categorical is the tuple of sizes of each categorical field
    # numeric is the number of numeric fields
    super(Transformer, self).__init__()
    self.categorical = categorical
    self.numeric = numeric

    self.num_categories = len(categorical)
    self.num_numeric = numeric
    self.total_fields = self.num_categories + self.num_numeric

    self.categorical_embedding = nn.ModuleList([nn.Embedding(c, d_model, padding_idx=0) for c in categorical])
    self.numeric_embedding = nn.ModuleList([nn.Linear(1, d_model) for _ in range(numeric)])
    self.positional_encoding = PositionalEncoding(max_sequence_length, d_model*self.total_fields) # (batch_size, seq_length, num_features * d_model)

    self.decoder_layers = nn.ModuleList([DecoderLayer(d_model*self.total_fields, num_heads, dim_feedforward, dropout) for _ in range(num_layers)])

    self.cat_output_heads = nn.ModuleList([nn.Linear(d_model*self.total_fields, c) for c in categorical])
    self.num_output_heads = nn.ModuleList([nn.Linear(d_model*self.total_fields, 1) for _ in range(numeric)])
    self.embed_pred = nn.Linear(d_model * self.total_fields, d_model * self.total_fields)

    self.dropout = nn.Dropout(dropout)

  def generate_mask(self, tgt):
    # returning nopeak mask instead of src, tgt, commented out unsqueeze
    tgt_mask = (tgt[:, 1] != 0).unsqueeze(0).unsqueeze(1).unsqueeze(3)
    seq_length = tgt.size(0)
    nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
    return nopeak_mask

  def forward(self, tgt):
    # categorical: Tensor of category indices
    # numeric: tensor of numeric values
    # return predicted embedding of next transaction for each categorical field

    tgt_mask = self.generate_mask(tgt)

    tgt_categorical = tgt[:, :self.num_categories].long()
    tgt_numeric = tgt[:, self.num_categories:self.num_categories + self.num_numeric + 1].float()

    tgt_categorical_embedded = torch.cat([self.categorical_embedding[i](tgt_categorical[:, i]) for i in range(self.num_categories)], dim=1) # (seq_length, num_categories*d_model)
    tgt_numeric_embedded = torch.cat([self.numeric_embedding[i](tgt_numeric[:, i].unsqueeze(-1)) for i in range(self.num_numeric)], dim=1) # (seq_length, num_numeric*d_model)

    tgt_embedded = self.dropout(self.positional_encoding(torch.cat([tgt_categorical_embedded, tgt_numeric_embedded], dim=1).unsqueeze(0))) # (batch_size, seq_length, num_features * d_model)
    seq_len = tgt.size(0)

    dec_output = tgt_embedded # (batch_size, seq_length, num_features * d_model)
    for dec_layer in self.decoder_layers:
      dec_output = dec_layer(dec_output, tgt_mask)

    # Use last transaction's output to predict the next one
    last_output = dec_output # (batch_size, seq_length, num_features * d_model)
    pred_next_emb = self.embed_pred(last_output) # (batch_size, seq_length, num_features * d_model)

    cat_preds = [head(last_output) for head in self.cat_output_heads]       # logits for each categorical field
    num_preds = [head(last_output) for head in self.num_output_heads]  # numeric values
    return pred_next_emb, cat_preds, num_preds

In [None]:
# Get training, validation, and testing data

# Pad end with 0's
train_num_trans_per_user = []
for user in train_data['cc_num'].unique():
  train_num_trans_per_user.append(train_data[train_data['cc_num'] == user].shape[0])

train_max_len = max(train_num_trans_per_user)

val_num_trans_per_user = []
for user in val_data['cc_num'].unique():
  val_num_trans_per_user.append(val_data[val_data['cc_num'] == user].shape[0])

val_max_len = max(val_num_trans_per_user)

test_num_trans_per_user = []
for user in test_data['cc_num'].unique():
  test_num_trans_per_user.append(test_data[test_data['cc_num'] == user].shape[0])

test_max_len = max(test_num_trans_per_user)

max_len = max(train_max_len, val_max_len, test_max_len)

# Get training data

train_data_by_user = []
for user in train_data['cc_num'].unique():
  if train_data[train_data['cc_num'] == user].shape[0] > 1:
    user_data = train_data[train_data['cc_num'] == user].drop(['cc_num', 'is_fraud'], axis=1)
    user_data = np.append(user_data, np.zeros((max_len - user_data.shape[0], user_data.shape[1])), axis=0)
    train_data_by_user.append(torch.from_numpy(user_data))
train_data_by_user_tensor = torch.stack([user for user in train_data_by_user])

train_tgt_data = train_data_by_user_tensor

# Get validation data

val_data_by_user = []
for user in val_data['cc_num'].unique():
  if val_data[val_data['cc_num'] == user].shape[0] > 1:
    user_data = val_data[val_data['cc_num'] == user].drop(['cc_num', 'is_fraud'], axis=1)
    user_data = np.append(user_data, np.zeros((max_len - user_data.shape[0], user_data.shape[1])), axis=0)
    val_data_by_user.append(torch.from_numpy(user_data))
val_data_by_user_tensor = torch.stack([user for user in val_data_by_user])

val_tgt_data = val_data_by_user_tensor

# Get testing data

test_data_by_user = []
for user in test_data['cc_num'].unique():
  if test_data[test_data['cc_num'] == user].shape[0] > 1:
    test_user_data = test_data[test_data['cc_num'] == user].drop(['cc_num', 'is_fraud'], axis=1)
    test_user_data = np.append(test_user_data, np.zeros((max_len - test_user_data.shape[0], test_user_data.shape[1])), axis=0)
    test_data_by_user.append(torch.from_numpy(test_user_data))
test_data_by_user_tensor = torch.stack([user for user in test_data_by_user])

test_tgt_data = test_data_by_user_tensor

In [None]:
print(train_tgt_data.shape)
print(val_tgt_data.shape)
print(test_tgt_data.shape)

torch.Size([12, 1196, 11])
torch.Size([12, 1196, 11])
torch.Size([19, 1196, 11])


In [None]:
# Hyperparameter tuning
import optuna

# Define objective, then define trial where parameters can vary, then study.optimize

def get_best_params(train_data, val_data):
  trial_data = []

  def objective(trial):
    # Define parameters to optimize
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-6, 1e-4, log=True),
        'dropout': trial.suggest_float('dropout', 0.1, 0.5),
        'num_layers': trial.suggest_int('num_layers', 1, 10),
        'd_model': trial.suggest_int('d_model', 16, 32),
        'num_heads': trial.suggest_int('num_heads', 1, 3),
    }

    # Train model
    np.random.seed(42)

    transformer = Transformer(categorical, numerical, params['num_layers'], max_len+1, params['d_model'] * params['num_heads'] * 2, 1, params['num_heads'] * 2, 128, params['dropout']) # parameters in the custom format for my transformer class
    criterion_cat = nn.CrossEntropyLoss()
    criterion_num = nn.MSELoss()
    optimizer = optim.Adam(transformer.parameters(), lr=params['learning_rate'])

    transformer.train()

    total_loss = 0
    for epoch in range(5): # increase number of epochs
      for user in range(len(train_data)): # increase number of users
        optimizer.zero_grad()

        train_input = train_tgt_data[user, :-1, :]  # (seq_len - 1, num_features)

        train_target = train_tgt_data[user, 1:, :]  # (seq_len - 1, num_features)

        _, preds_cat, preds_num = transformer(train_input)

        train_tgt_cat = train_target[:, :len(categorical)].long()  # (seq_len - 1, num_categorical)
        train_tgt_num = train_target[:, len(categorical):len(categorical) + numerical].float()  # (seq_len - 1, num_numeric)

        # Loss calculations
        cat_loss = sum(
            criterion_cat(pred.squeeze(0)[: train_num_trans_per_user[user]-1], train_tgt_cat[:, i][1: train_num_trans_per_user[user]].to(pred.device))
            for i, pred in enumerate(preds_cat)
        ) / len(preds_cat)

        num_loss = sum(
            criterion_num(pred.squeeze(0).squeeze(1)[: train_num_trans_per_user[user]-1], train_tgt_num[:, i][1: train_num_trans_per_user[user]])
            for i, pred in enumerate(preds_num)
        ) / len(preds_num)

        loss = cat_loss * 0.85 + num_loss * 0.15
        loss.backward()
        optimizer.step()

    # Validation set
    transformer.eval()
    total_val_loss = 0
    with torch.no_grad():
      for epoch in range(5):
        for user in range(len(val_data)):
          val_input = val_tgt_data[user, :-1, :]
          val_target = val_tgt_data[user, 1:, :]

          _, preds_cat, preds_num = transformer(val_input)

          tgt_cat = val_target[:, :len(categorical)].long()
          tgt_num = val_target[:, len(categorical):len(categorical) + numerical].float()

          cat_loss = sum(
              criterion_cat(pred.squeeze(0)[: val_num_trans_per_user[user]-1],
                            tgt_cat[:, i][1: val_num_trans_per_user[user]].to(pred.device))
              for i, pred in enumerate(preds_cat)
          ) / len(preds_cat)

          num_loss = sum(
              criterion_num(pred.squeeze(0).squeeze(1)[: val_num_trans_per_user[user]-1],
                            tgt_num[:, i][1: val_num_trans_per_user[user]])
              for i, pred in enumerate(preds_num)
          ) / len(preds_num)

          val_loss = cat_loss * 0.85 + num_loss * 0.15
          total_val_loss += val_loss.item()
    avg_val_loss = total_val_loss / len(val_data)
    return avg_val_loss

  study = optuna.create_study(direction='minimize')
  study.optimize(objective, n_trials=50, show_progress_bar=True)

  return study.best_params


In [None]:
categorical = (15, 13, 32, 25, 61, 61, 8) # (category, Month, Day, Hour, Minute, Second, dayOfWeek)
numerical = 4 # (amt, merch_lat, merch_long, LatLong_Dist, hour_amt, category_amt)
num_layers = 6

best_params = get_best_params(train_data_by_user, val_data_by_user)
print(best_params)

[I 2025-06-26 03:55:18,710] A new study created in memory with name: no-name-de38f04a-cc3c-467e-a2ee-e30ae4d9782a


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2025-06-26 03:56:22,502] Trial 0 finished with value: 2.9311333894729614 and parameters: {'learning_rate': 4.7869417249726674e-06, 'dropout': 0.40108394699124794, 'num_layers': 5}. Best is trial 0 with value: 2.9311333894729614.
[I 2025-06-26 03:57:58,615] Trial 1 finished with value: 2.779240608215332 and parameters: {'learning_rate': 5.421429472683481e-05, 'dropout': 0.17738259429868727, 'num_layers': 10}. Best is trial 1 with value: 2.779240608215332.
[I 2025-06-26 03:58:45,472] Trial 2 finished with value: 2.8234022855758667 and parameters: {'learning_rate': 6.585312189797427e-05, 'dropout': 0.2623716056541126, 'num_layers': 5}. Best is trial 1 with value: 2.779240608215332.
[I 2025-06-26 03:59:30,042] Trial 3 finished with value: 2.7875622510910034 and parameters: {'learning_rate': 3.8137396270307906e-05, 'dropout': 0.1955860629048167, 'num_layers': 5}. Best is trial 1 with value: 2.779240608215332.
[I 2025-06-26 04:01:02,012] Trial 4 finished with value: 2.8191639184951782 and

In [None]:
import statistics

# Define transformer model

transformer = Transformer(categorical, numerical, best_params['num_layers'], max_len+1, best_params['d_model'] * best_params['num_heads'] * 2, 1, best_params['num_heads'] * 2, 128, best_params['dropout']) # parameters in the custom format for my transformer class

criterion_cat = nn.CrossEntropyLoss()
criterion_num = nn.MSELoss()
optimizer = optim.Adam(transformer.parameters(), lr=0.0001)

transformer.train()

train_preds_cat = []
train_preds_num = []

for epoch in range(200):
  loss_avg, cat_loss_avg, num_loss_avg = 0, 0, 0
  for user in range(len(train_data_by_user)):
    optimizer.zero_grad()

    train_input = train_tgt_data[user, :-1, :]  # (seq_len - 1, num_features)

    train_target = train_tgt_data[user, 1:, :]  # (seq_len - 1, num_features)

    _, preds_cat, preds_num = transformer(train_input)

    train_tgt_cat = train_target[:, :len(categorical)].long()  # (seq_len - 1, num_categorical)
    train_tgt_num = train_target[:, len(categorical):len(categorical) + numerical].float()  # (seq_len - 1, num_numeric)

    # Loss calculations
    cat_loss = sum(
        criterion_cat(pred.squeeze(0)[: train_num_trans_per_user[user]-1], train_tgt_cat[:, i][1: train_num_trans_per_user[user]].to(pred.device))
        for i, pred in enumerate(preds_cat)
    ) / len(preds_cat)

    num_loss = sum(
        criterion_num(pred.squeeze(0).squeeze(1)[: train_num_trans_per_user[user]-1], train_tgt_num[:, i][1: train_num_trans_per_user[user]])
        for i, pred in enumerate(preds_num)
    ) / len(preds_num)

    loss = cat_loss * 0.85 + num_loss * 0.15
    loss.backward()
    optimizer.step()
    loss_avg += loss.item()
    cat_loss_avg += cat_loss.item()
    num_loss_avg += num_loss.item()

    if epoch % 10 == 0:
      torch.save(transformer.state_dict(), '/content/drive/MyDrive/transformer_model_progress.pth')
  print('Epoch', epoch+1, 'Loss', loss_avg / len(train_data_by_user), 'Cat loss', cat_loss_avg / len(train_data_by_user), 'Num loss', num_loss_avg / len(train_data_by_user))

Epoch 1 Loss 3.1493142445882163 Cat loss 3.230862617492676 Num loss 2.687206486860911
Epoch 2 Loss 2.853737771511078 Cat loss 3.14112655321757 Num loss 1.2252010305722554
Epoch 3 Loss 2.8391650120417276 Cat loss 3.134147266546885 Num loss 1.1675981680552165
Epoch 4 Loss 2.81198787689209 Cat loss 3.1283488074938455 Num loss 1.0192754417657852
Epoch 5 Loss 2.794614017009735 Cat loss 3.129445433616638 Num loss 0.8972354084253311
Epoch 6 Loss 2.782039523124695 Cat loss 3.127896249294281 Num loss 0.8221841802199682
Epoch 7 Loss 2.7890386382738748 Cat loss 3.121378481388092 Num loss 0.9057790612181028
Epoch 8 Loss 2.7715552051862082 Cat loss 3.117199202378591 Num loss 0.812905435760816
Epoch 9 Loss 2.7804433703422546 Cat loss 3.117781857649485 Num loss 0.8688584268093109
Epoch 10 Loss 2.764886995156606 Cat loss 3.1155548691749573 Num loss 0.777768594523271
Epoch 11 Loss 2.773742993672689 Cat loss 3.1102458238601685 Num loss 0.8668930754065514
Epoch 12 Loss 2.790155510107676 Cat loss 3.103774

In [None]:
from sklearn.metrics import accuracy_score, mean_squared_error
import torch.nn.functional as F

# Get true values
train_tgt_cat = train_tgt_data[-1, :, :][:, :7].long()
train_tgt_num = train_tgt_data[-1, :, :][:, 7:12].float()

all_cat_preds, all_cat_labels = [[] for _ in categorical], [[] for _ in categorical]
all_num_preds, all_num_labels = [[] for _ in range(5)], [[] for _ in range(4)]

# Categorical predictions
for i, pred in enumerate(preds_cat):
  pred_labels = F.softmax(pred.squeeze(0), dim=1).argmax(dim=1).cpu().tolist()
  true_labels = train_tgt_cat[:, i].cpu().tolist()
  all_cat_preds[i].extend(pred_labels)
  all_cat_labels[i].extend(true_labels)

# Numeric predictions
for i, pred in enumerate(preds_num):
  all_num_preds[i].extend(pred.squeeze(0).squeeze(1).cpu().tolist())
  all_num_labels[i].extend(train_tgt_num[:, i].cpu().tolist())

cat_accuracies = [accuracy_score(true[:-1][1: train_num_trans_per_user[-1]], pred[:-1][: train_num_trans_per_user[-1]-1]) for true, pred in zip(all_cat_labels, all_cat_preds)]

num_rmse = [mean_squared_error(true[:-1][1: train_num_trans_per_user[-1]], pred[:-1][: train_num_trans_per_user[-1]-1]) for true, pred in zip(all_num_labels, all_num_preds)]

categories = ['Category', 'Month', 'Day', 'Hour', 'Minute', 'Second', 'dayOfWeek']
for i in range(7):
  print(categories[i], 'accuracy:', str(cat_accuracies[i] * 100) + '%')

nums = ['Amount', 'Merch Lat', 'Merch Long', 'LatLong_Dist']
for i in range(4):
  print(nums[i], 'MSE:', str(num_rmse[i]))

Category accuracy: 17.857142857142858%
Month accuracy: 60.71428571428571%
Day accuracy: 3.571428571428571%
Hour accuracy: 3.571428571428571%
Minute accuracy: 0.0%
Second accuracy: 0.0%
dayOfWeek accuracy: 14.285714285714285%
Amount MSE: 0.10647072415194607
Merch Lat MSE: 0.11120829324283296
Merch Long MSE: 0.07143056032701812
LatLong_Dist MSE: 2.1030707051071924


In [None]:
import statistics
print('Categorical predictions')
for i, pred in enumerate(preds_cat):
  print('PRED', F.softmax(pred.squeeze(0), dim=1)[: train_num_trans_per_user[-1]-1].argmax(dim=1).cpu().tolist())
  print('TRUE', train_tgt_cat[:, i][1: train_num_trans_per_user[-1]].cpu().tolist())
  print(" ")
print('Numerical predictions')
for i, pred in enumerate(preds_num):
  print('PRED', pred.squeeze(0)[: train_num_trans_per_user[-1]-1].squeeze(1).cpu().tolist())
  print('TRUE', train_tgt_num[:, i][1: train_num_trans_per_user[-1]].cpu().tolist())
  print(" ")

Categorical predictions
PRED [10, 8, 8, 7, 11, 5, 5, 6, 8, 3, 11, 14, 11, 7, 2, 2, 3, 6, 14, 1, 8, 7, 9, 13, 13, 3, 3, 0]
TRUE [2, 10, 8, 8, 7, 11, 5, 5, 6, 8, 3, 11, 14, 11, 7, 2, 2, 3, 6, 14, 1, 8, 7, 9, 13, 13, 3, 3]
 
PRED [4, 4, 5, 5, 5, 5, 6, 6, 6, 8, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 12, 2, 2, 2, 4, 0]
TRUE [3, 4, 4, 5, 5, 5, 5, 6, 6, 6, 8, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 12, 2, 2, 2, 4]
 
PRED [13, 20, 12, 14, 18, 23, 13, 18, 23, 26, 27, 28, 2, 8, 14, 14, 21, 22, 25, 7, 9, 19, 23, 7, 13, 27, 28, 0]
TRUE [28, 13, 20, 12, 14, 18, 23, 13, 18, 23, 26, 27, 28, 2, 8, 14, 14, 21, 22, 25, 7, 9, 19, 23, 7, 13, 27, 28]
 
PRED [13, 19, 17, 5, 13, 20, 6, 11, 24, 14, 19, 21, 23, 7, 1, 3, 15, 10, 23, 12, 22, 7, 1, 3, 12, 24, 24, 0]
TRUE [7, 13, 19, 17, 5, 13, 20, 6, 11, 24, 14, 19, 21, 23, 7, 1, 3, 15, 10, 23, 12, 22, 7, 1, 3, 12, 24, 24]
 
PRED [13, 5, 18, 9, 35, 11, 18, 56, 10, 46, 59, 52, 46, 16, 20, 4, 26, 30, 23, 33, 22, 28, 25, 24, 13, 11, 54, 0]
TRUE [29, 13,

### Evaluate Model on Test Data

In [None]:
test_preds_cat = [] # List of 25 lists of probabilities for each of the 7 categorical features (1, 2048, 15), (1, 2048, 13), etc for all the transactions of a single user
test_preds_num = [] # List of 25 lists of predictions for each of the 4 numeric features (1, 2048, 1) for all the transactions of a single user
transformer.eval()
with torch.no_grad():
  for user in range(len(test_tgt_data)):
    pred_next_emb, preds_cat, preds_num = transformer(test_tgt_data[user, :-1, :])
    test_preds_cat.append(preds_cat)
    test_preds_num.append(preds_num)

In [None]:
import torch.nn.functional as F

preds_cat_tensor = []
for user in range(len(test_preds_cat)): # want to make a tensor with 15 columns and ~25 * 2048 rows for the categorical probabilities for all users
  preds_for_user = test_preds_cat[user]
  preds_for_user_concat = torch.cat([F.softmax(pred, dim=2).argmax(dim=2).cpu() for pred in preds_for_user], dim=0) # (2048 rows, 4 columns)
  preds_for_user_concat = torch.transpose(preds_for_user_concat, 0, 1)[:test_num_trans_per_user[user], :]
  preds_cat_tensor.append(preds_for_user_concat)

preds_cat_tensor = torch.concat(preds_cat_tensor, dim=0)
print(preds_cat_tensor.shape)

torch.Size([9999, 7])


In [None]:
preds_num_tensor = []
for user in range(len(test_preds_num)): # want to make a tensor with 4 columns and ~25 * 2048 rows for the numerical predictions for all users
  preds_for_user = test_preds_num[user]
  preds_for_user_concat = torch.cat([pred.squeeze(0)[:test_num_trans_per_user[user], :] for pred in preds_for_user], dim=1) # (2048 rows, 4 columns)
  preds_num_tensor.append(preds_for_user_concat)

preds_num_tensor = torch.concat(preds_num_tensor, dim=0)
print(preds_num_tensor.shape)

torch.Size([9999, 4])


In [None]:
preds_tensor = torch.cat([preds_cat_tensor, preds_num_tensor], dim=1)
print(preds_tensor.shape)

torch.Size([9999, 11])


In [None]:
preds_df = pd.DataFrame(preds_tensor.numpy(), columns=test_data.columns.drop(['cc_num', 'is_fraud']))

preds_df['is_fraud'] = test_data.reset_index(drop=True)['is_fraud']
preds_df['cc_num'] = test_data.reset_index(drop=True)['cc_num']

In [None]:
preds_df.head()

Unnamed: 0,category,Month,Day,Hour,Minute,Second,dayOfWeek,amt,merch_lat,merch_long,LatLong_Dist,is_fraud,cc_num
0,13.0,1.0,18.0,12.0,56.0,20.0,6.0,-0.362873,0.612886,-0.281003,-0.781281,0,60416207185
1,13.0,1.0,10.0,12.0,56.0,20.0,4.0,-0.493947,0.819552,-0.304294,-0.062741,0,60416207185
2,2.0,1.0,24.0,12.0,36.0,15.0,4.0,-0.280545,0.938696,-0.469574,0.285192,0,60416207185
3,2.0,1.0,24.0,3.0,36.0,15.0,2.0,-0.294404,1.074827,-0.523021,0.777782,0,60416207185
4,2.0,1.0,24.0,3.0,36.0,15.0,4.0,0.044283,1.059027,-0.566687,0.652886,0,60416207185


In [None]:
torch.save(transformer.state_dict(), '/content/drive/MyDrive/transformer_model.pth')

In [None]:
preds_df.to_csv('/content/drive/MyDrive/preds_df_transformer.csv', index=False)