# Joshua Placidi - Free Agent

## Imports and Data loading

In [1]:
%reload_ext lab_black
import pandas as pd

pd.options.mode.chained_assignment = None
import altair as alt
from tqdm.notebook import tqdm

In [2]:
# Load the transaction features and their labels from CSV
features_df = pd.read_csv("./bank_transactions_dataset/bank_transaction_features.csv")
labels_df = pd.read_csv("./bank_transactions_dataset/bank_transaction_labels.csv")

# Join features to labels on the shared transaction id, then preview the result
combined_df = features_df.merge(labels_df, on="bank_transaction_id")
combined_df.head()

Unnamed: 0,bank_transaction_id,bank_transaction_description,bank_transaction_amount,bank_transaction_type,bank_transaction_category,bank_transaction_dataset
0,21786195,citylink,-13.8,MPO,TRAVEL,TRAIN
1,21786196,citylink,-13.14,DEB,TRAVEL,TRAIN
2,21786197,1Jul19 OYSTER,-36.98,DEB,TRAVEL,TRAIN
3,21786198,travelodge,-75.73,MPO,TRAVEL,TRAIN
4,21786199,6Jul19 RINGGO,-37.86,CSH,TRAVEL,TRAIN


## Part 1: Visualising Data

In [3]:
# Print the (rows, columns) shape of the two source frames and the merged frame
print("- features_df shape:", features_df.shape)
print("- labels_df shape:", labels_df.shape)
print("- combined_df shape:", combined_df.shape)
print("\n- combined_df info:")
# info() prints the per-column non-null counts and dtypes of the merged frame
combined_df.info()

- features_df shape: (12500, 4)
- labels_df shape: (12500, 3)
- combined_df shape: (12500, 6)

- combined_df info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12500 entries, 0 to 12499
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bank_transaction_id           12500 non-null  int64  
 1   bank_transaction_description  12369 non-null  object 
 2   bank_transaction_amount       12500 non-null  float64
 3   bank_transaction_type         12500 non-null  object 
 4   bank_transaction_category     12500 non-null  object 
 5   bank_transaction_dataset      12500 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 683.6+ KB


In [4]:
# Replace null values in transaction description with empty string.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: the in-place form mutates an
# intermediate object and is not guaranteed to update combined_df (pandas warns
# about this pattern, and with Copy-on-Write it leaves the frame unchanged).
combined_df["bank_transaction_description"] = combined_df[
    "bank_transaction_description"
].fillna("")

- The bank_transaction_description column has 131 null values. Because this column holds free-text descriptions of the transactions, I fill these null cells with empty strings ("").

In [5]:
# Generate a bar chart of the count distribution of transaction types
def type_dist_graph(df):
    """Build a horizontal bar chart of transaction-type counts.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transaction type, with the type label in an "index"
        column and its count in a "bank_transaction_type" column.

    Returns
    -------
    altair.Chart
        Bar chart with counts on the x axis and type labels on the y axis.
    """
    return (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X("bank_transaction_type", title="Count"),
            y=alt.Y("index", title="Transaction Type"),
            # Colour by the type label. The "bank_transaction_type" column
            # holds the counts here, so using it as a nominal colour would
            # give two types with equal counts the same colour.
            color=alt.Color("index:N", legend=None),
        )
    )


# Generate a bar chart of the count distribution of transaction categories
def cat_dist_graph(df):
    """Build a horizontal bar chart of transaction-category counts.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transaction category, with the category label in an
        "index" column and its count in a "bank_transaction_category" column.

    Returns
    -------
    altair.Chart
        Bar chart with counts on the x axis and category labels on the y axis.
    """
    return (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X("bank_transaction_category", title="Count"),
            y=alt.Y("index", title="Transaction Category"),
            # Colour by the category label. The "bank_transaction_category"
            # column holds the counts here, so using it as a nominal colour
            # would give two categories with equal counts the same colour.
            color=alt.Color("index:N", legend=None),
        )
    )


# Build the per-value count table consumed by the chart helpers
def _value_counts_frame(df, column):
    """Return a frame with the distinct values of `column` in an "index"
    column and their counts in a column named `column`.

    The column names are set explicitly because the names produced by
    value_counts().to_frame().reset_index() differ between pandas versions
    ("index"/<column> before 2.0, <column>/"count" from 2.0 onwards).
    """
    return df[column].value_counts().rename_axis("index").reset_index(name=column)


# Given a df return graphs for the transaction type and category distributions
def show_graphs(df):
    """Return the type and category count charts placed side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "bank_transaction_type" and "bank_transaction_category"
        columns.

    Returns
    -------
    The result of combining type_dist_graph(...) and cat_dist_graph(...)
    with the `|` operator.
    """
    df_type = _value_counts_frame(df, "bank_transaction_type")
    df_cat = _value_counts_frame(df, "bank_transaction_category")
    return type_dist_graph(df_type) | cat_dist_graph(df_cat)

In [6]:
# Partition the merged frame by its dataset flag and report the split sizes
train_df = combined_df.loc[combined_df["bank_transaction_dataset"] == "TRAIN"]
val_df = combined_df.loc[combined_df["bank_transaction_dataset"] == "VAL"]
print("train samples count:", train_df.shape[0])
print("val samples count", val_df.shape[0])

train samples count: 10000
val samples count 2500


- Graphically visualise dataset distributions

In [7]:
# Type and category count charts for the full (train + val) frame
print("Combined data distributions ---")
show_graphs(combined_df)

Combined data distributions ---


In [8]:
# Type and category count charts for the training rows only
print("Train distributions ---")
show_graphs(train_df)

Train distributions ---


In [9]:
# Type and category count charts for the validation rows only
print("Val distributions ---")
show_graphs(val_df)

Val distributions ---


- There is class imbalance in the dataset, with many more training examples for *ACCOMMODATION_AND_MEALS*, *TRAVEL* and *BANK_OR_FINANCE_CHARGES* than for *MOTOR_EXPENSES* and *INSURANCE*
- The distributions are fairly similar between the train and validation sets

In [10]:
# Compare the mean transaction amount of the full, train and val frames
print("combined_df transaction amount mean:", combined_df.bank_transaction_amount.mean())
print("train_df transaction amount mean:", train_df.bank_transaction_amount.mean())
print("val_df transaction amount mean:", val_df.bank_transaction_amount.mean())

combined_df transaction amount mean: -19.613016800000057
train_df transaction amount mean: -20.073671000000086
val_df transaction amount mean: -17.770399999999967


- There is a difference of about 2.3 in the mean transaction amount between the train and val data; this is larger than I would expect from a random split of a dataset this size

## Part 2: Classifier

I have been using BERT in a project recently to extract semantic meaning from words. The task of trying to predict categories from natural sentences immediately appeared to be an ideal task to utilise the BERT model. I have used the pretrained sequence classification BERT model and then fine tuned it for 5 epochs on the bank transaction training data. I believe the model shows good results but could definitely be improved (as detailed in the final cell of this notebook).

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from transformers import AdamW, get_linear_schedule_with_warmup

from transformers import BertForSequenceClassification
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
batch_size = 192

- Defining a Transaction_Dataset to store and load samples from

In [12]:
# NOTE: this cell overwrites both columns of combined_df in place, so running it
# a second time factorizes the integer codes and true_cats no longer holds the
# original category names.

# Encode transaction types as integer codes (the original type labels are not kept)
combined_df["bank_transaction_type"] = pd.factorize(
    combined_df["bank_transaction_type"]
)[0]
# Encode categories as integer codes; true_cats keeps the category labels so that
# true_cats[code] gives back the label, i.e. true_cats[0] = 'TRAVEL'
combined_df["bank_transaction_category"], true_cats = pd.factorize(
    combined_df["bank_transaction_category"]
)

# Split the encoded data into train and val
train_df = combined_df.loc[combined_df["bank_transaction_dataset"] == "TRAIN"]
val_df = combined_df.loc[combined_df["bank_transaction_dataset"] == "VAL"]


# Define class to store and load transaction data
class Transaction_Dataset(Dataset):
    """Dataset of transaction descriptions paired with their category codes.

    Each item is a (tokens, label) pair: the description tokenized by
    get_bert_tokens and the category code as a tensor, both moved to `device`.
    """

    def __init__(self, transactions_df):
        """Store the description and category of every row of transactions_df.

        transactions_df must have "bank_transaction_description" and
        "bank_transaction_category" columns.
        """
        # Iterate the two needed columns directly instead of building a
        # Series per row with iterrows().
        descriptions = transactions_df["bank_transaction_description"]
        categories = transactions_df["bank_transaction_category"]
        self.transactions = [
            {"desc": desc, "cat": cat} for desc, cat in zip(descriptions, categories)
        ]

    def __len__(self):
        """Number of stored transactions."""
        return len(self.transactions)

    def __getitem__(self, idx):
        """Return (token tensor, label tensor) for the transaction at idx."""
        t = self.transactions[idx]

        # Tokenization happens lazily, each time an item is requested
        bert_tokens = get_bert_tokens(t["desc"])
        label = torch.tensor(int(t["cat"]))

        return bert_tokens.to(device), label.to(device)


# Function takes a string and returns the generated bert tokens, padded/truncated to max_length (10 by default)
def get_bert_tokens(desc, max_length=10):
    """Tokenize a description into a fixed-length tensor of BERT token ids.

    Parameters
    ----------
    desc : str
        Transaction description to encode.
    max_length : int, optional
        Length the encoded sequence is padded or truncated to (default 10).

    Returns
    -------
    torch.Tensor
        1-D tensor built from the ids returned by tokenizer.encode.
    """
    token_ids = tokenizer.encode(
        desc, max_length=max_length, padding="max_length", truncation="longest_first"
    )
    return torch.tensor(token_ids)

- Get dataloaders

In [13]:
train_data = Transaction_Dataset(train_df)
val_data = Transaction_Dataset(val_df)

# Training batches are reshuffled every epoch
train_loader = DataLoader(
    train_data, batch_size=batch_size, shuffle=True, num_workers=0
)
# Validation keeps a fixed order. evaluate() averages the loss per batch, so with
# shuffling the samples that land in the smaller final batch (and therefore the
# reported validation loss) would change from one evaluation to the next.
val_loader = DataLoader(
    val_data, batch_size=batch_size, shuffle=False, num_workers=0
)

print("Train batch count =", len(train_loader))
print("Val batch count =", len(val_loader))

Train batch count = 53
Val batch count = 14


- Define some helper functions used to evaluate the model

In [14]:
from sklearn.metrics import f1_score

# Get f1 score given predictions and their true values
def get_f1_score(preds, labels):
    """Weighted F1 score of the arg-max class predictions.

    preds holds one row of class scores per sample; labels holds the true
    class codes. The predicted class of a sample is the arg-max of its row.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    return f1_score(true_classes, predicted_classes, average="weighted")


# Get model accuracy and accuracy per class (returned as dictionary)
def accuracy_per_class(preds, labels, categories=None):
    """Compute overall accuracy and per-class [correct, total] counts.

    Parameters
    ----------
    preds : array-like
        One row of class scores per sample; the predicted class of a sample
        is the arg-max of its row.
    labels : array-like of int
        True class code of each sample.
    categories : sequence, optional
        Category labels indexed by class code. Defaults to the notebook-level
        true_cats.

    Returns
    -------
    (float, dict)
        Overall accuracy as a percentage rounded to 2 decimals (0.0 when
        there are no samples), and a dict mapping each category label to
        [correct_count, total_count].
    """
    if categories is None:
        categories = true_cats

    acc_per_class = {cat: [0, 0] for cat in categories}

    n_samples = len(preds)
    if n_samples == 0:
        # No samples to score: avoid dividing by zero below
        return 0.0, acc_per_class

    predicted = np.argmax(preds, axis=1)
    correct = 0
    for pred, label in zip(predicted, labels):
        counts = acc_per_class[categories[int(label)]]
        counts[1] += 1
        if pred == label:
            correct += 1
            counts[0] += 1

    return round((correct / n_samples) * 100, 2), acc_per_class


# Evaluate the models performance
def evaluate(val_loader):
    """Run the notebook-level model over val_loader without gradient tracking.

    Returns a tuple of (mean of the per-batch losses, logits of every sample
    concatenated along axis 0, true labels concatenated along axis 0).
    """
    model.eval()

    running_loss = 0
    batch_logits = []
    batch_labels = []

    # Gradients are not needed while evaluating
    with torch.no_grad():
        for tokens, labels in val_loader:
            outputs = model(input_ids=tokens, labels=labels)

            # Labels are passed in, so the loss is read from outputs[0] and
            # the logits from outputs[1]
            running_loss += outputs[0].item()
            batch_logits.append(outputs[1].detach().cpu().numpy())
            batch_labels.append(labels.cpu().numpy())

    loss_val_avg = running_loss / len(val_loader)

    return (
        loss_val_avg,
        np.concatenate(batch_logits, axis=0),
        np.concatenate(batch_labels, axis=0),
    )

- Get pretrained bert model

In [15]:
# Size the classification head from the label set rather than hard-coding 5, so
# it stays in step with the categories produced by pd.factorize.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(true_cats)
)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

- Training loop

In [16]:
# Fine tune model for 5 epochs
epochs = 5

# Using AdamW and a linear scheduler with warmup, as recommended by authors of similar projects:
#   https://towardsdatascience.com/multi-label-multi-class-text-classification-with-bert-transformer-and-keras-c6355eccb63a
#   https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613
# NOTE(review): this AdamW is the one imported from transformers; newer transformers
# releases may no longer provide it (torch.optim.AdamW is the usual replacement) - confirm
# against the installed version.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

# The schedule spans every optimizer step (batches per epoch * epochs), with no warmup steps
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)

# Typical training loop
for epoch in tqdm(range(1, epochs + 1)):
    print("  - Epoch", epoch)

    # evaluate() switches the model to eval mode at the end of each epoch, so
    # training mode is re-enabled at the start of every epoch
    model.train()
    loss_train_total = 0

    for tokens, labels in train_loader:
        # Clear the gradients left over from the previous batch
        model.zero_grad()

        inputs = {"input_ids": tokens, "labels": labels}
        outputs = model(**inputs)

        # Labels are passed in, so the loss is read from the first output
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # The scheduler is stepped once per batch, matching num_training_steps above
        optimizer.step()
        scheduler.step()

    # Print epoch statistics
    loss_train_avg = loss_train_total / len(train_loader)
    print("Average train loss:", round(loss_train_avg, 2))

    # Validate after every epoch
    val_loss, predictions, true_vals = evaluate(val_loader)
    val_f1 = get_f1_score(predictions, true_vals)
    print("Validation loss:", round(val_loss, 2))
    print("F1 score", round(val_f1, 2))
    print("\n")

  0%|          | 0/5 [00:00<?, ?it/s]

  - Epoch 1
Average train loss: 1.4
Validation loss: 1.1
F1 score 0.59


  - Epoch 2
Average train loss: 0.9
Validation loss: 0.63
F1 score 0.78


  - Epoch 3
Average train loss: 0.6
Validation loss: 0.49
F1 score 0.84


  - Epoch 4
Average train loss: 0.49
Validation loss: 0.42
F1 score 0.85


  - Epoch 5
Average train loss: 0.45
Validation loss: 0.5
F1 score 0.86




 - Evaluate final model performance

In [17]:
# Get val predictions
_, predictions, true_vals = evaluate(val_loader)

# Get model accuracy (acc) and accuracy per class (acc_per_class)
acc, acc_per_class = accuracy_per_class(predictions, true_vals)

# Get f1 scores
f1 = get_f1_score(predictions, true_vals)

# Print metrics
print("Model Accuracy:", acc)
print("F1 score:", f1)
print()

for key, val in acc_per_class.items():
    # val is [correct, total]; a category with no validation samples has
    # total == 0, so report 0.0 for it instead of dividing by zero
    class_acc = round((val[0] / val[1]) * 100, 2) if val[1] else 0.0
    print(" - " + key, class_acc)

Model Accuracy: 85.72
F1 score: 0.8552536952633207

 - TRAVEL 81.12
 - MOTOR_EXPENSES 69.79
 - ACCOMMODATION_AND_MEALS 88.1
 - BANK_OR_FINANCE_CHARGES 96.11
 - INSURANCE 90.8


### Test some sample transaction descriptions for fun

In [18]:
def get_prediction(desc):
    """Predict the category label for a single transaction description.

    Parameters
    ----------
    desc : str
        Free-text transaction description.

    Returns
    -------
    The entry of true_cats at the index of the highest logit.
    """
    # Add a leading batch dimension to the 1-D token tensor, rather than copying
    # it into an uninitialised tensor of hard-coded size (1, 10)
    input_ids = get_bert_tokens(desc).unsqueeze(0).to(device)

    # Inference only: put the model in eval mode and skip gradient tracking
    model.eval()
    with torch.no_grad():
        output = model(input_ids=input_ids)

    # No labels are passed, so the logits are the first output
    return true_cats[torch.argmax(output[0]).item()]

In [19]:
# Try the classifier on a hand-written description
get_prediction("motorway toll bridge")

'TRAVEL'

In [20]:
# Try the classifier on a hand-written description
get_prediction("a very large coffee order")

'ACCOMMODATION_AND_MEALS'

In [21]:
# Try the classifier on a hand-written description
get_prediction("money to cover dental operation")

'INSURANCE'

## Model improvements

The first step I would take to improve the model is to utilise the other features available for transactions (for example the transaction amount and type). After that, tuning the model hyperparameters (epochs, learning rate, scheduler, ...) should help extract slightly better performance.