In [None]:
# setup the environment 
# conda env create -f environment.yml

# activate the environment
# conda activate mlproject

# If you prefer to set up the environment yourself, create a Python virtual environment and install the following packages:
# !pip install pandas numpy matplotlib torch transformers datasets scikit-learn evaluate

In [None]:
# import library 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
import evaluate

### Data Preparation

In [None]:
# Read the raw training data into a DataFrame
df = pd.read_csv('data/training-data.csv')

# Preview the first rows to understand the data structure,
# leaving one blank line before the shape summary
print(df.head(), end="\n\n")

# Report the (rows, columns) shape of the loaded dataset
print(f"The shape of rows and columns in this dataset is: {df.shape}")

In [None]:
# Inspect the column names exactly as they were read from the CSV
print(df.columns)

# The column names contain blank spaces before and after the text;
# strip them so the columns can be referenced by their clean names below
df.columns = df.columns.str.strip()

# Confirm the names after stripping
print(df.columns) 

In [None]:
# Show the dtype of every column, followed by a blank separator line
print(df.dtypes, end="\n\n")

# Report, per column, whether it contains at least one missing value
print(df.isna().any())

### Feature Engineering

1. We perform feature engineering to transform the raw data into features/patterns that are easier for the model to recognise.
2. We do this through structured concatenation: each field is explicitly labelled before being joined into a single string, which should help the model recognise the patterns in the data more easily.

In [None]:
# Build one labelled string per row from the relevant columns.
# The adjacent f-string pieces are joined into a single string, so every
# field is prefixed with its name: "Bank: ... Currency: ... Date: ... Amount: ... Text: ..."
df['combined_text'] = df.apply(
    lambda row: (
        f"Bank: {row['Bank']} "
        f"Currency: {row['Currency']} "
        f"Date: {row['Date']} "
        f"Amount: {row['Amount']} "
        f"Text: {row['Text']}"
    ),
    axis=1,
)

In [None]:
# Sanity check: one combined string per row of the DataFrame
df['combined_text'].shape

In [None]:
# Check whether the longest combined text (measured in tokens) exceeds the
# maximum sequence length reported by the tokenizer for this model.

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

# Tokenize every combined text WITHOUT truncation and record how many token ids each produces
token_lengths = df['combined_text'].apply(lambda x: len(tokenizer(x, truncation=False)['input_ids']))

# Print the maximum token length
max_token_length = token_lengths.max()
print(f"Maximum token length of combined text: {max_token_length}")

# Print the limit reported by the tokenizer
max_model_length = tokenizer.model_max_length
print(f"Model maximum token length: {max_model_length}")

# Perform the comparison explicitly instead of leaving it to the reader
if max_token_length > max_model_length:
    rows_over_limit = int((token_lengths > max_model_length).sum())
    print(
        f"WARNING: {rows_over_limit} row(s) exceed the model limit; "
        "they will be cut off when tokenized with truncation=True."
    )
else:
    print("All combined texts fit within the model limit.")

In [None]:
# Convert the values of the 'Remark' column to integer class ids.
# The fitted encoder is kept so the ids can be mapped back to the original labels later.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Remark'])

In [None]:
# Build a lookup from each original label to the integer id the encoder assigned to it
encoded_ids = label_encoder.transform(label_encoder.classes_)
label_mapping = {
    original_label: encoded_id
    for original_label, encoded_id in zip(label_encoder.classes_, encoded_ids)
}

print(label_mapping)

In [None]:
# We will use the Hugging Face `datasets` library for training because it integrates better with the Transformers library.
# In the `datasets` library, data is loaded in a dict-like format.
# Do a final check of df after the transformations: it should include two additional columns, 'combined_text' and 'label'.
df

In [None]:
# Keep only the two columns needed for fine-tuning: the model input text and its encoded label
training_columns = ['combined_text', 'label']
df_final = df[training_columns]
df_final

In [None]:
# Export the processed DataFrame as a CSV file (without the index column);
# this file is read back with the `datasets` library in the next step
df_final.to_csv('data/training-data-pd-out.csv', index=False)

In [None]:
# Load the processed CSV file (columns 'combined_text' and 'label') with the `datasets` library.
# `load_dataset` is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here.
dataset = load_dataset("csv", data_files="data/training-data-pd-out.csv")

In [None]:
# Display the loaded dataset object to confirm its splits, columns and row count
dataset

In [None]:
# Record the original number of rows so the split can be verified afterwards
original_size = dataset["train"].num_rows
print(f"Original_size of dataset: {original_size}")

# Split the dataset into a training set and a testing set with a ratio of 80:20
# (shuffled with a fixed seed so the split is reproducible)
dataset_final = dataset["train"].train_test_split(test_size=0.2, shuffle=True, seed=42)

train_size = dataset_final["train"].num_rows
test_size = dataset_final["test"].num_rows
total_size = train_size + test_size

print("Training dataset size:", train_size)
print("Testing dataset size:", test_size)
print("Total size after split:", total_size)

# Enforce the invariant rather than relying on reading the printed numbers:
# the split must neither lose nor duplicate rows
assert total_size == original_size, (
    f"Row count changed during split: {total_size} after vs {original_size} before"
)

In [None]:
# Final check of dataset_final: confirm the columns and row counts of the
# 'train' and 'test' splits are correct before starting fine-tuning
dataset_final

### Fine-tune Training

In [None]:
# Identifier of the pre-trained checkpoint to fine-tune; the tokenizer is loaded
# from the same checkpoint so its vocabulary matches the model loaded later
checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the 'combined_text' field of a dataset example or batch.

    Args:
        example: Mapping that contains a "combined_text" entry (a single
            string, or a list of strings when used with a batched map).

    Returns:
        Whatever the notebook-level `tokenizer` returns for that text,
        called with truncation enabled.
    """
    texts = example["combined_text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


# Apply the tokenizer to every split; batched=True hands tokenize_function batches of examples at a time
tokenized_datasets = dataset_final.map(tokenize_function, batched=True)
# Collator that pads the examples of each batch using the tokenizer when batches are assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Inspect the dataset after tokenization: it should now also contain the
# columns produced by the tokenizer, in a format the model can read
tokenized_datasets

In [None]:
# Drop the raw text column (the model does not expect it) and rename
# "label" to "labels", which is the argument name the model expects
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["combined_text"])
    .rename_column("label", "labels")
)

# Make the datasets return PyTorch tensors instead of Python lists
tokenized_datasets.set_format("torch")

# Show the resulting column names of the train and test splits
for split_name in ("train", "test"):
    print(tokenized_datasets[split_name].column_names)

In [None]:
# The dataset is tokenized; now define the training arguments and load the pre-trained model.

# Training configuration: checkpoints/logs go to "model/test-trainer",
# evaluation runs at the end of every epoch
training_args = TrainingArguments(
    "model/test-trainer",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

# Derive the label space from the fitted encoder instead of hard-coding the class count,
# so the classification head always matches the labels actually present in the data.
class_names = [str(name) for name in label_encoder.classes_]
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}

# Passing id2label/label2id stores the readable class names in the model config,
# so the saved model reports the original labels rather than generic LABEL_<n> names.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(class_names),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# import numpy as np
# import evaluate

# define the compute_metrics function
def compute_metrics(eval_preds):
    """Compute classification accuracy for an evaluation pass.

    Accuracy is calculated directly with numpy so that no metric has to be
    (re)loaded each time the Trainer evaluates.

    Args:
        eval_preds: Pair ``(logits, labels)``; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict: ``{"accuracy": <float>}`` — the fraction of predictions that
        equal their reference label.
    """
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    # Mean of the element-wise matches is the fraction of correct predictions
    accuracy = float(np.mean(preds == np.asarray(labels)))
    return {"accuracy": accuracy}

# Initialize Trainer
# Wires together the model, the training arguments, the tokenized train/test splits,
# the padding collator and the metric function. The "test" split is used as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,  # NOTE(review): newer transformers releases may expect `processing_class` instead — confirm against the installed version
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model: runs fine-tuning with the Trainer configured above
trainer.train()

### Evaluation

In [None]:
# Evaluate the trained model on the evaluation set given to the Trainer.
# This is largely redundant, because evaluation was already enabled during fine-tuning.
evaluation_results = trainer.evaluate()
print(evaluation_results)

In [None]:
# We can also calculate the accuracy manually: first get the raw predictions for the test split
predictions = trainer.predict(tokenized_datasets["test"])
# Print the shapes of the prediction array and of the reference labels
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
# Cross-check the accuracy by recomputing it from the raw predictions.
# numpy (np) and evaluate are already imported in the imports cell at the top
# of the notebook, so they are not re-imported here.

# Predicted class = index of the largest value along the last axis
preds = np.argmax(predictions.predictions, axis=-1)

metric = evaluate.load("accuracy")
metric.compute(predictions=preds, references=predictions.label_ids)

### Save the fine-tuned Model

In [None]:
# Save the trained model
model.save_pretrained("model/bank-classifier-model")

# Save the tokenizer
# Note: it is written to a separate directory from the model, so both paths are needed when loading them back
tokenizer.save_pretrained("model/bank-classifier-tokenizer")