# COMP34812 Coursework

## Setup Code

In [1]:
!pip install torchmetrics

from google.colab import drive
drive.mount('/content/drive')

Collecting torchmetrics
  Downloading torchmetrics-1.3.2-py3-none-any.whl (841 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collectin

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import torchmetrics
import pandas as pd
import csv

from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel

## File Path Constants

In [3]:
MODEL_PATH = "bert-base-uncased"
TEST_MODEL_LOCATION = "/content/drive/MyDrive/NLU/modelBaseTrain"
EVAL_DATASET_LOCATION = "/content/drive/MyDrive/NLU/AV_trial.csv"
TEST_DATASET_LOCATION = "/content/drive/MyDrive/NLU/test.csv"
PREDICTION_DATASET_LOCATION = "/content/drive/MyDrive/NLU/predictions.csv"

## Class Definitions

In [4]:
# Siamese Dataset Class for Inference
class SiameseInferenceDataset(Dataset):
  def __init__(self, csvFile:str) -> None:
    # df = pd.read_csv(csvFile)
    # self.texts1 = df["text_1"].tolist()
    # self.texts2 = df["text_2"].tolist()

    self.texts1 = []
    self.texts2 = []
    with open(csvFile, newline='', encoding='utf-8-sig') as csvfile:
      reader = reader = csv.DictReader(csvfile)

      for row in reader:
        self.texts1.append(row['text_1'])
        self.texts2.append(row['text_2'])

  def __len__(self) -> int:
    return len(self.texts1)

  def __getitem__(self, index:int) -> tuple[str,str]:
    return self.texts1[index], self.texts2[index]

In [5]:
# Siamese Model, uses one base model to generate outputs for two inputs
class SiameseModel(nn.Module):
  def __init__(self, baseModel):
    super(SiameseModel, self).__init__()
    self.baseModel = baseModel

  def forward(self, inputs1, inputs2):
    output1 = self.baseModel(**inputs1).last_hidden_state[:, 0, :].squeeze()
    output2 = self.baseModel(**inputs2).last_hidden_state[:, 0, :].squeeze()
    return output1, output2

## Saved Model Loading

In [6]:
# Load Tokeniser and Model from MODEL_PATH(Bert Base Uncased)
bertTokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
bertModel = AutoModel.from_pretrained(MODEL_PATH)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [8]:
# Load siamese model
siameseModel = SiameseModel(bertModel)
siameseModel.load_state_dict(torch.load(TEST_MODEL_LOCATION))
siameseModel.eval()

SiameseModel(
  (baseModel): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

## Model Testing

In [9]:
dataset = SiameseInferenceDataset(EVAL_DATASET_LOCATION)
dataLoader = DataLoader(dataset, batch_size=16, shuffle=True)
predictions = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

siameseModel.to(device)

with torch.no_grad():
  for text1, text2 in dataLoader:

    inputs1 = bertTokenizer(text1, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs2 = bertTokenizer(text2, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    inputs1 = inputs1.to(device)
    inputs2 = inputs2.to(device)

    outputs1, outputs2 = siameseModel(inputs1, inputs2)
    distances = torch.nn.functional.pairwise_distance(outputs1, outputs2)
    predictions.extend(distances.cpu().numpy())

Device: cuda


In [10]:
accuracy = torchmetrics.Accuracy(task="binary")
f1_score = torchmetrics.F1Score(task="binary")

labels = []
with open(PREDICTION_DATASET_LOCATION, newline='') as csvfile:
  reader = csv.DictReader(csvfile)
  for row in reader:
    labels.append(row["prediction"])
labels = torch.tensor(np.asfarray(labels))

predictedLabels = torch.tensor([1 if distance < 0.5 else 0 for distance in predictions])
print(f"Accuracy: {accuracy(predictedLabels, labels):.2f}")
print(f"Macro F1-Score: {f1_score(predictedLabels, labels):.2f}")

Accuracy: 0.46
Macro F1-Score: 0.43


## Prediction Section

In [11]:
siameseModel.eval()
inferenceDataset = SiameseInferenceDataset(TEST_DATASET_LOCATION)
inferenceDataLoader = DataLoader(inferenceDataset, batch_size=16, shuffle=True)
predictions = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")
siameseModel.to(device)

with torch.no_grad():
  for text1, text2 in inferenceDataLoader:

    inputs1 = bertTokenizer(text1, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs2 = bertTokenizer(text2, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    inputs1 = inputs1.to(device)
    inputs2 = inputs2.to(device)

    outputs1, outputs2 = siameseModel(inputs1, inputs2)
    distances = torch.nn.functional.pairwise_distance(outputs1, outputs2)
    predictions.extend(distances.cpu().numpy())

predictedLabels = torch.tensor([1 if distance < 0.5 else 0 for distance in predictions])

with open("Group_83_C.csv", 'w', newline='') as csvfile:
  writer = csv.DictWriter(csvfile, fieldnames=['prediction'])
  writer.writeheader()
  for p in predictedLabels:
    writer.writerow({'prediction': p.item()})
labels = torch.tensor(np.asfarray(labels))

Device: cuda
