# COMP34812 Coursework

## Setup Code

In [37]:
!pip install torchmetrics

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import torchmetrics
import pandas as pd
import csv

from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel

## File Path Constants

In [39]:
# Constants
MODEL_PATH = "bert-base-uncased"
TRAINING_DATASET_LOCATION = "/content/drive/MyDrive/NLU/train.csv"
DEV_DATASET_LOCATION = "/content/drive/MyDrive//NLU/dev.csv"
EVAL_DATASET_LOCATION = "/content/drive/MyDrive/NLU/AV_trial.csv"
TEST_DATASET_LOCATION = "/content/drive/MyDrive/NLU/test.csv"
PREDICTION_DATASET_LOCATION = "/content/drive/MyDrive/NLU/predictions.csv"
SAVED_MODEL_PATH = "/content/drive/MyDrive/NLU/modelBaseTrain"

## Bert Loading

In [40]:
# Load Tokeniser and Model from MODEL_PATH(Bert Base Uncased)
bertTokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
bertModel = AutoModel.from_pretrained(MODEL_PATH)

## Class Definitions

In [41]:
# Siamese Dataset Class for Model Training
class SiameseDataset(Dataset):
  def __init__(self, csvFile:str) -> None:
    # df = pd.read_csv(csvFile)
    # self.texts1 = df["text_1"].tolist()
    # self.texts2 = df["text_2"].tolist()
    # self.labels = df["label"].tolist()

    self.texts1 = []
    self.texts2 = []
    self.labels = []
    with open(csvFile, newline='') as f:
      reader = reader = csv.DictReader(f)
      for row in reader:
        self.texts1.append(row["text_1"])
        self.texts2.append(row["text_2"])
        self.labels.append(row["label"])
    self.labels = np.asfarray(self.labels)

  def __len__(self) -> int:
    return len(self.labels)

  def __getitem__(self, index:int) -> tuple[str,str,torch.Tensor]:
    text1 = self.texts1[index]
    text2 = self.texts2[index]
    label = self.labels[index]
    return text1, text2, torch.tensor(label)

In [42]:
# Siamese Dataset Class for Inference
class SiameseInferenceDataset(Dataset):
  def __init__(self, csvFile:str) -> None:
    # df = pd.read_csv(csvFile)
    # self.texts1 = df["text_1"].tolist()
    # self.texts2 = df["text_2"].tolist()

    self.texts1 = []
    self.texts2 = []
    with open(csvFile, newline='', encoding='utf-8-sig') as f:
      reader = reader = csv.DictReader(f)
      for row in reader:
        self.texts1.append(row['text_1'])
        self.texts2.append(row['text_2'])

  def __len__(self) -> int:
    return len(self.texts1)

  def __getitem__(self, index:int) -> tuple[str,str]:
    return self.texts1[index], self.texts2[index]

In [43]:
# Siamese Model, uses one base model to generate outputs for two inputs
class SiameseModel(nn.Module):
  def __init__(self, baseModel):
    super(SiameseModel, self).__init__()
    self.baseModel = baseModel

  def forward(self, inputs1, inputs2):
    output1 = self.baseModel(**inputs1).last_hidden_state[:, 0, :].squeeze()
    output2 = self.baseModel(**inputs2).last_hidden_state[:, 0, :].squeeze()
    return output1, output2

In [44]:
# Contrastive Loss function for training
class ContrastiveLoss(nn.Module):
  def __init__(self, margin=1.0):
    super(ContrastiveLoss, self).__init__()
    # Margin value for interclass distance and intraclass spacing
    self.margin = margin

  def forward(self, outputs1, outputs2, labels):
    distance = nn.functional.pairwise_distance(outputs1, outputs2, keepdim=True)
    # Loss for similar labels
    simLoss = (1 - labels) * torch.pow(distance, 2)
    # Loss for dissimilar labels
    diff = torch.clamp(self.margin - distance, min=0.0)
    disLoss = (labels) * torch.pow(diff, 2)
    # Total Loss
    loss = torch.mean(simLoss + disLoss)
    return loss

## Model and Dataset Loading

In [45]:
# Load train dataset
dataset = SiameseDataset(TRAINING_DATASET_LOCATION)
dataLoader = DataLoader(dataset, batch_size=16, shuffle=True)

In [46]:
# Load model from bert base model
siameseModel = SiameseModel(bertModel)

optimiser = optim.Adam(siameseModel.parameters(), lr=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimiser, step_size=3, gamma=0.1)

lossFunction = ContrastiveLoss(margin = 1)

## Training Loop

In [47]:
def trainModel(epochs, savePath):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Device: {device}")
  siameseModel.to(device)
  for epoch in range(epochs):
    batchLoss = 0
    for text1, text2, labels in dataLoader:

      # Tokenise inputs
      inputs1 = bertTokenizer(text1, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
      inputs2 = bertTokenizer(text2, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

      # Map to device
      inputs1 = inputs1.to(device)
      inputs2 = inputs2.to(device)
      labels = labels.to(device)


      optimiser.zero_grad()
      # Generate outputs
      output1, output2 = siameseModel(inputs1, inputs2)
      # Calculate contrastive loss
      loss = lossFunction(output1, output2, labels)
      batchLoss += loss.item()
      loss.backward()
      optimiser.step()

    scheduler.step()
    print(f"---------------------EPOCH {epoch+1} / {epochs}---------------------")
    print(f"Batch Loss {batchLoss}")

  torch.save(siameseModel.state_dict(), savePath)

In [48]:
# 12473s on training set
trainModel(10,"/content/drive/MyDrive/NLU/modelBaseDev3")

Device: cuda


KeyboardInterrupt: 

## Saved Model Loading

In [49]:
# Load model from saved file
siameseModel = SiameseModel(bertModel)
siameseModel.load_state_dict(torch.load(SAVED_MODEL_PATH))

<All keys matched successfully>

## Model Evaluation

In [51]:
siameseModel.eval()

# Create inference dataset, no labels
inferenceDataset = SiameseInferenceDataset(EVAL_DATASET_LOCATION)
inferenceDataLoader = DataLoader(inferenceDataset, batch_size=16, shuffle=True)
predictions = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")
siameseModel.to(device)

with torch.no_grad():
  for text1, text2 in inferenceDataLoader:

    inputs1 = bertTokenizer(text1, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs2 = bertTokenizer(text2, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    inputs1 = inputs1.to(device)
    inputs2 = inputs2.to(device)

    outputs1, outputs2 = siameseModel(inputs1, inputs2)
    distances = torch.nn.functional.pairwise_distance(outputs1, outputs2)
    predictions.extend(distances.cpu().numpy())

Device: cuda


In [52]:
labels = []
with open(PREDICTION_DATASET_LOCATION, newline='') as csvfile:
  reader = csv.DictReader(csvfile)
  for row in reader:
    labels.append(row["prediction"])
labels = torch.tensor(np.asfarray(labels))

accuracy = torchmetrics.Accuracy(task="binary")
f1_score = torchmetrics.F1Score(task="binary")
precision = torchmetrics.Precision(task="binary")
recall = torchmetrics.Recall(task="binary")
mcc = torchmetrics.MatthewsCorrCoef(task="binary")
cohens_kappa = torchmetrics.CohenKappa(task="binary")

Testing = False
for i in range(1,9):
  print('threshold:',i/10)
  threshold = i/10
  predicted_labels = torch.tensor([1 if distance < threshold else 0 for distance in predictions])
  print(f"Accuracy: {accuracy(predicted_labels, labels):.2f}")
  if(Testing):
    print(f"Macro Precision: {precision(predicted_labels, labels):.2f}")
    print(f"Macro Recall: {recall(predicted_labels, labels):.2f}")
    print(f"Macro F1-Score: {f1_score(predicted_labels, labels):.2f}")

threshold: 0.1
Accuracy: 0.54
threshold: 0.2
Accuracy: 0.54
threshold: 0.3
Accuracy: 0.54
threshold: 0.4
Accuracy: 0.54
threshold: 0.5
Accuracy: 0.42
threshold: 0.6
Accuracy: 0.46
threshold: 0.7
Accuracy: 0.46
threshold: 0.8
Accuracy: 0.46


In [35]:
# Calculate scores on threshold of 0.5
predictedLabels = torch.tensor([1 if distance < 0.5 else 0 for distance in predictions])
print(labels[:50], predictedLabels[:50])
print(f"Accuracy: {accuracy(predictedLabels, labels):.2f}")
print(f"Macro Precision: {precision(predictedLabels, labels):.2f}")
print(f"Macro Recall: {recall(predictedLabels, labels):.2f}")
print(f"Macro F1-Score: {f1_score(predictedLabels, labels):.2f}")
print(f"MCC: {mcc(predictedLabels, labels):.2f}")
print(f"Cohens Kappa: {cohens_kappa(predictedLabels, labels):.2f}")

tensor([0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0.,
        0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1.,
        0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1.],
       dtype=torch.float64) tensor([0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0])
Accuracy: 0.54
Macro Precision: 0.50
Macro Recall: 0.52
Macro F1-Score: 0.51
MCC: 0.08
Cohens Kappa: 0.08


## Prediction Output

In [36]:
siameseModel.eval()
inferenceDataset = SiameseInferenceDataset(TEST_DATASET_LOCATION)
inferenceDataLoader = DataLoader(inferenceDataset, batch_size=16, shuffle=True)
predictions = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")
siameseModel.to(device)

with torch.no_grad():
  for text1, text2 in inferenceDataLoader:

    inputs1 = bertTokenizer(text1, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs2 = bertTokenizer(text2, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    inputs1 = inputs1.to(device)
    inputs2 = inputs2.to(device)

    outputs1, outputs2 = siameseModel(inputs1, inputs2)
    distances = torch.nn.functional.pairwise_distance(outputs1, outputs2)
    predictions.extend(distances.cpu().numpy())

predictedLabels = torch.tensor([1 if distance < 0.5 else 0 for distance in predictions])

with open("Group_83_C.csv", 'w', newline='') as csvfile:
  writer = csv.DictWriter(csvfile, fieldnames=['prediction'])
  writer.writeheader()
  for p in predictedLabels:
    writer.writerow({'prediction': p.item()})
labels = torch.tensor(np.asfarray(labels))

Device: cuda
