In [1]:
import transformers as T
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from torchmetrics import SpearmanCorrCoef, Accuracy, F1Score

In [2]:
token_replacement = [
    ["：" , ":"],
    ["，" , ","],
    ["“" , "\""],
    ["”" , "\""],
    ["？" , "?"],
    ["……" , "..."],
    ["！" , "!"]
]

In [3]:
class SemevalDataset(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation", "test"]
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, cache_dir="./cache/", trust_remote_code=True
        ).to_list()

    def __getitem__(self, index):
        d = self.data[index]
        # Token replacement
        for k in ["premise", "hypothesis"]:
            for tok in token_replacement:
                d[k] = d[k].replace(tok[0], tok[1])
        return d

    def __len__(self):
        return len(self.data)

data_sample = SemevalDataset(split="train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}


In [4]:
# Define the hyperparameters
lr = 3e-5
epochs = 10
train_batch_size = 64
validation_batch_size = 64
test_batch_size = 64

In [5]:
tokenizer = T.BertTokenizer.from_pretrained("google-bert/bert-large-uncased", cache_dir="./cache/")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "mps" if torch.backends.mps.is_available() else "cpu"

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

In [6]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    # TODO1-1: Implement the collate_fn function
    # Write your code here
    # The input parameter is a data batch (tuple), and this function packs it into tensors.
    # Use tokenizer to pack tokens and pack the data and its corresponding labels.
    # Return the data batch and labels for each sub-task.
    complete_text = [
        f"{example['premise']}[SEP]{example['hypothesis']}"
        for example in batch
    ]
    complete_text = tokenizer.batch_encode_plus(
        complete_text,
        padding=True,
        truncation=True,
        return_tensors="pt",
        add_special_tokens=False,
    )
    complete_text['labels_reg'] = torch.tensor([b['relatedness_score'] for b in batch])
    complete_text['labels_cls'] = torch.tensor([b['entailment_judgment'] for b in batch])
    
    # Move the data to the device
    complete_text = {k: complete_text[k].to(device) for k in complete_text}
    
    return complete_text

# TODO1-2: Define your DataLoader
ds_train = SemevalDataset("train")
ds_validation = SemevalDataset("validation")

dl_train = DataLoader(ds_train, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn)
dl_validation = DataLoader(ds_validation, batch_size=validation_batch_size, shuffle=False, collate_fn=collate_fn)

ds_test = SemevalDataset("test")
dl_test = DataLoader(ds_test, batch_size=test_batch_size, shuffle=False, collate_fn=collate_fn)

In [12]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Write your code here
        # Define what modules you will use in the model
        self.bert = T.BertModel.from_pretrained("google-bert/bert-large-uncased", cache_dir="./cache/")
        self.regression_head = torch.nn.Sequential(
            torch.nn.Linear(1024, 512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 1)
        )
        self.classification_head = torch.nn.Sequential(
            torch.nn.Linear(1024, 512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 3)
        )
        
    def forward(self, **inputs):
        # Write your code here
        # Forward pass
        # BERT
        bert_output = self.bert(inputs['input_ids'], inputs['attention_mask'], inputs['token_type_ids'])
        # [CLS] token hidden state
        cls_token_output = bert_output.last_hidden_state[:, 0, :]
        # Task-specific heads
        reg_output = self.regression_head(cls_token_output)
        cls_output = self.classification_head(cls_token_output)

        return reg_output.squeeze(-1), cls_output

In [13]:
# Use both GPUs
model = MultiLabelModel()

In [14]:
# TODO3: Define your optimizer and loss function

# TODO3-1: Define your Optimizer
# optimizer = torch.optim.AdamW(model.parameters(), lr=lr)# Write your code here
optimizer = torch.optim.AdamW([
    {'params': model.bert.parameters(), 'lr': 3e-5},
    {'params': model.regression_head.parameters(), 'lr': 3e-3},
    {'params': model.classification_head.parameters(), 'lr': 3e-3}
])

# TODO3-2: Define your loss functions (you should have two)
# Write your code here
criterion_reg = torch.nn.MSELoss()  # Regression loss
criterion_cls = torch.nn.CrossEntropyLoss()  # Classification loss

# scoring functions
spc = SpearmanCorrCoef().to(device)
acc = Accuracy(task="multiclass", num_classes=3).to(device)
f1 = F1Score(task="multiclass", num_classes=3, average='macro').to(device)

In [15]:
# This is the sample code from Pytorch
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = torch.nn.DataParallel(model)
    
model = model.to(device)

Let's use 2 GPUs!


In [None]:
%mkdir ./saved_models

In [16]:
for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    # TODO4: Write the training loop
    # Write your code here
    for inputs in pbar:
        # train your model
        # clear gradient
        optimizer.zero_grad()
        # forward pass
        outputs_reg, outputs_cls = model(**inputs)
        # compute loss
        loss_reg = criterion_reg(outputs_reg, inputs['labels_reg'])
        loss_cls = criterion_cls(outputs_cls, inputs['labels_cls'])
        # back-propagation
        loss = loss_reg + loss_cls
        loss.backward()
        # model optimization
        optimizer.step()
        # update progress bar
        pbar.set_postfix(loss=loss.item())

    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()
    # TODO5: Write the evaluation loop
    # Write your code here
    for inputs in pbar:
        # Evaluate your model
        outputs_reg, outputs_cls = None, None
        with torch.no_grad():
            outputs_reg, outputs_cls = model(**inputs)
        # Output all the evaluation scores (SpearmanCorrCoef, Accuracy, F1Score)
        pred_reg = outputs_reg
        pred_cls = outputs_cls.argmax(dim=1)
        spc.update(pred_reg, inputs['labels_reg'])
        acc.update(pred_cls, inputs['labels_cls'])
        f1.update(pred_cls, inputs['labels_cls'])

    # Print the evaluation scores
    print(f'Spearman CorrCoef: {spc.compute()}')
    print(f'Accuracy: {acc.compute()}')
    print(f'F1 Score: {f1.compute()}')
    # Reset the evaluation metrics
    spc.reset()
    acc.reset()
    f1.reset()
    # Save the model
    torch.save(model, f'./saved_models/ep{ep}.ckpt')

Training epoch [1/10]: 100%|██████████| 71/71 [01:18<00:00,  1.11s/it, loss=0.699]
Validation epoch [1/10]: 100%|██████████| 8/8 [00:03<00:00,  2.35it/s]


Spearman CorrCoef: 0.7720714807510376
Accuracy: 0.7940000295639038
F1 Score: 0.8015458583831787


Training epoch [2/10]: 100%|██████████| 71/71 [01:19<00:00,  1.12s/it, loss=0.646]
Validation epoch [2/10]: 100%|██████████| 8/8 [00:03<00:00,  2.39it/s]


Spearman CorrCoef: 0.8195289373397827
Accuracy: 0.8360000252723694
F1 Score: 0.8285194635391235


Training epoch [3/10]: 100%|██████████| 71/71 [01:18<00:00,  1.11s/it, loss=1.03] 
Validation epoch [3/10]: 100%|██████████| 8/8 [00:03<00:00,  2.32it/s]


Spearman CorrCoef: 0.8241651058197021
Accuracy: 0.8579999804496765
F1 Score: 0.850672721862793


Training epoch [4/10]: 100%|██████████| 71/71 [01:18<00:00,  1.11s/it, loss=1]    
Validation epoch [4/10]: 100%|██████████| 8/8 [00:03<00:00,  2.38it/s]


Spearman CorrCoef: 0.8269213438034058
Accuracy: 0.8140000104904175
F1 Score: 0.8212239146232605


Training epoch [5/10]: 100%|██████████| 71/71 [01:19<00:00,  1.11s/it, loss=0.364]
Validation epoch [5/10]: 100%|██████████| 8/8 [00:03<00:00,  2.39it/s]


Spearman CorrCoef: 0.8437114953994751
Accuracy: 0.8579999804496765
F1 Score: 0.8577326536178589


Training epoch [7/10]: 100%|██████████| 71/71 [01:18<00:00,  1.11s/it, loss=0.129]
Validation epoch [7/10]: 100%|██████████| 8/8 [00:03<00:00,  2.39it/s]


Spearman CorrCoef: 0.8425440192222595
Accuracy: 0.8740000128746033
F1 Score: 0.86925208568573


Training epoch [8/10]: 100%|██████████| 71/71 [01:18<00:00,  1.11s/it, loss=0.203]
Validation epoch [8/10]: 100%|██████████| 8/8 [00:03<00:00,  2.32it/s]


Spearman CorrCoef: 0.8413304686546326
Accuracy: 0.8579999804496765
F1 Score: 0.8502113819122314


Training epoch [9/10]: 100%|██████████| 71/71 [01:18<00:00,  1.11s/it, loss=0.16] 
Validation epoch [9/10]: 100%|██████████| 8/8 [00:03<00:00,  2.40it/s]


Spearman CorrCoef: 0.8241739273071289
Accuracy: 0.8640000224113464
F1 Score: 0.8550244569778442


Training epoch [10/10]: 100%|██████████| 71/71 [01:19<00:00,  1.11s/it, loss=0.152]
Validation epoch [10/10]: 100%|██████████| 8/8 [00:03<00:00,  2.39it/s]


Spearman CorrCoef: 0.8274838328361511
Accuracy: 0.8579999804496765
F1 Score: 0.8500612378120422


In [17]:
# Test the model on the test set
pbar = tqdm(dl_test)
for inputs in pbar:
    # Evaluate your model
    outputs_reg, outputs_cls = None, None
    with torch.no_grad():
        outputs_reg, outputs_cls = model(**inputs)
    # Output all the evaluation scores (SpearmanCorrCoef, Accuracy, F1Score)
    pred_reg = outputs_reg
    pred_cls = outputs_cls.argmax(dim=1)
    spc.update(pred_reg, inputs['labels_reg'])
    acc.update(pred_cls, inputs['labels_cls'])
    f1.update(pred_cls, inputs['labels_cls'])

# Print the evaluation scores
print(f'Spearman CorrCoef: {spc.compute()}')
print(f'Accuracy: {acc.compute()}')
print(f'F1 Score: {f1.compute()}')

# Reset the evaluation metrics
spc.reset()
acc.reset()
f1.reset()

100%|██████████| 77/77 [00:29<00:00,  2.59it/s]

Spearman CorrCoef: 0.8421215415000916
Accuracy: 0.8605642318725586
F1 Score: 0.8504670858383179



