In [2]:
import transformers as T
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from torchmetrics import SpearmanCorrCoef, Accuracy, F1Score

In [3]:
token_replacement = [
    ["：" , ":"],
    ["，" , ","],
    ["“" , "\""],
    ["”" , "\""],
    ["？" , "?"],
    ["……" , "..."],
    ["！" , "!"]
]

In [4]:
class SemevalDataset(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation", "test"]
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, cache_dir="./cache/", trust_remote_code=True
        ).to_list()

    def __getitem__(self, index):
        d = self.data[index]
        # Token replacement
        for k in ["premise", "hypothesis"]:
            for tok in token_replacement:
                d[k] = d[k].replace(tok[0], tok[1])
        return d

    def __len__(self):
        return len(self.data)

data_sample = SemevalDataset(split="train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

Downloading builder script:   0%|          | 0.00/5.20k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.56k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/87.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/93.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4927 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}


In [5]:
# Define the hyperparameters
lr = 3e-5
epochs = 10
train_batch_size = 64
validation_batch_size = 64
test_batch_size = 64

In [6]:
tokenizer = T.BertTokenizer.from_pretrained("google-bert/bert-base-uncased", cache_dir="./cache/")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "mps" if torch.backends.mps.is_available() else "cpu"

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    # TODO1-1: Implement the collate_fn function
    # Write your code here
    # The input parameter is a data batch (tuple), and this function packs it into tensors.
    # Use tokenizer to pack tokens and pack the data and its corresponding labels.
    # Return the data batch and labels for each sub-task.
    complete_text = [
        f"{example['premise']}[SEP]{example['hypothesis']}"
        for example in batch
    ]
    complete_text = tokenizer.batch_encode_plus(
        complete_text,
        padding=True,
        truncation=True,
        return_tensors="pt",
        add_special_tokens=False,
    )
    complete_text['labels_reg'] = torch.tensor([b['relatedness_score'] for b in batch])
    complete_text['labels_cls'] = torch.tensor([b['entailment_judgment'] for b in batch])
    
    # Move the data to the device
    complete_text = {k: complete_text[k].to(device) for k in complete_text}
    
    return complete_text

# TODO1-2: Define your DataLoader
ds_train = SemevalDataset("train")
ds_validation = SemevalDataset("validation")

dl_train = DataLoader(ds_train, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn)
dl_validation = DataLoader(ds_validation, batch_size=validation_batch_size, shuffle=False, collate_fn=collate_fn)

ds_test = SemevalDataset("test")
dl_test = DataLoader(ds_test, batch_size=test_batch_size, shuffle=False, collate_fn=collate_fn)

In [40]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Write your code here
        # Define what modules you will use in the model
        self.bert = T.BertModel.from_pretrained("google-bert/bert-base-uncased", cache_dir="./cache/")
        self.regression_head = torch.nn.Sequential(
            torch.nn.Linear(768, 384),
            torch.nn.ReLU(),
            torch.nn.Linear(384, 192),
            torch.nn.ReLU(),
            torch.nn.Linear(192, 1)
        )
        self.classification_head = torch.nn.Sequential(
            torch.nn.Linear(768, 384),
            torch.nn.ReLU(),
            torch.nn.Linear(384, 192),
            torch.nn.ReLU(),
            torch.nn.Linear(192, 3)
        )
        
    def forward(self, **inputs):
        # Write your code here
        # Forward pass
        # BERT
        bert_output = self.bert(inputs['input_ids'], inputs['attention_mask'], inputs['token_type_ids'])
        # [CLS] token hidden state
        cls_token_output = bert_output.last_hidden_state[:, 0, :]
        # Task-specific heads
        reg_output = self.regression_head(cls_token_output)
        cls_output = self.classification_head(cls_token_output)

        return reg_output.squeeze(-1), cls_output

In [41]:
# Use both GPUs
model = MultiLabelModel()

for param in model.bert.parameters():
    param.requires_grad = False
for param in model.bert.encoder.layer[-3].parameters():
    param.requires_grad = True

In [42]:
# TODO3: Define your optimizer and loss function

# TODO3-1: Define your Optimizer
# optimizer = torch.optim.AdamW(model.parameters(), lr=lr)# Write your code here
optimizer = torch.optim.AdamW([
    {'params': model.bert.parameters(), 'lr': 3e-5},
    {'params': model.regression_head.parameters(), 'lr': 3e-3},
    {'params': model.classification_head.parameters(), 'lr': 3e-3}
])

# TODO3-2: Define your loss functions (you should have two)
# Write your code here
criterion_reg = torch.nn.MSELoss()  # Regression loss
criterion_cls = torch.nn.CrossEntropyLoss()  # Classification loss

# scoring functions
spc = SpearmanCorrCoef().to(device)
acc = Accuracy(task="multiclass", num_classes=3).to(device)
f1 = F1Score(task="multiclass", num_classes=3, average='macro').to(device)

In [43]:
# This is the sample code from Pytorch
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = torch.nn.DataParallel(model)
    
model = model.to(device)

Let's use 2 GPUs!


In [11]:
%mkdir ./saved_models

In [44]:
for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    # TODO4: Write the training loop
    # Write your code here
    for inputs in pbar:
        # train your model
        # clear gradient
        optimizer.zero_grad()
        # forward pass
        outputs_reg, outputs_cls = model(**inputs)
        # compute loss
        loss_reg = criterion_reg(outputs_reg, inputs['labels_reg'])
        loss_cls = criterion_cls(outputs_cls, inputs['labels_cls'])
        # back-propagation
        loss = loss_reg + loss_cls
        loss.backward()
        # model optimization
        optimizer.step()
        # update progress bar
        pbar.set_postfix(loss=loss.item())

    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()
    # TODO5: Write the evaluation loop
    # Write your code here
    for inputs in pbar:
        # Evaluate your model
        outputs_reg, outputs_cls = None, None
        with torch.no_grad():
            outputs_reg, outputs_cls = model(**inputs)
        # Output all the evaluation scores (SpearmanCorrCoef, Accuracy, F1Score)
        pred_reg = outputs_reg
        pred_cls = outputs_cls.argmax(dim=1)
        spc.update(pred_reg, inputs['labels_reg'])
        acc.update(pred_cls, inputs['labels_cls'])
        f1.update(pred_cls, inputs['labels_cls'])

    # Print the evaluation scores
    print(f'Spearman CorrCoef: {spc.compute()}')
    print(f'Accuracy: {acc.compute()}')
    print(f'F1 Score: {f1.compute()}')
    # Reset the evaluation metrics
    spc.reset()
    acc.reset()
    f1.reset()
    # Save the model
    torch.save(model, f'./saved_models/ep{ep}.ckpt')

Training epoch [1/10]: 100%|██████████| 71/71 [00:15<00:00,  4.64it/s, loss=1.19]
Validation epoch [1/10]: 100%|██████████| 8/8 [00:01<00:00,  7.26it/s]


Spearman CorrCoef: 0.6994675993919373
Accuracy: 0.777999997138977
F1 Score: 0.7840260863304138


Training epoch [2/10]: 100%|██████████| 71/71 [00:15<00:00,  4.49it/s, loss=0.919]
Validation epoch [2/10]: 100%|██████████| 8/8 [00:01<00:00,  7.21it/s]


Spearman CorrCoef: 0.7476140260696411
Accuracy: 0.734000027179718
F1 Score: 0.7434486150741577


Training epoch [3/10]: 100%|██████████| 71/71 [00:15<00:00,  4.61it/s, loss=0.995]
Validation epoch [3/10]: 100%|██████████| 8/8 [00:01<00:00,  7.56it/s]


Spearman CorrCoef: 0.7704224586486816
Accuracy: 0.7799999713897705
F1 Score: 0.7648062109947205


Training epoch [4/10]: 100%|██████████| 71/71 [00:15<00:00,  4.66it/s, loss=0.946]
Validation epoch [4/10]: 100%|██████████| 8/8 [00:01<00:00,  7.47it/s]


Spearman CorrCoef: 0.7772604823112488
Accuracy: 0.7839999794960022
F1 Score: 0.790701150894165


Training epoch [5/10]: 100%|██████████| 71/71 [00:15<00:00,  4.73it/s, loss=0.887]
Validation epoch [5/10]: 100%|██████████| 8/8 [00:01<00:00,  7.59it/s]


Spearman CorrCoef: 0.7814351320266724
Accuracy: 0.800000011920929
F1 Score: 0.8063952326774597


Training epoch [6/10]: 100%|██████████| 71/71 [00:15<00:00,  4.62it/s, loss=1.04] 
Validation epoch [6/10]: 100%|██████████| 8/8 [00:01<00:00,  7.44it/s]


Spearman CorrCoef: 0.7861291766166687
Accuracy: 0.8240000009536743
F1 Score: 0.822240948677063


Training epoch [7/10]: 100%|██████████| 71/71 [00:15<00:00,  4.63it/s, loss=0.65] 
Validation epoch [7/10]: 100%|██████████| 8/8 [00:01<00:00,  7.50it/s]


Spearman CorrCoef: 0.7843605875968933
Accuracy: 0.7940000295639038
F1 Score: 0.8052045702934265


Training epoch [8/10]: 100%|██████████| 71/71 [00:15<00:00,  4.61it/s, loss=0.652]
Validation epoch [8/10]: 100%|██████████| 8/8 [00:01<00:00,  7.52it/s]


Spearman CorrCoef: 0.7901216149330139
Accuracy: 0.8220000267028809
F1 Score: 0.8243188261985779


Training epoch [9/10]: 100%|██████████| 71/71 [00:15<00:00,  4.69it/s, loss=0.658]
Validation epoch [9/10]: 100%|██████████| 8/8 [00:01<00:00,  7.53it/s]


Spearman CorrCoef: 0.7945638298988342
Accuracy: 0.8259999752044678
F1 Score: 0.8260958194732666


Training epoch [10/10]: 100%|██████████| 71/71 [00:15<00:00,  4.65it/s, loss=0.713]
Validation epoch [10/10]: 100%|██████████| 8/8 [00:01<00:00,  7.50it/s]


Spearman CorrCoef: 0.7993440628051758
Accuracy: 0.828000009059906
F1 Score: 0.8264760375022888


In [45]:
# Test the model on the test set
pbar = tqdm(dl_test)
for inputs in pbar:
    # Evaluate your model
    outputs_reg, outputs_cls = None, None
    with torch.no_grad():
        outputs_reg, outputs_cls = model(**inputs)
    # Output all the evaluation scores (SpearmanCorrCoef, Accuracy, F1Score)
    pred_reg = outputs_reg
    pred_cls = outputs_cls.argmax(dim=1)
    spc.update(pred_reg, inputs['labels_reg'])
    acc.update(pred_cls, inputs['labels_cls'])
    f1.update(pred_cls, inputs['labels_cls'])

# Print the evaluation scores
print(f'Spearman CorrCoef: {spc.compute()}')
print(f'Accuracy: {acc.compute()}')
print(f'F1 Score: {f1.compute()}')

# Reset the evaluation metrics
spc.reset()
acc.reset()
f1.reset()

100%|██████████| 77/77 [00:09<00:00,  8.19it/s]

Spearman CorrCoef: 0.7700918912887573
Accuracy: 0.8116500973701477
F1 Score: 0.8086235523223877



