In [1]:
import transformers as T
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from torchmetrics import SpearmanCorrCoef, Accuracy, F1Score

In [2]:
token_replacement = [
    ["：" , ":"],
    ["，" , ","],
    ["“" , "\""],
    ["”" , "\""],
    ["？" , "?"],
    ["……" , "..."],
    ["！" , "!"]
]

In [3]:
class SemevalDataset(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation", "test"]
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, cache_dir="./cache/", trust_remote_code=True
        ).to_list()

    def __getitem__(self, index):
        d = self.data[index]
        # Token replacement
        for k in ["premise", "hypothesis"]:
            for tok in token_replacement:
                d[k] = d[k].replace(tok[0], tok[1])
        return d

    def __len__(self):
        return len(self.data)

data_sample = SemevalDataset(split="train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

Downloading builder script:   0%|          | 0.00/5.20k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.56k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/87.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/93.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4927 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}


In [4]:
# Define the hyperparameters
lr = 3e-5
epochs = 30
train_batch_size = 64
validation_batch_size = 64
test_batch_size = 64

In [5]:
tokenizer = T.BertTokenizer.from_pretrained("google-bert/bert-base-uncased", cache_dir="./cache/")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "mps" if torch.backends.mps.is_available() else "cpu"

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    # TODO1-1: Implement the collate_fn function
    # Write your code here
    # The input parameter is a data batch (tuple), and this function packs it into tensors.
    # Use tokenizer to pack tokens and pack the data and its corresponding labels.
    # Return the data batch and labels for each sub-task.
    complete_text = [
        f"{example['premise']}[SEP]{example['hypothesis']}"
        for example in batch
    ]
    complete_text = tokenizer.batch_encode_plus(
        complete_text,
        padding=True,
        truncation=True,
        return_tensors="pt",
        add_special_tokens=False,
    )
    complete_text['labels_reg'] = torch.tensor([b['relatedness_score'] for b in batch])
    complete_text['labels_cls'] = torch.tensor([b['entailment_judgment'] for b in batch])
    
    # Move the data to the device
    complete_text = {k: complete_text[k].to(device) for k in complete_text}
    
    return complete_text

# TODO1-2: Define your DataLoader
ds_train = SemevalDataset("train")
ds_validation = SemevalDataset("validation")

dl_train = DataLoader(ds_train, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn)
dl_validation = DataLoader(ds_validation, batch_size=validation_batch_size, shuffle=False, collate_fn=collate_fn)

ds_test = SemevalDataset("test")
dl_test = DataLoader(ds_test, batch_size=test_batch_size, shuffle=False, collate_fn=collate_fn)

In [7]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Write your code here
        # Define what modules you will use in the model
        self.bert = T.BertModel.from_pretrained("google-bert/bert-base-uncased", cache_dir="./cache/")
        self.regression_head = torch.nn.Sequential(
            torch.nn.Linear(768, 384),
            torch.nn.ReLU(),
            torch.nn.Linear(384, 192),
            torch.nn.ReLU(),
            torch.nn.Linear(192, 1)
        )
        self.classification_head = torch.nn.Sequential(
            torch.nn.Linear(768, 384),
            torch.nn.ReLU(),
            torch.nn.Linear(384, 192),
            torch.nn.ReLU(),
            torch.nn.Linear(192, 3)
        )
        
    def forward(self, **inputs):
        # Write your code here
        # Forward pass
        # BERT
        bert_output = self.bert(inputs['input_ids'], inputs['attention_mask'], inputs['token_type_ids'])
        # [CLS]'s final hidden state through BertPooler
        pooler_output = bert_output.pooler_output 
        # Task-specific heads
        reg_output = self.regression_head(pooler_output)
        cls_output = self.classification_head(pooler_output)

        return reg_output.squeeze(-1), cls_output

In [8]:
# Use both GPUs
model = MultiLabelModel()
# This is the sample code from Pytorch
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = torch.nn.DataParallel(model)
    
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Let's use 2 GPUs!


In [9]:
# TODO3: Define your optimizer and loss function

# TODO3-1: Define your Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)# Write your code here

# TODO3-2: Define your loss functions (you should have two)
# Write your code here
criterion_reg = torch.nn.MSELoss()  # Regression loss
criterion_cls = torch.nn.CrossEntropyLoss()  # Classification loss

# scoring functions
spc = SpearmanCorrCoef().to(device)
acc = Accuracy(task="multiclass", num_classes=3).to(device)
f1 = F1Score(task="multiclass", num_classes=3, average='macro').to(device)



In [10]:
%mkdir ./saved_models

In [11]:
for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    # TODO4: Write the training loop
    # Write your code here
    for inputs in pbar:
        # train your model
        # clear gradient
        optimizer.zero_grad()
        # forward pass
        outputs_reg, outputs_cls = model(**inputs)
        # compute loss
        loss_reg = criterion_reg(outputs_reg, inputs['labels_reg'])
        loss_cls = criterion_cls(outputs_cls, inputs['labels_cls'])
        # back-propagation
        loss = loss_reg + loss_cls
        loss.backward()
        # model optimization
        optimizer.step()
        # update progress bar
        pbar.set_postfix(loss=loss.item())

    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()
    # TODO5: Write the evaluation loop
    # Write your code here
    for inputs in pbar:
        # Evaluate your model
        outputs_reg, outputs_cls = None, None
        with torch.no_grad():
            outputs_reg, outputs_cls = model(**inputs)
        # Output all the evaluation scores (SpearmanCorrCoef, Accuracy, F1Score)
        pred_reg = outputs_reg
        pred_cls = outputs_cls.argmax(dim=1)
        spc.update(pred_reg, inputs['labels_reg'])
        acc.update(pred_cls, inputs['labels_cls'])
        f1.update(pred_cls, inputs['labels_cls'])

    # Print the evaluation scores
    print(f'Spearman CorrCoef: {spc.compute()}')
    print(f'Accuracy: {acc.compute()}')
    print(f'F1 Score: {f1.compute()}')
    # Reset the evaluation metrics
    spc.reset()
    acc.reset()
    f1.reset()
    # Save the model
    torch.save(model, f'./saved_models/ep{ep}.ckpt')

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Training epoch [1/30]: 100%|██████████| 71/71 [00:25<00:00,  2.80it/s, loss=2.03]
Validation epoch [1/30]: 100%|██████████| 8/8 [00:01<00:00,  7.27it/s]


Spearman CorrCoef: 0.5174302458763123
Accuracy: 0.5640000104904175
F1 Score: 0.24040921032428741


Training epoch [2/30]: 100%|██████████| 71/71 [00:23<00:00,  2.98it/s, loss=1.2] 
Validation epoch [2/30]: 100%|██████████| 8/8 [00:00<00:00,  8.00it/s]


Spearman CorrCoef: 0.5158592462539673
Accuracy: 0.671999990940094
F1 Score: 0.46521198749542236


Training epoch [3/30]: 100%|██████████| 71/71 [00:24<00:00,  2.90it/s, loss=0.743]
Validation epoch [3/30]: 100%|██████████| 8/8 [00:01<00:00,  7.86it/s]


Spearman CorrCoef: 0.7900315523147583
Accuracy: 0.7239999771118164
F1 Score: 0.5047472715377808


Training epoch [4/30]: 100%|██████████| 71/71 [00:24<00:00,  2.90it/s, loss=0.638]
Validation epoch [4/30]: 100%|██████████| 8/8 [00:01<00:00,  7.90it/s]


Spearman CorrCoef: 0.8073567152023315
Accuracy: 0.8539999723434448
F1 Score: 0.8534607887268066


Training epoch [5/30]: 100%|██████████| 71/71 [00:24<00:00,  2.87it/s, loss=0.356]
Validation epoch [5/30]: 100%|██████████| 8/8 [00:01<00:00,  7.91it/s]


Spearman CorrCoef: 0.8047581911087036
Accuracy: 0.8500000238418579
F1 Score: 0.8487604856491089


Training epoch [6/30]: 100%|██████████| 71/71 [00:25<00:00,  2.84it/s, loss=0.399]
Validation epoch [6/30]: 100%|██████████| 8/8 [00:01<00:00,  7.77it/s]


Spearman CorrCoef: 0.8064537048339844
Accuracy: 0.8399999737739563
F1 Score: 0.8386610150337219


Training epoch [7/30]: 100%|██████████| 71/71 [00:24<00:00,  2.84it/s, loss=0.22] 
Validation epoch [7/30]: 100%|██████████| 8/8 [00:01<00:00,  7.74it/s]


Spearman CorrCoef: 0.8082238435745239
Accuracy: 0.8519999980926514
F1 Score: 0.845921516418457


Training epoch [8/30]: 100%|██████████| 71/71 [00:25<00:00,  2.81it/s, loss=0.158]
Validation epoch [8/30]: 100%|██████████| 8/8 [00:01<00:00,  7.80it/s]


Spearman CorrCoef: 0.7706339359283447
Accuracy: 0.8339999914169312
F1 Score: 0.8308650255203247


Training epoch [9/30]: 100%|██████████| 71/71 [00:25<00:00,  2.79it/s, loss=0.463]
Validation epoch [9/30]: 100%|██████████| 8/8 [00:01<00:00,  7.70it/s]


Spearman CorrCoef: 0.8055596947669983
Accuracy: 0.8500000238418579
F1 Score: 0.8457797765731812


Training epoch [10/30]: 100%|██████████| 71/71 [00:25<00:00,  2.78it/s, loss=0.443] 
Validation epoch [10/30]: 100%|██████████| 8/8 [00:01<00:00,  7.68it/s]


Spearman CorrCoef: 0.7985201478004456
Accuracy: 0.828000009059906
F1 Score: 0.8273725509643555


Training epoch [11/30]: 100%|██████████| 71/71 [00:25<00:00,  2.77it/s, loss=0.163]
Validation epoch [11/30]: 100%|██████████| 8/8 [00:01<00:00,  7.64it/s]


Spearman CorrCoef: 0.8080481886863708
Accuracy: 0.8500000238418579
F1 Score: 0.8407392501831055


Training epoch [12/30]: 100%|██████████| 71/71 [00:25<00:00,  2.79it/s, loss=0.0719]
Validation epoch [12/30]: 100%|██████████| 8/8 [00:01<00:00,  7.65it/s]


Spearman CorrCoef: 0.8052209615707397
Accuracy: 0.8479999899864197
F1 Score: 0.845882773399353


Training epoch [13/30]: 100%|██████████| 71/71 [00:25<00:00,  2.75it/s, loss=0.0972]
Validation epoch [13/30]: 100%|██████████| 8/8 [00:01<00:00,  7.68it/s]


Spearman CorrCoef: 0.8106846809387207
Accuracy: 0.8560000061988831
F1 Score: 0.8526843786239624


Training epoch [14/30]: 100%|██████████| 71/71 [00:25<00:00,  2.74it/s, loss=0.0748]
Validation epoch [14/30]: 100%|██████████| 8/8 [00:01<00:00,  7.57it/s]


Spearman CorrCoef: 0.8090195059776306
Accuracy: 0.8479999899864197
F1 Score: 0.8472745418548584


Training epoch [15/30]: 100%|██████████| 71/71 [00:25<00:00,  2.76it/s, loss=0.0998]
Validation epoch [15/30]: 100%|██████████| 8/8 [00:01<00:00,  7.55it/s]


Spearman CorrCoef: 0.8183128833770752
Accuracy: 0.8600000143051147
F1 Score: 0.8575890064239502


Training epoch [16/30]: 100%|██████████| 71/71 [00:25<00:00,  2.75it/s, loss=0.186] 
Validation epoch [16/30]: 100%|██████████| 8/8 [00:01<00:00,  7.70it/s]


Spearman CorrCoef: 0.8147952556610107
Accuracy: 0.8420000076293945
F1 Score: 0.846603274345398


Training epoch [17/30]: 100%|██████████| 71/71 [00:25<00:00,  2.76it/s, loss=0.086] 
Validation epoch [17/30]: 100%|██████████| 8/8 [00:01<00:00,  7.66it/s]


Spearman CorrCoef: 0.7984200119972229
Accuracy: 0.8500000238418579
F1 Score: 0.8483991622924805


Training epoch [18/30]: 100%|██████████| 71/71 [00:25<00:00,  2.74it/s, loss=0.0487]
Validation epoch [18/30]: 100%|██████████| 8/8 [00:01<00:00,  7.66it/s]


Spearman CorrCoef: 0.8187697529792786
Accuracy: 0.8560000061988831
F1 Score: 0.8555487394332886


Training epoch [19/30]: 100%|██████████| 71/71 [00:25<00:00,  2.75it/s, loss=0.0685]
Validation epoch [19/30]: 100%|██████████| 8/8 [00:01<00:00,  7.68it/s]


Spearman CorrCoef: 0.8223072290420532
Accuracy: 0.8619999885559082
F1 Score: 0.8617044687271118


Training epoch [20/30]: 100%|██████████| 71/71 [00:25<00:00,  2.75it/s, loss=0.0422]
Validation epoch [20/30]: 100%|██████████| 8/8 [00:01<00:00,  7.67it/s]


Spearman CorrCoef: 0.8312538862228394
Accuracy: 0.8600000143051147
F1 Score: 0.8595980405807495


Training epoch [21/30]: 100%|██████████| 71/71 [00:25<00:00,  2.74it/s, loss=0.0489]
Validation epoch [21/30]: 100%|██████████| 8/8 [00:01<00:00,  7.64it/s]


Spearman CorrCoef: 0.8138594031333923
Accuracy: 0.8560000061988831
F1 Score: 0.8548718690872192


Training epoch [22/30]: 100%|██████████| 71/71 [00:25<00:00,  2.75it/s, loss=0.0901]
Validation epoch [22/30]: 100%|██████████| 8/8 [00:01<00:00,  7.62it/s]


Spearman CorrCoef: 0.819745659828186
Accuracy: 0.871999979019165
F1 Score: 0.8692258596420288


Training epoch [23/30]: 100%|██████████| 71/71 [00:25<00:00,  2.75it/s, loss=0.0442]
Validation epoch [23/30]: 100%|██████████| 8/8 [00:01<00:00,  7.57it/s]


Spearman CorrCoef: 0.8139817118644714
Accuracy: 0.8500000238418579
F1 Score: 0.8476698398590088


Training epoch [24/30]: 100%|██████████| 71/71 [00:25<00:00,  2.74it/s, loss=0.0227]
Validation epoch [24/30]: 100%|██████████| 8/8 [00:01<00:00,  7.58it/s]


Spearman CorrCoef: 0.810484766960144
Accuracy: 0.8420000076293945
F1 Score: 0.8432274460792542


Training epoch [25/30]: 100%|██████████| 71/71 [00:25<00:00,  2.77it/s, loss=0.0961]
Validation epoch [25/30]: 100%|██████████| 8/8 [00:01<00:00,  7.64it/s]


Spearman CorrCoef: 0.8117568492889404
Accuracy: 0.8600000143051147
F1 Score: 0.8596039414405823


Training epoch [26/30]: 100%|██████████| 71/71 [00:25<00:00,  2.75it/s, loss=0.281] 
Validation epoch [26/30]: 100%|██████████| 8/8 [00:01<00:00,  7.61it/s]


Spearman CorrCoef: 0.8095592260360718
Accuracy: 0.8500000238418579
F1 Score: 0.8447922468185425


Training epoch [27/30]: 100%|██████████| 71/71 [00:25<00:00,  2.76it/s, loss=0.0485]
Validation epoch [27/30]: 100%|██████████| 8/8 [00:01<00:00,  7.54it/s]


Spearman CorrCoef: 0.8176103234291077
Accuracy: 0.8560000061988831
F1 Score: 0.8523151874542236


Training epoch [28/30]: 100%|██████████| 71/71 [00:25<00:00,  2.74it/s, loss=0.052] 
Validation epoch [28/30]: 100%|██████████| 8/8 [00:01<00:00,  7.66it/s]


Spearman CorrCoef: 0.8096445202827454
Accuracy: 0.8700000047683716
F1 Score: 0.8701274394989014


Training epoch [29/30]: 100%|██████████| 71/71 [00:25<00:00,  2.75it/s, loss=0.105] 
Validation epoch [29/30]: 100%|██████████| 8/8 [00:01<00:00,  7.61it/s]


Spearman CorrCoef: 0.808165967464447
Accuracy: 0.8479999899864197
F1 Score: 0.8495544195175171


Training epoch [30/30]: 100%|██████████| 71/71 [00:25<00:00,  2.77it/s, loss=0.0541]
Validation epoch [30/30]: 100%|██████████| 8/8 [00:01<00:00,  7.54it/s]


Spearman CorrCoef: 0.7929942011833191
Accuracy: 0.8659999966621399
F1 Score: 0.8641027212142944


In [17]:
# Test the model on the test set
pbar = tqdm(dl_test)
for inputs in pbar:
    # Evaluate your model
    outputs_reg, outputs_cls = None, None
    with torch.no_grad():
        outputs_reg, outputs_cls = model(**inputs)
    # Output all the evaluation scores (SpearmanCorrCoef, Accuracy, F1Score)
    pred_reg = outputs_reg
    pred_cls = outputs_cls.argmax(dim=1)
    spc.update(pred_reg, inputs['labels_reg'])
    acc.update(pred_cls, inputs['labels_cls'])
    f1.update(pred_cls, inputs['labels_cls'])

# Print the evaluation scores
print(f'Spearman CorrCoef: {spc.compute()}')
print(f'Accuracy: {acc.compute()}')
print(f'F1 Score: {f1.compute()}')

# Reset the evaluation metrics
spc.reset()
acc.reset()
f1.reset()

100%|██████████| 77/77 [00:09<00:00,  8.54it/s]

Spearman CorrCoef: 0.8046234846115112
Accuracy: 0.864623486995697
F1 Score: 0.8583522439002991



