<a href="https://colab.research.google.com/github/Ha1ion/2025_NLP_HW3/blob/main/nlp_hw3_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

此作業有使用Gemini幫忙下註解幫助批改

In [1]:
!pip install "datasets==2.18.0"

Collecting datasets==2.18.0
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.18.0)
  Downloading pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec<=2024.2.0,>=2023.1.0 (from fsspec[http]<=2024.2.0,>=2023.1.0->datasets==2.18.0)
  Downloading fsspec-2024.2.0-py3-none-any.whl.metadata (6.8 kB)
Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.2.0-py3-none-any.whl (170 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.9/170.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow_hotfix-0.7-py3-none-any.whl (7.9 kB)
Installing collected packages: pyarrow-hotfix, fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.0
    Uninstalling fsspec-2025.3.0:
      Successfully uninstalled 

In [2]:
!pip install transformers datasets evaluate
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, GPT2Tokenizer, GPT2Model
from datasets import load_dataset
from evaluate import load
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
#  You can install and import any other libraries if needed

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
# Some Chinese punctuations will be tokenized as [UNK], so we replace them with English ones.
# Each entry is a [source, replacement] pair; the pairs are applied in this order.
token_replacement = [
    [source, replacement]
    for source, replacement in (
        ("：", ":"),
        ("，", ","),
        ("“", '"'),
        ("”", '"'),
        ("？", "?"),
        ("……", "..."),
        ("！", "!"),
    )
]

In [None]:
# Original (RoBERTa) version, kept for reference:
# tokenizer = RobertaTokenizer.from_pretrained("roberta-base", cache_dir="./cache/")

# Replaced with the GPT-2 tokenizer:
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
    cache_dir="./cache/",
)
# GPT-2 has no padding token by default, so the EOS (end-of-sentence) token
# is reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
class SemevalDataset(Dataset):
    """One split of the `sem_eval_2014_task_1` dataset as a map-style dataset.

    The split is loaded once into `self.data` (a list of dict records).
    Indexing returns a record whose "premise" and "hypothesis" strings have
    had the substitutions listed in `token_replacement` applied.
    """

    def __init__(self, split="train") -> None:
        """Load the requested split into memory.

        Args:
            split: One of "train", "validation" or "test".

        Raises:
            AssertionError: If `split` is not one of the three names above.
        """
        super().__init__()
        assert split in ["train", "validation", "test"]
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, trust_remote_code=True, cache_dir="./cache/"
        ).to_list()

    def __getitem__(self, index):
        """Return the record at `index` with normalized punctuation.

        A shallow copy is returned so that reading an item never rewrites the
        record stored in `self.data`; the raw text stays available there.
        """
        record = dict(self.data[index])
        # Replace Chinese punctuations with English ones
        for key in ("premise", "hypothesis"):
            text = record[key]
            for source, replacement in token_replacement:
                text = text.replace(source, replacement)
            record[key] = text
        return record

    def __len__(self):
        """Return the number of records in the loaded split."""
        return len(self.data)

# Quick sanity check: print the first three raw training records.
# They are sliced straight from `.data`, so __getitem__ is not involved.
data_sample = SemevalDataset("train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

In [6]:
# Define the hyperparameters
# You can modify these values if needed
lr = 3e-5    # learning rate handed to the optimizer
epochs = 3   # number of passes over the training split
# Training and validation use the same batch size; the test DataLoader
# also reads validation_batch_size.
train_batch_size = validation_batch_size = 8

In [7]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    """Pack a list of dataset records into one tokenized, padded batch.

    Args:
        batch: List of dict records. Each record must provide the keys
            "premise", "hypothesis", "relatedness_score" and
            "entailment_judgment".

    Returns:
        The tokenizer output for the (premise, hypothesis) pairs, requested as
        PyTorch tensors with padding and truncation enabled, plus two extra
        entries:
            "labels_sim": float tensor of relatedness scores (sub-task 1).
            "labels_ent": long tensor of entailment labels (sub-task 2).
    """
    # TODO1-1: Implement the collate_fn function

    # 1. Split the records into parallel per-field lists in a single pass.
    premises, hypotheses, sim_scores, ent_labels = [], [], [], []
    for record in batch:
        premises.append(record['premise'])
        hypotheses.append(record['hypothesis'])
        sim_scores.append(record['relatedness_score'])
        # Note the dataset's spelling of this key: 'entailment_judgment'.
        ent_labels.append(record['entailment_judgment'])

    # 2. Tokenize the sentence pairs together.
    # NOTE(review): whether the GPT-2 tokenizer inserts any separator between
    # the two sentences of a pair is not visible here -- confirm.
    encoded = tokenizer(
        premises,
        hypotheses,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    # 3. Attach the labels for both sub-tasks.
    encoded['labels_sim'] = torch.tensor(sim_scores, dtype=torch.float)
    encoded['labels_ent'] = torch.tensor(ent_labels, dtype=torch.long)

    return encoded

# TODO1-2: Define your DataLoader
# (dl_train, dl_validation and dl_test keep their original configuration.)

# 1. One SemevalDataset per split, loaded in train -> validation -> test order.
train_dataset, validation_dataset, test_dataset = (
    SemevalDataset(split=split_name)
    for split_name in ("train", "validation", "test")
)

# 2. Wrap each dataset in a DataLoader that batches through collate_fn.
# Only the training loader shuffles; the test loader reuses validation_batch_size.
dl_train = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn)
dl_validation = DataLoader(validation_dataset, batch_size=validation_batch_size, shuffle=False, collate_fn=collate_fn)
dl_test = DataLoader(test_dataset, batch_size=validation_batch_size, shuffle=False, collate_fn=collate_fn)

In [8]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    """GPT-2 backbone with a regression head and a 3-way classification head.

    Both heads read the hidden state of the last non-padding token of each
    sequence (GPT2Model exposes no pooled output, so `last_hidden_state` is
    pooled manually).
    """

    def __init__(self, *args, **kwargs):
        """Load the pretrained "gpt2" backbone and create the two linear heads."""
        super().__init__(*args, **kwargs)

        # 1. Load the GPT-2 model. The attribute keeps the name `bert` so that
        #    state_dict keys stay compatible with already-saved checkpoints.
        self.bert = GPT2Model.from_pretrained(
            "gpt2",
            cache_dir="./cache/"
        )

        # Tell the GPT-2 config which token id the tokenizer uses for padding.
        self.bert.config.pad_token_id = tokenizer.pad_token_id

        # Head input width is read from the backbone config.
        hidden_size = self.bert.config.hidden_size

        # Output heads: one scalar for relatedness, three logits for entailment.
        self.regression_head = torch.nn.Linear(hidden_size, 1)
        self.classification_head = torch.nn.Linear(hidden_size, 3)

    def forward(self, **kwargs):
        """Run the backbone and both heads.

        Args:
            **kwargs: Keyword inputs forwarded to the GPT-2 backbone
                (e.g. `input_ids`, `attention_mask`). `labels_sim` and
                `labels_ent` are accepted and discarded so that a whole batch
                dict can be passed with `model(**batch)`.

        Returns:
            Tuple `(logits_sim, logits_ent)` with shapes `[batch_size, 1]` and
            `[batch_size, 3]`.
        """
        # Labels are not inputs of the backbone; drop them before the call.
        kwargs.pop("labels_sim", None)
        kwargs.pop("labels_ent", None)

        attention_mask = kwargs.get("attention_mask")

        # 1. Run GPT-2 on the remaining inputs.
        # 2. last_hidden_state has shape [batch_size, sequence_length, hidden_size].
        last_hidden_state = self.bert(**kwargs).last_hidden_state
        batch_size, seq_len = last_hidden_state.shape[:2]
        hidden_device = last_hidden_state.device

        # 3. Locate the last real (non-padding) token of every sequence.
        if attention_mask is None:
            # Without a mask every position counts as real: use the final one.
            last_token_idx = torch.full(
                (batch_size,), seq_len - 1, dtype=torch.long, device=hidden_device
            )
        else:
            # Highest position whose mask entry is 1. For right-padded batches
            # this equals `attention_mask.sum(dim=1) - 1`; unlike that formula
            # it also gives the correct index for left-padded batches.
            positions = torch.arange(seq_len, device=hidden_device)
            last_token_idx = (attention_mask.to(hidden_device).long() * positions).argmax(dim=1)

        batch_indices = torch.arange(batch_size, device=hidden_device)

        # Advanced indexing picks last_hidden_state[i, last_token_idx[i], :]
        # for every row i, giving a [batch_size, hidden_size] tensor.
        pooled_output = last_hidden_state[batch_indices, last_token_idx, :]

        # 4. Feed the last-token representation to both heads.
        logits_sim = self.regression_head(pooled_output)
        logits_ent = self.classification_head(pooled_output)

        return logits_sim, logits_ent

In [None]:
# TODO3: Define your optimizer and loss function

# Build a fresh model and place it on the selected device.
model = MultiLabelModel()
model = model.to(device)

# TODO3-1: Define your Optimizer
# AdamW over every model parameter, using the learning rate set above.
optimizer = AdamW(params=model.parameters(), lr=lr)

# TODO3-2: Define your loss functions (you should have two)
# Each sub-task gets the loss that matches its output type.

# Sub-task 1 (relatedness_score) is a regression target -> mean squared error.
loss_sim_fn = torch.nn.MSELoss()

# Sub-task 2 (entailment_judgment) is a 3-class classification target -> cross entropy.
loss_ent_fn = torch.nn.CrossEntropyLoss()


# scoring functions
psr = load("pearsonr")
acc = load("accuracy")

In [10]:
import os

# Train for `epochs` epochs; after each epoch evaluate on the validation split
# and checkpoint the model whenever (Pearson + accuracy) improves.

# Start below every finite score so the first finite validation result is
# always checkpointed (with 0.0, a first score <= 0 would save nothing).
best_score = float("-inf")
for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    # TODO4: Write the training loop

    # Running sum of per-batch losses, used for the epoch average.
    total_train_loss = 0.0

    for batch in pbar:
        # 1. Move every tensor of the batch to the target device.
        batch = {k: v.to(device) for k, v in batch.items()}

        # 2. Grab the labels for both sub-tasks.
        labels_sim = batch['labels_sim']
        labels_ent = batch['labels_ent']

        # 3. Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # 4. Forward pass. The model discards the label entries itself, so the
        #    whole batch dict can be passed through.
        logits_sim, logits_ent = model(**batch)

        # 5. Compute the losses.
        # squeeze(-1) turns [batch_size, 1] into [batch_size]. A bare
        # squeeze() would also drop the batch dimension for a size-1 batch.
        loss_sim = loss_sim_fn(logits_sim.squeeze(-1), labels_sim)
        loss_ent = loss_ent_fn(logits_ent, labels_ent)

        # Joint objective: unweighted sum of both task losses.
        total_loss = loss_sim + loss_ent

        # 6. Back-propagation.
        total_loss.backward()

        # 7. Parameter update.
        optimizer.step()

        # Read the scalar once; reuse it for the running sum and the progress bar.
        batch_loss = total_loss.item()
        total_train_loss += batch_loss
        pbar.set_postfix({"loss": batch_loss})

    print(f"Epoch {ep+1} Average Train Loss: {total_train_loss / len(dl_train)}")

    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()

    # TODO5: Write the evaluation loop
    # Output all the evaluation scores (PearsonCorr, Accuracy)

    # Predictions and labels accumulated over the whole validation split.
    all_preds_sim = []
    all_labels_sim = []
    all_preds_ent = []
    all_labels_ent = []

    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in pbar:
            # 1. Move every tensor of the batch to the target device.
            batch = {k: v.to(device) for k, v in batch.items()}

            # 2. Grab the labels for both sub-tasks.
            labels_sim = batch['labels_sim']
            labels_ent = batch['labels_ent']

            # 3. Forward pass.
            logits_sim, logits_ent = model(**batch)

            # 4. Turn the raw outputs into predictions.
            # squeeze(-1) keeps the batch dimension even for a size-1 batch, so
            # .tolist() below always yields a list that extend() can consume.
            preds_sim = logits_sim.squeeze(-1)
            preds_ent = torch.argmax(logits_ent, dim=1)

            # 5. Collect the results on the CPU as plain Python lists.
            all_preds_sim.extend(preds_sim.cpu().tolist())
            all_labels_sim.extend(labels_sim.cpu().tolist())
            all_preds_ent.extend(preds_ent.cpu().tolist())
            all_labels_ent.extend(labels_ent.cpu().tolist())

    # Compute the metrics over the full validation split.

    # Pearson correlation for the relatedness regression.
    pearson_corr = psr.compute(
        predictions=all_preds_sim,
        references=all_labels_sim
    )['pearsonr']

    # Accuracy for the entailment classification.
    accuracy = acc.compute(
        predictions=all_preds_ent,
        references=all_labels_ent
    )['accuracy']

    print(f"Epoch {ep+1} Validation:")
    print(f"Pearson Correlation: {pearson_corr}")
    print(f"Accuracy: {accuracy}")

    # Keep the checkpoint with the highest combined validation score.
    current_score = pearson_corr + accuracy
    if current_score > best_score:
        best_score = current_score
        print(f"New best score: {best_score}. Saving model...")
        # Make sure the saved_models directory exists before writing.
        os.makedirs("./saved_models", exist_ok=True)
        torch.save(model.state_dict(), './saved_models/best_model.ckpt')

Training epoch [1/3]: 100%|██████████| 563/563 [00:55<00:00, 10.06it/s, loss=1.15]


Epoch 1 Average Train Loss: 2.428687002053269


Validation epoch [1/3]: 100%|██████████| 63/63 [00:01<00:00, 51.67it/s]


Epoch 1 Validation:
Pearson Correlation: 0.7389032594092068
Accuracy: 0.692
New best score: 1.4309032594092068. Saving model...


Training epoch [2/3]: 100%|██████████| 563/563 [00:58<00:00,  9.69it/s, loss=0.257]


Epoch 2 Average Train Loss: 1.126928745218111


Validation epoch [2/3]: 100%|██████████| 63/63 [00:01<00:00, 48.35it/s]


Epoch 2 Validation:
Pearson Correlation: 0.8221482314725157
Accuracy: 0.814
New best score: 1.6361482314725158. Saving model...


Training epoch [3/3]: 100%|██████████| 563/563 [00:53<00:00, 10.53it/s, loss=1.68]


Epoch 3 Average Train Loss: 0.8228058981217881


Validation epoch [3/3]: 100%|██████████| 63/63 [00:01<00:00, 52.01it/s]


Epoch 3 Validation:
Pearson Correlation: 0.8439638283140789
Accuracy: 0.858
New best score: 1.7019638283140788. Saving model...


In [11]:
# Load the model
model = MultiLabelModel().to(device)
# Restore the best checkpoint written by the training cell. map_location
# remaps the stored tensors onto the current device, so a checkpoint saved on
# a GPU still loads on a CPU-only runtime.
model.load_state_dict(
    torch.load("./saved_models/best_model.ckpt", map_location=device, weights_only=True)
)

# Test Loop
pbar = tqdm(dl_test, desc="Test")
model.eval()

# TODO6: Write the test loop
# We have loaded the best model with the highest evaluation score for you
# Please implement the test loop to evaluate the model on the test dataset
# We will have 10% of the total score for the test accuracy and pearson correlation

# Predictions and labels accumulated over the whole test split.
all_preds_sim = []
all_labels_sim = []
all_preds_ent = []
all_labels_ent = []

with torch.no_grad():  # no gradients are needed at test time
    for batch in pbar:
        # 1. Move every tensor of the batch to the target device.
        batch = {k: v.to(device) for k, v in batch.items()}

        # 2. Grab the labels for both sub-tasks.
        labels_sim = batch['labels_sim']
        labels_ent = batch['labels_ent']

        # 3. Forward pass.
        logits_sim, logits_ent = model(**batch)

        # 4. Turn the raw outputs into predictions.
        # squeeze(-1) drops only the trailing size-1 dimension. A bare
        # squeeze() would turn a size-1 batch into a 0-d tensor, and extend()
        # below would then fail on the resulting float.
        preds_sim = logits_sim.squeeze(-1)
        preds_ent = torch.argmax(logits_ent, dim=1)

        # 5. Collect the results on the CPU as plain Python lists.
        all_preds_sim.extend(preds_sim.cpu().tolist())
        all_labels_sim.extend(labels_sim.cpu().tolist())
        all_preds_ent.extend(preds_ent.cpu().tolist())
        all_labels_ent.extend(labels_ent.cpu().tolist())

# Compute and print the final test metrics once the loop has finished.

# PearsonCorr
test_pearson_corr = psr.compute(
    predictions=all_preds_sim,
    references=all_labels_sim
)['pearsonr']

# Accuracy
test_accuracy = acc.compute(
    predictions=all_preds_ent,
    references=all_labels_ent
)['accuracy']

print("\n--- Test Set Results ---")
print(f"Final Pearson Correlation: {test_pearson_corr}")
print(f"Final Accuracy: {test_accuracy}")

Test: 100%|██████████| 616/616 [00:11<00:00, 53.45it/s]



--- Test Set Results ---
Final Pearson Correlation: 0.8319777644833595
Final Accuracy: 0.8483864420539883
