Cài đặt thư viện

In [None]:
!pip install datasets evaluate sacrebleu rouge_score
# Fetch the NLTK resources (WordNet plus the Punkt tokenizer data) into the
# local nltk_data directory. Nothing in the visible cells calls nltk directly;
# presumably these are needed by metric backends - TODO confirm which ones.
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import random
import json
import os
import re
import time
import evaluate
from collections import Counter
import random



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Cấu hình

In [None]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# JSON files holding the parallel sentence pairs for each split.
TRAIN_PATH = 'small-train.json'
DEV_PATH   = 'small-dev.json'
TEST_PATH  = 'small-test.json'

# Model and optimisation hyper-parameters.
HIDDEN_SIZE = 256    # embedding size and LSTM hidden size
N_LAYERS    = 3      # stacked LSTM layers (used for both encoder and decoder)
DROPOUT     = 0.5    # inter-layer LSTM dropout
BATCH_SIZE  = 64
LR          = 1e-3   # Adam learning rate
N_EPOCHS    = 30

# Seed every random number generator the notebook imports so that weight
# initialisation and dropout masks are repeatable across kernel restarts.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

Xử lý data

In [None]:
class Vocab:
    """Word-level vocabulary mapping tokens to integer ids and back.

    Ids 0-3 are reserved for ``<pad>``, ``<bos>``, ``<eos>`` and ``<unk>``.
    Every other token receives the next free id when ``build_vocab`` sees it
    at least ``frequency_threshold`` times.
    """

    def __init__(self, frequency_threshold=1):
        self.itos = {0: "<pad>", 1: "<bos>", 2: "<eos>", 3: "<unk>"}
        self.stoi = {"<pad>": 0, "<bos>": 1, "<eos>": 2, "<unk>": 3}
        self.freq_threshold = frequency_threshold
        self.pad_idx = 0
        self.bos_idx = 1
        self.eos_idx = 2
        self.unk_idx = 3

    def __len__(self):
        """Return the number of ids currently defined (special tokens included)."""
        return len(self.itos)

    @staticmethod
    def tokenize(text):
        """Lower-case ``text`` and split it into word runs and single punctuation marks."""
        return re.findall(r"\w+|[^\w\s]", text.lower(), re.UNICODE)

    def build_vocab(self, sentence_list):
        """Add every sufficiently frequent token of ``sentence_list`` to the vocabulary.

        Args:
            sentence_list: iterable of raw sentences (strings).

        Returns:
            The vocabulary size after the update.
        """
        frequencies = Counter()
        for sentence in sentence_list:
            frequencies.update(self.tokenize(sentence))

        # Continue after the highest id already handed out instead of always
        # restarting at 4, so a second call cannot overwrite existing entries
        # and leave ``stoi`` and ``itos`` disagreeing with each other.
        idx = len(self.itos)
        for word, count in frequencies.items():
            if count < self.freq_threshold or word in self.stoi:
                continue
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1
        return len(self.itos)

    def encode(self, text):
        """Tokenize ``text`` and return its ids; unseen tokens map to ``unk_idx``."""
        return [self.stoi.get(token, self.unk_idx) for token in self.tokenize(text)]

class PhoMTDataset(Dataset):
    """English-Vietnamese sentence pairs loaded from a JSON list of records.

    Each record is read through its ``'english'`` and ``'vietnamese'`` keys.
    Vocabularies are not created on construction: call ``build_vocabs`` or
    assign ``vocab_src`` / ``vocab_tgt`` before indexing.
    """

    def __init__(self, json_file, limit=None):
        """Load ``json_file`` and optionally keep only the first ``limit`` records.

        Args:
            json_file: path to a UTF-8 JSON file.
            limit: maximum number of records to keep; ``None`` keeps them all.
        """
        self.data = []
        if os.path.exists(json_file):
            with open(json_file, 'r', encoding='utf-8') as f:
                self.data = json.load(f)
        else:
            # Keep the original best-effort behaviour (empty dataset) but make
            # a mistyped path visible instead of failing much later.
            import warnings
            warnings.warn(
                f"Data file not found: {json_file!r}; the dataset is empty.",
                RuntimeWarning,
                stacklevel=2,
            )
        # ``is not None`` so that an explicit limit of 0 really means "no records".
        if limit is not None:
            self.data = self.data[:limit]

        self.vocab_src = None
        self.vocab_tgt = None

    def build_vocabs(self):
        """Create source and target vocabularies from this dataset's own sentences."""
        self.vocab_src = Vocab(frequency_threshold=1)
        self.vocab_tgt = Vocab(frequency_threshold=1)

        src_texts = [item['english'] for item in self.data]
        tgt_texts = [item['vietnamese'] for item in self.data]

        self.vocab_src.build_vocab(src_texts)
        self.vocab_tgt.build_vocab(tgt_texts)

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(src_ids, tgt_ids)`` as 1-D tensors wrapped in ``<bos>`` / ``<eos>``.

        The source token ids are reversed before the markers are added; the
        target keeps its natural order.

        Raises:
            RuntimeError: if the vocabularies have not been attached yet.
        """
        if self.vocab_src is None or self.vocab_tgt is None:
            raise RuntimeError(
                "Vocabularies are not set: call build_vocabs() or assign "
                "vocab_src / vocab_tgt before indexing the dataset."
            )

        item = self.data[index]
        src_encoded = self.vocab_src.encode(item['english'])[::-1]
        tgt_encoded = self.vocab_tgt.encode(item['vietnamese'])

        src_indices = [self.vocab_src.bos_idx] + src_encoded + [self.vocab_src.eos_idx]
        tgt_indices = [self.vocab_tgt.bos_idx] + tgt_encoded + [self.vocab_tgt.eos_idx]

        return torch.tensor(src_indices), torch.tensor(tgt_indices)

class MyCollate:
    """Collate function that right-pads source and target sequences of a batch."""

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def _pad(self, sequences):
        """Stack 1-D tensors into a (batch, max_len) tensor filled with ``pad_idx``."""
        return pad_sequence(sequences, batch_first=True, padding_value=self.pad_idx)

    def __call__(self, batch):
        """Turn a list of ``(src, trg)`` tensor pairs into two padded batch tensors."""
        sources, targets = [], []
        for pair in batch:
            sources.append(pair[0])
            targets.append(pair[1])
        return self._pad(sources), self._pad(targets)

Tải data

In [None]:
# Load the three splits, each capped at a fixed number of sentence pairs.
train_dataset = PhoMTDataset(TRAIN_PATH, limit=20000)
dev_dataset = PhoMTDataset(DEV_PATH, limit=2000)
test_dataset = PhoMTDataset(TEST_PATH, limit=2000)

# Vocabularies come from the training split only and are shared with the
# held-out splits, so dev/test words unseen in training encode as <unk>.
train_dataset.build_vocabs()
dev_dataset.vocab_src = test_dataset.vocab_src = train_dataset.vocab_src
dev_dataset.vocab_tgt = test_dataset.vocab_tgt = train_dataset.vocab_tgt

# Order every split by whitespace word count of the English side so that each
# batch groups sentences of similar length and needs little padding.
for _split in (train_dataset, dev_dataset, test_dataset):
    _split.data.sort(key=lambda pair: len(pair['english'].split()))

print(f"Train: {len(train_dataset)} | Dev: {len(dev_dataset)} | Test: {len(test_dataset)}")

pad_idx = train_dataset.vocab_src.pad_idx
collate_fn = MyCollate(pad_idx)

# shuffle=False keeps the length ordering; note that this also means training
# batches are visited in the same short-to-long order in every epoch.
train_loader, dev_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
    for split in (train_dataset, dev_dataset, test_dataset)
)

Train: 20000 | Dev: 2000 | Test: 2000


Mô hình

In [None]:
class Seq2seq(nn.Module):
    """LSTM encoder-decoder that returns the teacher-forced cross-entropy loss.

    The encoder's final ``(hidden, cell)`` state is handed directly to the
    decoder as its initial state, so ``n_encoder`` and ``n_decoder`` must be
    equal for the shapes to match.

    Args:
        d_model: embedding size and LSTM hidden size.
        n_encoder: number of stacked encoder LSTM layers.
        n_decoder: number of stacked decoder LSTM layers.
        dropout: dropout applied between stacked LSTM layers.
        vocab_src: source vocabulary (needs ``__len__`` and ``pad_idx``).
        vocab_tgt: target vocabulary (needs ``__len__`` and ``pad_idx``).
    """

    def __init__(self, d_model: int, n_encoder: int, n_decoder: int, dropout: float, vocab_src, vocab_tgt):
        super().__init__()
        self.vocab_src = vocab_src
        self.vocab_tgt = vocab_tgt

        self.src_embedding = nn.Embedding(len(vocab_src), d_model, padding_idx=vocab_src.pad_idx)
        self.tgt_embedding = nn.Embedding(len(vocab_tgt), d_model, padding_idx=vocab_tgt.pad_idx)
        self.encoder = nn.LSTM(
            input_size=d_model, hidden_size=d_model, num_layers=n_encoder,
            batch_first=True, dropout=dropout, bidirectional=False
        )

        self.decoder = nn.LSTM(
            input_size=d_model, hidden_size=d_model, num_layers=n_decoder,
            batch_first=True, dropout=dropout, bidirectional=False
        )

        self.output_head = nn.Linear(d_model, len(vocab_tgt))
        # Padded target positions do not contribute to the loss.
        self.loss = nn.CrossEntropyLoss(ignore_index=vocab_tgt.pad_idx)

    def forward_step(self, input_ids, hidden_state, cell_state):
        """Advance the decoder over ``input_ids`` starting from the given state.

        Args:
            input_ids: (batch, steps) tensor of target token ids.
            hidden_state: decoder hidden state to start from.
            cell_state: decoder cell state to start from.

        Returns:
            ``(output, hidden_state, cell_state)`` where ``output`` holds the
            top-layer decoder outputs and the states are the updated ones.
        """
        embedded_input = self.tgt_embedding(input_ids)
        output, (hidden_state, cell_state) = self.decoder(embedded_input, (hidden_state, cell_state))
        return output, hidden_state, cell_state

    def forward(self, x, y):
        """Compute the teacher-forced loss for a batch.

        Args:
            x: (batch, src_len) source ids, padded with the source pad id.
            y: (batch, tgt_len) target ids starting with ``<bos>``.

        Returns:
            Scalar cross-entropy between the prediction at every step ``t``
            and the gold token ``y[:, t + 1]``, with padding ignored.
        """
        # NOTE: padded source positions are fed through the encoder as well,
        # so for shorter sequences the final state includes those pad steps.
        embedded_x = self.src_embedding(x)
        _, (hidden, cell) = self.encoder(embedded_x)

        # With teacher forcing every decoder input is known up front
        # (y[:, :-1]), so one LSTM call over the whole prefix is equivalent to
        # stepping token by token while carrying the state, and avoids a
        # Python loop of single-step calls.
        output, _, _ = self.forward_step(y[:, :-1], hidden, cell)
        logits = self.output_head(output)

        return self.loss(logits.reshape(-1, logits.size(-1)), y[:, 1:].reshape(-1))

Tạo Model

In [None]:
# Build the encoder-decoder with the same width and depth on both sides and
# the vocabularies built from the training split, then move it to DEVICE.
model = Seq2seq(
    d_model=HIDDEN_SIZE,
    n_encoder=N_LAYERS,
    n_decoder=N_LAYERS,
    dropout=DROPOUT,
    vocab_src=train_dataset.vocab_src,
    vocab_tgt=train_dataset.vocab_tgt
).to(DEVICE)

def init_weights(m):
    """Initialise the parameters of ``m`` and of all its sub-modules.

    Parameters whose name contains ``'weight'`` and that have more than one
    dimension get Xavier-uniform values; parameters whose name contains
    ``'bias'`` are set to zero. Embedding rows designated as ``padding_idx``
    are zeroed again afterwards.

    Args:
        m: any ``nn.Module``; also usable as the callback of ``Module.apply``.
    """
    for name, param in m.named_parameters():
        if 'weight' in name and param.dim() > 1:
            nn.init.xavier_uniform_(param.data)
        elif 'bias' in name:
            nn.init.constant_(param.data, 0)

    # Xavier init also overwrites the padding row of every embedding table.
    # That row receives no gradient, so without this reset <pad> would stay a
    # fixed random vector instead of the all-zero vector padding_idx implies.
    for module in m.modules():
        if isinstance(module, nn.Embedding) and module.padding_idx is not None:
            with torch.no_grad():
                module.weight[module.padding_idx].zero_()

# Module.apply calls init_weights on every sub-module and then on the model
# itself; since init_weights walks named_parameters() recursively, parameters
# are initialised more than once and the last (root) call determines the values.
model.apply(init_weights)
# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=LR)

Training

In [None]:
def train_epoch(model, loader, optimizer, device=None):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: module whose ``model(src, tgt)`` call returns a scalar loss.
        loader: iterable yielding ``(src, tgt)`` tensor batches.
        optimizer: optimiser bound to ``model``'s parameters.
        device: device the batches are moved to; defaults to the notebook-wide
            ``DEVICE``.

    Returns:
        The average of the per-batch losses as a float.

    Raises:
        ValueError: if ``loader`` yields no batches (for example because the
            underlying dataset is empty).
    """
    if device is None:
        device = DEVICE

    model.train()
    total_loss = 0.0
    n_batches = 0

    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()
        loss = model(src, tgt)
        loss.backward()

        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        total_loss += loss.item()
        n_batches += 1

    # Counting batches avoids a bare ZeroDivisionError on an empty loader and
    # does not require the loader to implement __len__.
    if n_batches == 0:
        raise ValueError("train_epoch received an empty loader: no batches to train on.")

    return total_loss / n_batches

# Train for N_EPOCHS full passes over the training loader, printing the
# wall-clock time and mean training loss of each epoch.
# NOTE(review): dev_loader is not used in this loop, so no validation loss is
# tracked here - confirm whether that is intentional.
print(f"Huấn luyện {N_EPOCHS} epochs...")

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train_epoch(model, train_loader, optimizer)

    end_time = time.time()
    print(f"Epoch {epoch+1:02} | Time: {end_time - start_time:.0f}s | Train Loss: {train_loss:.4f}")

Huấn luyện 30 epochs...
Epoch 01 | Time: 31s | Train Loss: 6.1457
Epoch 02 | Time: 27s | Train Loss: 5.8677
Epoch 03 | Time: 27s | Train Loss: 5.5175
Epoch 04 | Time: 28s | Train Loss: 5.2956
Epoch 05 | Time: 27s | Train Loss: 5.0990
Epoch 06 | Time: 29s | Train Loss: 4.9428
Epoch 07 | Time: 27s | Train Loss: 4.8067
Epoch 08 | Time: 27s | Train Loss: 4.6698
Epoch 09 | Time: 27s | Train Loss: 4.5487
Epoch 10 | Time: 27s | Train Loss: 4.4458
Epoch 11 | Time: 27s | Train Loss: 4.3513
Epoch 12 | Time: 26s | Train Loss: 4.2681
Epoch 13 | Time: 26s | Train Loss: 4.1900
Epoch 14 | Time: 26s | Train Loss: 4.1203
Epoch 15 | Time: 26s | Train Loss: 4.0539
Epoch 16 | Time: 27s | Train Loss: 3.9954
Epoch 17 | Time: 26s | Train Loss: 3.9401
Epoch 18 | Time: 27s | Train Loss: 3.8883
Epoch 19 | Time: 26s | Train Loss: 3.8371
Epoch 20 | Time: 26s | Train Loss: 3.7872
Epoch 21 | Time: 27s | Train Loss: 3.7437
Epoch 22 | Time: 27s | Train Loss: 3.7003
Epoch 23 | Time: 26s | Train Loss: 3.6596
Epoch 24 |

Đánh giá

In [None]:
# ROUGE metric object from the `evaluate` library, loaded once for the notebook.
rouge = evaluate.load("rouge")

def calculate_rouge(model, loader, dataset, max_len=50, tokenizer=str.split):
    """Greedy-decode every batch of ``loader`` and score it against the references.

    Args:
        model: trained ``Seq2seq`` model.
        loader: iterable of ``(src, tgt)`` padded id batches.
        dataset: object exposing ``vocab_tgt`` (ids to tokens, special ids).
        max_len: maximum number of tokens generated per sentence.
        tokenizer: callable turning a string into tokens for ROUGE. The
            default splits on whitespace, because the metric's built-in
            tokenizer keeps only ``[a-z0-9]`` and would break up Vietnamese
            words at every accented letter. Pass ``None`` to use the metric's
            built-in tokenizer.

    Returns:
        The dictionary produced by ``rouge.compute``.
    """
    model.eval()
    vocab = dataset.vocab_tgt
    bos_idx, eos_idx, pad_idx = vocab.bos_idx, vocab.eos_idx, vocab.pad_idx
    preds = []
    refs = []

    print("Tính toán ROUGE-L trên tập Test")

    with torch.no_grad():
        for src, tgt in loader:
            src = src.to(DEVICE)

            embedded_x = model.src_embedding(src)
            _, (hidden, cell) = model.encoder(embedded_x)

            bs = src.shape[0]
            curr_input = torch.full((bs, 1), bos_idx, dtype=torch.long, device=DEVICE)
            finished = torch.zeros(bs, dtype=torch.bool, device=DEVICE)

            steps = []
            for _ in range(max_len):
                output, hidden, cell = model.forward_step(curr_input, hidden, cell)
                pred_idx = model.output_head(output.squeeze(1)).argmax(dim=-1)
                steps.append(pred_idx)

                # Everything after a row's first <eos> is discarded below, so
                # decoding can stop as soon as every row has produced one.
                finished |= pred_idx == eos_idx
                if bool(finished.all()):
                    break
                curr_input = pred_idx.unsqueeze(1)

            # One device-to-host transfer per batch instead of one per token.
            if steps:
                batch_preds = torch.stack(steps, dim=1).tolist()
            else:
                batch_preds = [[] for _ in range(bs)]

            for pred_ids, tgt_ids in zip(batch_preds, tgt.tolist()):
                pred_tokens = []
                for idx in pred_ids:
                    if idx == eos_idx:
                        break
                    pred_tokens.append(vocab.itos[idx])

                # References are rebuilt from the encoded ids, skipping
                # <bos>/<pad> and stopping at <eos>.
                tgt_tokens = []
                for idx in tgt_ids:
                    if idx == eos_idx:
                        break
                    if idx != bos_idx and idx != pad_idx:
                        tgt_tokens.append(vocab.itos[idx])

                preds.append(" ".join(pred_tokens))
                refs.append(" ".join(tgt_tokens))

    extra = {} if tokenizer is None else {"tokenizer": tokenizer}
    results = rouge.compute(predictions=preds, references=refs, **extra)
    return results

# Score the trained model on the test split and report the ROUGE-L value.
scores = calculate_rouge(model, test_loader, test_dataset)
print(f"KẾT QUẢ BÀI 1 (ROUGE-L): {scores['rougeL']:.4f}")

Tính toán ROUGE-L trên tập Test
KẾT QUẢ BÀI 1 (ROUGE-L): 0.3355
