Cài đặt thư viện

In [1]:
!pip install datasets evaluate sacrebleu rouge_score
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import random
import json
import os
import re
import time
import evaluate
from collections import Counter
from typing import Tuple



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Cấu hình

In [2]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

TRAIN_PATH = 'small-train.json'
DEV_PATH   = 'small-dev.json'
TEST_PATH  = 'small-test.json'

HIDDEN_SIZE = 256
N_LAYERS    = 3
DROPOUT     = 0.5
BATCH_SIZE  = 32
LR          = 1e-3
N_EPOCHS    = 30

Xử lý data

In [3]:
class Vocab:
    def __init__(self, frequency_threshold=1):
        self.itos = {0: "<pad>", 1: "<bos>", 2: "<eos>", 3: "<unk>"}
        self.stoi = {"<pad>": 0, "<bos>": 1, "<eos>": 2, "<unk>": 3}
        self.freq_threshold = frequency_threshold
        self.pad_idx = 0
        self.bos_idx = 1
        self.eos_idx = 2
        self.unk_idx = 3

    def __len__(self): return len(self.itos)

    @staticmethod
    def tokenize(text):
        return re.findall(r"\w+|[^\w\s]", text.lower(), re.UNICODE)

    def build_vocab(self, sentence_list):
        frequencies = Counter()
        idx = 4
        for sentence in sentence_list:
            for word in self.tokenize(sentence):
                frequencies[word] += 1
        for word, count in frequencies.items():
            if count >= self.freq_threshold:
                self.stoi[word] = idx
                self.itos[idx] = word
                idx += 1
        return len(self.itos)

    def encode(self, text):
        return [self.stoi.get(token, self.unk_idx) for token in self.tokenize(text)]

class VocabConfig:
    def __init__(self, src_vocab, tgt_vocab):
        self.total_src_tokens = len(src_vocab)
        self.total_tgt_tokens = len(tgt_vocab)
        self.pad_idx = src_vocab.pad_idx
        self.bos_idx = src_vocab.bos_idx
        self.eos_idx = src_vocab.eos_idx
        self.unk_idx = src_vocab.unk_idx

class PhoMTDataset(Dataset):
    def __init__(self, json_file, limit=None):
        self.data = []
        with open(json_file, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        if limit: self.data = self.data[:limit]
        self.vocab_src = None
        self.vocab_tgt = None

    def build_vocabs(self):
        self.vocab_src = Vocab(1)
        self.vocab_tgt = Vocab(1)
        src_texts = [item['english'] for item in self.data]
        tgt_texts = [item['vietnamese'] for item in self.data]
        self.vocab_src.build_vocab(src_texts)
        self.vocab_tgt.build_vocab(tgt_texts)

    def __len__(self): return len(self.data)

    def __getitem__(self, index):
        item = self.data[index]
        src_encoded = self.vocab_src.encode(item['english'])[::-1]
        tgt_encoded = self.vocab_tgt.encode(item['vietnamese'])
        src_indices = [self.vocab_src.bos_idx] + src_encoded + [self.vocab_src.eos_idx]
        tgt_indices = [self.vocab_tgt.bos_idx] + tgt_encoded + [self.vocab_tgt.eos_idx]

        return torch.tensor(src_indices), torch.tensor(tgt_indices)

class MyCollate:
    def __init__(self, pad_idx): self.pad_idx = pad_idx
    def __call__(self, batch):
        src = [item[0] for item in batch]
        trg = [item[1] for item in batch]
        src = pad_sequence(src, batch_first=True, padding_value=self.pad_idx)
        trg = pad_sequence(trg, batch_first=True, padding_value=self.pad_idx)
        return src, trg

Tải data

In [4]:
train_dataset = PhoMTDataset(TRAIN_PATH, limit=20000)
dev_dataset = PhoMTDataset(DEV_PATH, limit=2000)
test_dataset = PhoMTDataset(TEST_PATH, limit=2000)

train_dataset.build_vocabs()

dev_dataset.vocab_src = train_dataset.vocab_src
dev_dataset.vocab_tgt = train_dataset.vocab_tgt
test_dataset.vocab_src = train_dataset.vocab_src
test_dataset.vocab_tgt = train_dataset.vocab_tgt

train_dataset.data.sort(key=lambda x: len(x['english'].split()))
dev_dataset.data.sort(key=lambda x: len(x['english'].split()))
test_dataset.data.sort(key=lambda x: len(x['english'].split()))

print(f"Train: {len(train_dataset)} | Dev: {len(dev_dataset)} | Test: {len(test_dataset)}")

pad_idx = train_dataset.vocab_src.pad_idx
collate_fn = MyCollate(pad_idx)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

vocab_config = VocabConfig(train_dataset.vocab_src, train_dataset.vocab_tgt)

Train: 20000 | Dev: 2000 | Test: 2000


Mô hình

In [5]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc_hidden = nn.Linear(hid_dim * 2, hid_dim)
        self.fc_cell = nn.Linear(hid_dim * 2, hid_dim)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))

        outputs, (hidden, cell) = self.rnn(embedded)

        n_layers_2, batch, hid = hidden.shape
        n_layers = n_layers_2 // 2

        hidden = hidden.view(n_layers, 2, batch, hid)
        cell = cell.view(n_layers, 2, batch, hid)

        hidden = torch.cat((hidden[:, 0, :, :], hidden[:, 1, :, :]), dim=2)
        cell = torch.cat((cell[:, 0, :, :], cell[:, 1, :, :]), dim=2)

        hidden = torch.tanh(self.fc_hidden(hidden))
        cell = torch.tanh(self.fc_cell(cell))

        return outputs, hidden, cell

class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear((hid_dim * 2) + hid_dim, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.shape[1]

        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))

        attention = self.v(energy).squeeze(2)

        return torch.softmax(attention, dim=1)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM((hid_dim * 2) + emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear((hid_dim * 2) + hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))

        a = self.attention(hidden[-1], encoder_outputs)
        a = a.unsqueeze(1)

        weighted = torch.bmm(a, encoder_outputs)

        rnn_input = torch.cat((embedded, weighted), dim=2)

        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))

        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=2))
        prediction = prediction.squeeze(1)

        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        encoder_outputs, hidden, cell = self.encoder(src)

        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t] = output

            top1 = output.argmax(1)
            teacher_force = random.random() < teacher_forcing_ratio
            input = trg[:, t] if teacher_force else top1

        return outputs

    def predict(self, src, max_len=50, bos_idx=1, eos_idx=2):
        self.eval()
        with torch.no_grad():
            batch_size = src.shape[0]
            encoder_outputs, hidden, cell = self.encoder(src)
            input = torch.tensor([bos_idx] * batch_size).to(self.device)
            outputs = []

            for _ in range(max_len):
                output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
                pred_token = output.argmax(1)
                outputs.append(pred_token.unsqueeze(1))
                input = pred_token

            return torch.cat(outputs, dim=1)

Tạo Model

In [6]:
attn = Attention(HIDDEN_SIZE)

enc = Encoder(vocab_config.total_src_tokens, HIDDEN_SIZE, HIDDEN_SIZE, N_LAYERS, DROPOUT)

dec = Decoder(vocab_config.total_tgt_tokens, HIDDEN_SIZE, HIDDEN_SIZE, N_LAYERS, DROPOUT, attn)

model = Seq2Seq(enc, dec, DEVICE).to(DEVICE)

def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name and param.dim() > 1:
            nn.init.xavier_uniform_(param.data)
        elif 'bias' in name:
            nn.init.constant_(param.data, 0)

model.apply(init_weights)
optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=vocab_config.pad_idx)

Traning

In [7]:
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for src, tgt in loader:
        src, tgt = src.to(DEVICE), tgt.to(DEVICE)

        optimizer.zero_grad()
        output = model(src, tgt)

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)

        loss = criterion(output, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

print(f"Huấn luyện {N_EPOCHS} epochs")

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    end_time = time.time()
    print(f"Epoch {epoch+1:02} | Time: {end_time - start_time:.0f}s | Train Loss: {train_loss:.4f}")

Huấn luyện 30 epochs
Epoch 01 | Time: 93s | Train Loss: 5.8139
Epoch 02 | Time: 92s | Train Loss: 5.2471
Epoch 03 | Time: 92s | Train Loss: 4.8531
Epoch 04 | Time: 93s | Train Loss: 4.5345
Epoch 05 | Time: 91s | Train Loss: 4.3053
Epoch 06 | Time: 90s | Train Loss: 4.1069
Epoch 07 | Time: 90s | Train Loss: 3.9250
Epoch 08 | Time: 90s | Train Loss: 3.7923
Epoch 09 | Time: 88s | Train Loss: 3.6353
Epoch 10 | Time: 89s | Train Loss: 3.5035
Epoch 11 | Time: 89s | Train Loss: 3.4203
Epoch 12 | Time: 89s | Train Loss: 3.3174
Epoch 13 | Time: 89s | Train Loss: 3.2373
Epoch 14 | Time: 89s | Train Loss: 3.1606
Epoch 15 | Time: 90s | Train Loss: 3.0917
Epoch 16 | Time: 89s | Train Loss: 3.0190
Epoch 17 | Time: 90s | Train Loss: 2.9661
Epoch 18 | Time: 90s | Train Loss: 2.9153
Epoch 19 | Time: 90s | Train Loss: 2.8628
Epoch 20 | Time: 89s | Train Loss: 2.8104
Epoch 21 | Time: 89s | Train Loss: 2.7807
Epoch 22 | Time: 89s | Train Loss: 2.7437
Epoch 23 | Time: 91s | Train Loss: 2.6924
Epoch 24 | Ti

Đánh giá

In [9]:
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
rouge = evaluate.load("rouge")

def calculate_rouge(model, loader, dataset):
    model.eval()
    preds = []
    refs = []

    print("Tính toán ROUGE-L trên tập Test")

    with torch.no_grad():
        for src, tgt in tqdm(loader):
            src = src.to(DEVICE)

            batch_preds = model.predict(src)

            bs = src.shape[0]

            if batch_preds.numel() == 0:
                continue

            for i in range(bs):
                if i >= len(batch_preds): break

                pred_tokens = []
                for idx in batch_preds[i]:
                    if idx == dataset.vocab_tgt.eos_idx: break
                    pred_tokens.append(dataset.vocab_tgt.itos[idx.item()])

                tgt_tokens = []
                for idx in tgt[i]:
                    if idx == dataset.vocab_tgt.eos_idx: break
                    if idx not in [dataset.vocab_tgt.bos_idx, dataset.vocab_tgt.pad_idx]:
                        tgt_tokens.append(dataset.vocab_tgt.itos[idx.item()])

                preds.append(" ".join(pred_tokens))
                refs.append(" ".join(tgt_tokens))

    results = rouge.compute(predictions=preds, references=refs)
    return results

scores = calculate_rouge(model, test_loader, test_dataset)
print(f"KẾT QUẢ BÀI 2 (ROUGE-L): {scores['rougeL']:.4f}")

Tính toán ROUGE-L trên tập Test


100%|██████████| 63/63 [00:06<00:00,  9.38it/s]


KẾT QUẢ BÀI 2 (ROUGE-L): 0.3799
