In [1]:
import torch
import torch.nn as nn
import re
import random
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, EncoderDecoderModel
from torch.utils.data import DataLoader, Dataset

# Hugging Face checkpoint identifiers. EN_MODEL is used for the English
# tokenizer and the encoder; VN_MODEL for the Vietnamese tokenizer and the decoder.
EN_MODEL = "bert-base-cased"
VN_MODEL = "vinai/phobert-base-v2"

In [2]:
# Load the IWSLT'15 English-Vietnamese parallel corpus (downloaded on first use,
# then served from the local Hugging Face datasets cache).
dataset = load_dataset('mt_eng_vietnamese', 'iwslt2015-en-vi')

Downloading builder script:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading and preparing dataset mt_eng_vietnamese/iwslt2015-en-vi (download: 30.83 MiB, generated: 31.59 MiB, post-processed: Unknown size, total: 62.42 MiB) to /root/.cache/huggingface/datasets/mt_eng_vietnamese/iwslt2015-en-vi/1.0.0/53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71...


Downloading data:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/140k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/132k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133318 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1269 [00:00<?, ? examples/s]

Dataset mt_eng_vietnamese downloaded and prepared to /root/.cache/huggingface/datasets/mt_eng_vietnamese/iwslt2015-en-vi/1.0.0/53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Pull the three splits out of the loaded corpus under their own names.
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [4]:
# One tokenizer per language, each loaded from the checkpoint named in the
# constants above so that token ids match the corresponding pretrained model.
tokenizer_en = AutoTokenizer.from_pretrained(EN_MODEL)
tokenizer_vi = AutoTokenizer.from_pretrained(VN_MODEL)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [6]:
def _split_language_pairs(split):
    """Return ``(english_sentences, vietnamese_sentences)`` for one split.

    Each sample is expected to expose ``sample['translation']`` with ``'en'``
    and ``'vi'`` entries. The split is walked exactly once, so every sample is
    decoded a single time instead of once per language.
    """
    english, vietnamese = [], []
    for sample in split:
        pair = sample['translation']
        english.append(pair['en'])
        vietnamese.append(pair['vi'])
    return english, vietnamese


# Aligned sentence lists: index i of the English list translates to index i of
# the Vietnamese list.
en_train_data, vi_train_data = _split_language_pairs(train_data)
en_val_data, vi_val_data = _split_language_pairs(valid_data)



In [7]:
class CustomDataset(Dataset):
    """Aligned English/Vietnamese sentence pairs, tokenized lazily on access.

    Args:
        source: Indexable collection of English sentences.
        target: Indexable collection of Vietnamese sentences aligned with
            ``source`` (same index means same sentence pair).
        tokenizer_en: Callable tokenizer applied to the English sentence.
        tokenizer_vi: Callable tokenizer applied to the Vietnamese sentence.
        max_length: Length every sequence is truncated or padded to.
    """

    def __init__(self, source, target, tokenizer_en, tokenizer_vi, max_length=512):
        self.source = source
        self.target = target
        self.tokenizer_en = tokenizer_en
        self.tokenizer_vi = tokenizer_vi
        self.max_length = max_length

    def __len__(self):
        # The dataset length follows the source side.
        return len(self.source)

    def _encode(self, tokenizer, text):
        """Tokenize one sentence and return ``(input_ids, attention_mask)``."""
        encoding = tokenizer(
            text,
            max_length=self.max_length,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # return_tensors="pt" yields a leading batch axis of size 1. Drop only
        # that axis: a bare squeeze() would also collapse the sequence axis
        # when max_length == 1.
        return (
            encoding['input_ids'].squeeze(0),
            encoding['attention_mask'].squeeze(0),
        )

    def __getitem__(self, idx):
        """Return the token ids and attention masks for the pair at ``idx``.

        The masks come from the tokenizer's ``attention_mask`` output
        (1 for real tokens, 0 for padding), not from the token ids.
        """
        ids_en, mask_en = self._encode(self.tokenizer_en, self.source[idx])
        ids_vi, mask_vi = self._encode(self.tokenizer_vi, self.target[idx])

        return {
            'input_ids_en': ids_en,
            'attention_mask_en': mask_en,
            'input_ids_vi': ids_vi,
            'attention_mask_vi': mask_vi,
        }

In [8]:
# Wrap the sentence lists in datasets (sequences padded/truncated to 128 tokens)
# and batch them; only the training loader is shuffled.
batch_size = 32

train_dataset = CustomDataset(
    source=en_train_data,
    target=vi_train_data,
    tokenizer_en=tokenizer_en,
    tokenizer_vi=tokenizer_vi,
    max_length=128,
)
val_dataset = CustomDataset(
    source=en_val_data,
    target=vi_val_data,
    tokenizer_en=tokenizer_en,
    tokenizer_vi=tokenizer_vi,
    max_length=128,
)

train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

# Show one encoded example as a sanity check.
print(train_dataset[0])

{'input_ids_en': tensor([  101,  4858, 14997,   131,  1109,  2598,  1481,   170,  4530, 21188,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     

In [9]:
def build_model():
    """Build an English-to-Vietnamese encoder-decoder model.

    The encoder is initialized from ``EN_MODEL`` and the decoder from
    ``VN_MODEL``. The special-token ids on the combined config are taken from
    the Vietnamese tokenizer, because the decoder produces Vietnamese tokens.

    Returns:
        The assembled ``EncoderDecoderModel`` (not yet moved to any device).
    """
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(EN_MODEL, VN_MODEL)
    config = model.config
    config.decoder_start_token_id = tokenizer_vi.bos_token_id
    config.pad_token_id = tokenizer_vi.pad_token_id
    # Without an EOS id generation has no stop token and keeps decoding until
    # the length limit is reached.
    config.eos_token_id = tokenizer_vi.eos_token_id
    return model



In [18]:
def translate_en_to_vi(sentence: str, model, skip_special_tokens: bool = False):
    """Translate one English sentence to Vietnamese with ``model``.

    Args:
        sentence: English text to translate.
        model: Encoder-decoder model exposing ``generate``. It is switched to
            eval mode as a side effect.
        skip_special_tokens: When True, special tokens are stripped from the
            decoded text. Defaults to False, which keeps them in the output.

    Returns:
        The decoded Vietnamese string, with the leading decoder-start token
        dropped.
    """
    encoded = tokenizer_en([sentence], return_tensors="pt")
    # The model may have been moved to the GPU; its inputs must live on the
    # same device or generate() fails with a device-mismatch error.
    model_device = next(model.parameters()).device
    input_ids = encoded.input_ids.to(model_device)
    attention_mask = encoded.attention_mask.to(model_device)

    model.eval()
    with torch.no_grad():
        output = model.generate(input_ids, attention_mask=attention_mask)
    # Position 0 of the generated sequence is the decoder start token.
    return tokenizer_vi.decode(output[0][1:], skip_special_tokens=skip_special_tokens)

In [11]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [12]:
def train_step(model, optimizer, loader):
    """Train ``model`` for one pass over ``loader``.

    Each batch is a dict with ``input_ids_en``, ``attention_mask_en``,
    ``input_ids_vi`` and ``attention_mask_vi`` tensors. Vietnamese positions
    whose mask is 0 are replaced by -100 in the labels (the index ignored by
    the cross-entropy loss — NOTE(review): confirm against the model's loss).

    Returns:
        The sum of the per-batch losses divided by ``len(loader)``.
    """
    model.train()
    total_loss = 0.0
    progress = tqdm(enumerate(loader), unit="batch", total=len(loader))

    for step, batch in progress:
        # Move the whole batch to the training device.
        src_ids = batch["input_ids_en"].to(device)
        src_mask = batch["attention_mask_en"].to(device)
        tgt_ids = batch["input_ids_vi"].to(device)
        tgt_mask = batch["attention_mask_vi"].to(device)

        # Masked-out target positions must not contribute to the loss.
        labels = tgt_ids.masked_fill(tgt_mask == 0, -100)

        batch_loss = model(input_ids=src_ids, attention_mask=src_mask, labels=labels).loss
        loss_value = batch_loss.item()
        total_loss += loss_value

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Refresh the progress-bar readout only every 20th batch.
        if step % 20 == 0:
            progress.set_postfix(loss=loss_value)

    return total_loss / len(loader)

def val_step(model, loader):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Batches have the same layout as in training: ``input_ids_en``,
    ``attention_mask_en``, ``input_ids_vi`` and ``attention_mask_vi``.
    Vietnamese positions whose mask is 0 are replaced by -100 in the labels.

    Returns:
        The sum of the per-batch losses divided by ``len(loader)``.
    """
    model.eval()
    total_loss = 0.0
    progress = tqdm(enumerate(loader), unit="batch", total=len(loader))

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for _, batch in progress:
            src_ids = batch["input_ids_en"].to(device)
            src_mask = batch["attention_mask_en"].to(device)
            tgt_ids = batch["input_ids_vi"].to(device)
            tgt_mask = batch["attention_mask_vi"].to(device)

            labels = tgt_ids.masked_fill(tgt_mask == 0, -100)

            result = model(input_ids=src_ids, attention_mask=src_mask, labels=labels)
            total_loss += result.loss.item()

    return total_loss / len(loader)



In [13]:
def training_loop(num_epochs, model, optimizer, train_loader, val_loader):
    """Train and validate ``model`` for ``num_epochs`` epochs.

    Every epoch runs one ``train_step`` followed by one ``val_step`` and
    prints both losses. Epochs are reported 1-based ("1/5" … "5/5"), so the
    last printed epoch matches the total.

    Args:
        num_epochs: Number of full passes over ``train_loader``.
        model: Model to train.
        optimizer: Optimizer updating ``model``'s parameters.
        train_loader: Batches used for training.
        val_loader: Batches used for validation.

    Returns:
        A list with one ``(train_loss, val_loss)`` tuple per completed epoch.
        It is empty when ``num_epochs`` is not positive.
    """
    history = []
    for epoch in range(1, num_epochs + 1):
        print(f"Start epoch {epoch}/{num_epochs}")
        train_loss = train_step(model, optimizer, train_loader)
        val_loss = val_step(model, val_loader)
        print(f"End epoch {epoch}/{num_epochs}")
        print(f"Train loss {train_loss}| Val loss {val_loss}")
        print("-"*50)
        # Keep the losses so callers can plot or compare epochs afterwards.
        history.append((train_loss, val_loss))
    return history

In [None]:
# Training configuration: epoch count, the model placed on the selected device,
# and an Adam optimizer over all of its parameters.
num_epochs = 5

model = build_model()
model = model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

In [None]:
# Run the full training/validation schedule using the objects configured above.
training_loop(num_epochs, model, optimizer, train_dataloader, val_dataloader)

In [None]:
# Smoke test: translate a short English prompt with the current model.
translate_en_to_vi('My name is', model)

In [None]:
# Save the model under the relative path "mnt". Only the model is saved here;
# the tokenizers are not written by this call.
model.save_pretrained("mnt")