In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import re
import random
from tqdm import tqdm

In [2]:
from datasets import load_dataset
dataset = load_dataset('mt_eng_vietnamese', 'iwslt2015-en-vi')

Downloading builder script:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading and preparing dataset mt_eng_vietnamese/iwslt2015-en-vi (download: 30.83 MiB, generated: 31.59 MiB, post-processed: Unknown size, total: 62.42 MiB) to /root/.cache/huggingface/datasets/mt_eng_vietnamese/iwslt2015-en-vi/1.0.0/53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71...


Downloading data:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/140k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/132k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133318 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1269 [00:00<?, ? examples/s]

Dataset mt_eng_vietnamese downloaded and prepared to /root/.cache/huggingface/datasets/mt_eng_vietnamese/iwslt2015-en-vi/1.0.0/53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
train_data = dataset['train']
valid_data = dataset['validation']
test_data = dataset['test']

In [4]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, Dataset


tokenizer_en = AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer_vi = AutoTokenizer.from_pretrained('vinai/phobert-base')

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [5]:
en_train_data = [sample['translation']['en'] for sample in train_data]
vi_train_data = [sample['translation']['vi'] for sample in train_data]

en_val_data = [sample['translation']['en'] for sample in valid_data]
vi_val_data = [sample['translation']['vi'] for sample in valid_data]


In [6]:
class CustomDataset(Dataset):
    def __init__(self, source, target, tokenizer_en, tokenizer_vi, max_length=512):
        self.source = source
        self.target = target
        self.tokenizer_en = tokenizer_en
        self.tokenizer_vi = tokenizer_vi
        self.max_length = max_length

    def __len__(self):
        return len(self.source)

    def __getitem__(self, idx):
        en = self.source[idx]
        vi = self.target[idx]
            
        encoding_en = self.tokenizer_en(
            en,
            max_length=self.max_length,
            add_special_tokens = True,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        
        encoding_vi = self.tokenizer_vi(
            vi,
            max_length=self.max_length,
            add_special_tokens = True,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        return {
            'input_ids_en': encoding_en['input_ids'].squeeze(),
            'attention_mask_en': encoding_en['input_ids'].squeeze(),
            'input_ids_vi': encoding_vi['input_ids'].squeeze(),
            'attention_mask_vi': encoding_vi['input_ids'].squeeze(),
        } 

In [7]:
batch_size = 32
train_dataset = CustomDataset(en_train_data, vi_train_data, tokenizer_en, tokenizer_vi, max_length=128)
val_dataset = CustomDataset(en_val_data, vi_val_data, tokenizer_en, tokenizer_vi, max_length=128)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [8]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        # input_size: Kích thước của từ điển đầu vào (số lượng từ trong từ điển).
        # embedding_size: Kích thước của vector nhúng (embedding) cho mỗi từ.
        # hidden_size: Kích thước của các hidden state trong LSTM.
        # num_layers: Số lượng layer trong LSTM.
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p, batch_first=True)
        #  dữ liệu đầu vào sẽ có kích thước là (batch_size, seq_length, input_size)
        #  batch_size: Số lượng các chuỗi trong mỗi batch.
        #  seq_length: Độ dài của mỗi chuỗi.
        #  input_size: Số chiều của mỗi phần tử trong chuỗi (ví dụ: kích thước vector nhúng).
        
    def forward(self, x):
        embedding = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.rnn(embedding)
        
        return hidden, cell
    


In [9]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden, cell):
        # shape of x: (N) nhưng cần (N, 1)
        
        x = x.unsqueeze(1)
        
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (N, 1, embedding_size)
        
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # shape of outputs: (N, 1, hidden_size)
        
        predictions = self.fc(outputs)
        # shape of predictions: (N, 1, length_of_vocab)
        
        predictions = predictions.squeeze(1)
        
        return predictions, hidden, cell
        

In [10]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, source, target, teacher_force_ratio=0.5):
        # source shape: (batch_size, seq_length)
        # target shape: (batch_size, seq_length)
        batch_size = source.shape[0]
        target_len = target.shape[1] 
        target_vocab_size = len(tokenizer_vi)
        
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(device)
        
        hidden, cell = self.encoder(source)
        
        x = target[:, 0] 
        
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            
            outputs[:, t] = output
            
            best_guess = output.argmax(1)
            
            x = target[:, t] if random.random() < teacher_force_ratio else best_guess
            
        return outputs
    
    def generate(self, source, max_length=128):
        # source shape: (batch_size, seq_length)
        hidden, cell = self.encoder(source)
        batch_size = source.shape[0]
        
        outputs = torch.zeros(batch_size, max_length, len(tokenizer_vi)).to(device)

        start_token = torch.tensor([tokenizer_vi.cls_token_id] * batch_size).to(device)
        
        for t in range(max_length):
            output, hidden, cell = self.decoder(
                start_token, hidden, cell
            )
            
            # output shape: (batch_size, vocab_size)

            # Save the output at each time step
            outputs[:, t, :] = output

            # Get the predicted token for the next step
            predicted_token = torch.argmax(output, dim=1)
            
            # Set the predicted token as the input for the next step
            start_token = predicted_token.view(-1)
            
            # Check if the predicted token is the sep token
            if predicted_token.item() == tokenizer_vi.sep_token_id:
                break 

        return torch.argmax(outputs, dim=-1)
        

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(tokenizer_en)
input_size_decoder = len(tokenizer_vi)
output_size = len(tokenizer_vi)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5
num_epochs = 3
learning_rate = 0.001

In [12]:
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)
model = Seq2Seq(encoder_net, decoder_net).to(device)

In [13]:
print(len(train_dataset[0]['input_ids_en']), len(train_dataset[0]['input_ids_vi']) )

128 128


In [14]:
input_ids_en = train_dataset[0]['input_ids_en'].unsqueeze(0).to(device)
input_ids_vi = train_dataset[0]['input_ids_vi'].unsqueeze(0).to(device)

outputs = model(input_ids_en, input_ids_vi)


In [15]:
outputs

tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0434,  0.0336,  0.0311,  ..., -0.0263,  0.0399,  0.0106],
         [ 0.0477,  0.0400,  0.0241,  ..., -0.0044,  0.0236,  0.0081],
         ...,
         [-0.0028,  0.0091,  0.0629,  ..., -0.0063,  0.0239,  0.0330],
         [ 0.0098,  0.0192,  0.0569,  ...,  0.0011, -0.0013,  0.0329],
         [ 0.0031,  0.0121,  0.0533,  ..., -0.0097,  0.0107,  0.0527]]],
       device='cuda:0', grad_fn=<CopySlices>)

In [16]:
outputs.shape

torch.Size([1, 128, 64001])

In [17]:
outputs = model.generate(input_ids_en)
predicted_words = [tokenizer_vi.convert_ids_to_tokens(sentence) for sentence in outputs]

In [18]:
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer_vi.pad_token_id)
optim = optim.Adam(model.parameters(), lr=learning_rate)

In [19]:
def train_step(model, optimizer, loader):
    model.train()
    running_loss = 0.0
    bar = tqdm(enumerate(loader), unit='batch', total=len(loader))
    for i, batch in bar:
        input_ids_en = batch["input_ids_en"].to(device)
        input_ids_vi = batch["input_ids_vi"].to(device)
        
        pred = model(input_ids_en, input_ids_vi)
        # pred shape: (batch_size, seq_length, vocab_size)
        # target shape: (batch_size, seq_length)
        
        reshaped_pred = pred.view(-1, pred.size(2))
        target = input_ids_vi.view(-1)
        
        loss = criterion(reshaped_pred, target)
        running_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 20 == 0:
            bar.set_postfix(loss=loss.item())
    return running_loss / len(loader)

def val_step(model, loader):
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for i, batch in tqdm(enumerate(loader), unit="batch", total=len(loader)):
            input_ids_en = batch["input_ids_en"].to(device)
            input_ids_vi = batch["input_ids_vi"].to(device)

            pred = model(input_ids_en, input_ids_vi)
            
            reshaped_pred = pred.view(-1, pred.size(2))
            target = input_ids_vi.view(-1)
        
            loss = criterion(reshaped_pred, target)
            running_loss += loss.item()

    return running_loss / len(loader)

In [20]:
def training_loop(num_epochs, model, optimizer, train_loader, val_loader):
    for i in range(num_epochs):
        print(f"Start epoch {i}/{num_epochs}")
        train_loss = train_step(model, optimizer, train_loader)
        val_loss = val_step(model, val_loader)
        print(f"End epoch {i}/{num_epochs}")
        print(f"Train loss {train_loss}| Val loss {val_loss}")
        print("-"*50)

In [21]:
training_loop(num_epochs, model, optim, train_dataloader, val_dataloader)

Start epoch 0/3


100%|██████████| 4167/4167 [3:55:22<00:00,  3.39s/batch, loss=5.61]
100%|██████████| 40/40 [00:41<00:00,  1.04s/batch]


End epoch 0/3
Train loss 5.996194736832248| Val loss 5.547330045700074
--------------------------------------------------
Start epoch 1/3


100%|██████████| 4167/4167 [3:55:20<00:00,  3.39s/batch, loss=5.09]
100%|██████████| 40/40 [00:41<00:00,  1.04s/batch]


End epoch 1/3
Train loss 5.38728171571256| Val loss 5.287942028045654
--------------------------------------------------
Start epoch 2/3


100%|██████████| 4167/4167 [3:55:44<00:00,  3.39s/batch, loss=5.42]
100%|██████████| 40/40 [00:41<00:00,  1.04s/batch]

End epoch 2/3
Train loss 5.18315731263258| Val loss 5.140177595615387
--------------------------------------------------





In [None]:
torch.save(model.state_dict(), 'path/to/save/model.pth')