In [None]:
import torch
import torch.nn as nn
import pandas as pd
import spacy
import torchtext
from torch.utils.data import Dataset, DataLoader
import numpy as np

from torch import optim
from torch.optim import Adam

import time
import math

import html

from collections import Counter
from string import punctuation
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from itertools import product

In [None]:
class Cell(nn.Module):
    """Common base class for the recurrent cells defined in this notebook.

    A concrete cell receives the previous cell state together with the
    current input ``x_t`` and produces the current cell state.
    """

    def __init__(self, *args, **kwargs) -> None:
        # Nothing to configure here: all arguments go straight to nn.Module.
        nn.Module.__init__(self, *args, **kwargs)


In [None]:
class LSTMCell(Cell):
    """A single LSTM time step built from explicit per-gate linear layers.

    Every gate has one projection of the input (``linear_x*``) and one of the
    previous hidden state (``linear_h*``). Dropout is applied to each
    projection before the two are summed and passed through the gate
    non-linearity.

    Args:
        input_size: width of the input vector ``x_t``.
        hidden_size: width of the hidden and cell states.
        bias: whether the linear projections carry a bias term.
        drop_prob: dropout probability applied to every projection.
        device: stored on the instance as ``self.device``; default states are
            created on the device of the incoming tensor instead.
    """

    def __init__(self,input_size, hidden_size, bias, drop_prob, device):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Forget gate.
        self.linear_xf = nn.Linear(input_size, hidden_size, bias)
        self.linear_hf = nn.Linear(hidden_size, hidden_size, bias)

        # Candidate cell values.
        self.linear_xg = nn.Linear(input_size, hidden_size, bias)
        self.linear_hg = nn.Linear(hidden_size, hidden_size, bias)

        # Input gate.
        self.linear_xi = nn.Linear(input_size, hidden_size, bias)
        self.linear_hi = nn.Linear(hidden_size, hidden_size, bias)

        # Output gate.
        self.linear_xo = nn.Linear(input_size, hidden_size, bias)
        self.linear_ho = nn.Linear(hidden_size, hidden_size, bias)

        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

        self.drop_out = nn.Dropout(drop_prob)
        self.device = device

    def _pre_activation(self, linear_h, linear_x, pre_ht, xt):
        """Sum the dropped-out hidden and input projections of one gate."""
        return self.drop_out(linear_h(pre_ht)) + self.drop_out(linear_x(xt))

    def forward(self, xt, pre_ct = None, pre_ht = None):
        """Advance the cell by one time step.

        Args:
            xt: input at this step, ``(batch_size, input_size)``.
            pre_ct: previous cell state ``c_{t-1}``, ``(batch_size, hidden_size)``;
                zeros when omitted.
            pre_ht: previous hidden state ``h_{t-1}``, same shape; zeros when
                omitted.

        Returns:
            ``(ct, ht)`` - the new cell state first, then the new hidden
            state, each ``(batch_size, hidden_size)``.
        """
        # `is None` rather than `== None`: comparing a tensor with `==` is an
        # element-wise operation, not an identity test.
        # new_zeros follows xt's device and dtype, so the defaults stay
        # consistent with the input even after the module has been moved.
        if pre_ht is None:
            pre_ht = xt.new_zeros(xt.size(0), self.hidden_size)
        if pre_ct is None:
            pre_ct = xt.new_zeros(xt.size(0), self.hidden_size)

        # Gate order (f, g, i, o) is kept so dropout masks are drawn in the
        # same sequence as before.
        ft = self.sigmoid(self._pre_activation(self.linear_hf, self.linear_xf, pre_ht, xt))
        gt = self.tanh(self._pre_activation(self.linear_hg, self.linear_xg, pre_ht, xt))
        it = self.sigmoid(self._pre_activation(self.linear_hi, self.linear_xi, pre_ht, xt))

        # Keep part of the old memory (forget gate) and add the gated candidate.
        ct = gt * it + pre_ct * ft

        ot = self.sigmoid(self._pre_activation(self.linear_ho, self.linear_xo, pre_ht, xt))
        ht = ot * self.tanh(ct)

        return ct, ht


In [None]:
class LayerLSTM(nn.Module):
    """One LSTM layer, optionally bidirectional, unrolled over time with LSTMCell.

    Args:
        input_size: feature width of each time step of the input.
        hidden_size: feature width of the layer output. When ``bidirectional``
            is true each direction uses ``hidden_size / 2`` units and the two
            directions are concatenated, so the output width is unchanged.
        drop_prob: dropout probability forwarded to the cells.
        device: stored as ``self.device`` and forwarded to the cells.
        bidirectional: also run a second cell from the last step to the first.
        bias: whether the cells' linear layers use a bias.

    Raises:
        ValueError: if ``bidirectional`` is true and ``hidden_size`` is odd.
    """

    def __init__(self, input_size, hidden_size, drop_prob, device, bidirectional = True, bias = True) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.bias = bias
        self.device = device

        if bidirectional:
            if hidden_size % 2 != 0:
                raise ValueError("When bidirectional is True, hidden size must be even")
            # From here on self.hidden_size is the per-direction width.
            self.hidden_size = int(self.hidden_size / 2)
            self.cellback = LSTMCell(self.input_size, self.hidden_size, self.bias, drop_prob, device)

        self.cell = LSTMCell(self.input_size, self.hidden_size, self.bias, drop_prob, device)

    @staticmethod
    def _scan(cell, X, time_steps, c0, h0):
        """Run ``cell`` over ``time_steps`` of ``X``; return hidden states in visit order."""
        c_t, h_t = c0, h0
        hidden_states = []
        for t in time_steps:
            c_t, h_t = cell(X[:, t, :], c_t, h_t)
            hidden_states.append(h_t)
        return hidden_states

    def forward(self, X, c = None):
        """Encode a batch of sequences.

        Args:
            X: ``(batch_size, N, input_size)`` input sequence.
            c: optional ``(batch_size, self.hidden_size)`` tensor used as BOTH
                the initial cell state and the initial hidden state of every
                direction; zeros when omitted.

        Returns:
            Hidden states for every time step, ``(batch_size, N, width)`` where
            ``width`` is ``self.hidden_size`` (unidirectional) or twice that
            (forward states first, backward states second).
        """
        batch_size, seq_len = X.size(0), X.size(1)

        if c is None:
            c0 = X.new_zeros(batch_size, self.hidden_size)
            h0 = X.new_zeros(batch_size, self.hidden_size)
        else:
            c0 = c
            h0 = c

        forward_h = self._scan(self.cell, X, range(seq_len), c0, h0)
        if not self.bidirectional:
            return torch.stack(forward_h, dim = 1)

        # The backward pass starts at the last step from the same initial
        # state. The previous loop tested `t == 0` while t ran from -1
        # downwards, so that branch never fired and a supplied `c` was ignored.
        backward_h = self._scan(self.cellback, X, range(seq_len - 1, -1, -1), c0, h0)
        backward_h.reverse()  # back to chronological order

        hidden_forward = torch.stack(forward_h, dim = 1)
        hidden_backward = torch.stack(backward_h, dim = 1)
        return torch.cat((hidden_forward, hidden_backward), -1)
        

In [None]:
class Encoder(nn.Module):
    """Embeds source token ids and summarises them with stacked LayerLSTM layers.

    Args:
        input_size: embedding dimension.
        hidden_size: output width of every LSTM layer and of the returned summary.
        n_layers: total number of LSTM layers (``layer_0`` plus ``n_layers - 1`` more).
        drop_prob: dropout probability forwarded to the layers.
        vocab_size: number of rows in the embedding table.
        device: forwarded to the layers.
        bidirectional: whether every layer is bidirectional.
        bias: whether the layers' linear projections use a bias.
    """

    def __init__(self, input_size, hidden_size, n_layers, drop_prob, vocab_size, device, bidirectional = True, bias = True) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.embedd = nn.Embedding(vocab_size, input_size)
        self.device = device

        # Number of directions; not read elsewhere in this class.
        self.D = 2 if bidirectional else 1

        self.layer_0 = LayerLSTM(self.input_size, self.hidden_size, drop_prob, device,bidirectional, bias)

        self.layers = nn.ModuleList(
            [LayerLSTM(
                self.hidden_size, self.hidden_size, drop_prob, device ,bidirectional, bias
            ) for i in range(self.n_layers - 1)]
        )

    def forward(self, X):
        """Return one summary vector per sequence.

        Args:
            X: ``(batch_size, N)`` tensor of source token ids.

        Returns:
            ``(batch_size, hidden_size)`` tensor. For a bidirectional encoder it
            is the forward state at the last position concatenated with the
            backward state at the first position, i.e. the state of each
            direction after it has read the whole sequence.
        """
        out = self.layer_0(self.embedd(X))
        for layer in self.layers:
            out = layer(out)

        if not self.bidirectional:
            return out[:, -1, :]

        # LayerLSTM lays out [forward | backward] halves along the last axis.
        # The backward half at position -1 has only seen the final token, so
        # its summary must be read at position 0.
        # NOTE(review): sequences are padded on the right by the collate
        # function, so both summaries still include padding steps.
        half = out.size(-1) // 2
        return torch.cat((out[:, -1, :half], out[:, 0, half:]), dim = -1)
        

In [None]:
class Decoder(nn.Module):
    """Teacher-forced decoder: stacked unidirectional LayerLSTM layers plus a vocabulary projection.

    Args:
        input_size: embedding dimension.
        hidden_size: width of every LSTM layer (must match the context vector).
        n_layers: total number of LSTM layers.
        drop_prob: dropout probability for the layers and the pre-projection dropout.
        vocab_size: target vocabulary size (embedding rows and output logits).
        device: forwarded to the layers. This parameter used to be misspelled
            ``deivce``, which made the body silently fall back to a global
            variable called ``device``.
        bias: whether linear projections use a bias.
    """

    def __init__(self, input_size, hidden_size, n_layers, drop_prob, vocab_size, device, bias = True) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.embedd = nn.Embedding(vocab_size, input_size)
        self.device = device

        self.layer_0 = LayerLSTM(self.input_size, self.hidden_size, drop_prob, device,False, bias)

        self.layers = nn.ModuleList(
            [LayerLSTM(
                self.hidden_size, self.hidden_size, drop_prob, device,False, bias
            ) for i in range(self.n_layers - 1)]
        )
        self.output = nn.Linear(self.hidden_size, vocab_size, bias)
        self.drop_out = nn.Dropout(drop_prob)

    def forward(self, X, c):
        """Compute next-token logits for every position of ``X``.

        Args:
            X: ``(batch_size, N)`` tensor of target token ids (decoder input).
            c: ``(batch_size, hidden_size)`` context vector; every layer starts
                from it as both initial cell and hidden state.

        Returns:
            ``(batch_size, N, vocab_size)`` tensor of unnormalised logits.
        """
        out = self.layer_0(self.embedd(X), c)
        for layer in self.layers:
            out = layer(out, c)

        # Regularise the features, not the logits: dropout on logits randomly
        # zeroes and rescales vocabulary scores and so distorts the softmax
        # seen by the loss.
        return self.output(self.drop_out(out))
        

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper: a bidirectional Encoder feeding a Decoder.

    Both halves share ``input_size``, ``hidden_size``, ``n_layers``,
    ``drop_prob``, ``device`` and ``bias``; ``en_vocab_size`` sizes the encoder
    vocabulary and ``de_vocab_size`` the decoder vocabulary.
    """

    def __init__(self, input_size, hidden_size, n_layers, drop_prob, en_vocab_size, de_vocab_size, device,bias = True):
        super().__init__()
        shared = (input_size, hidden_size, n_layers, drop_prob)
        # The encoder is always built bidirectional.
        self.encoder = Encoder(*shared, en_vocab_size, device, True, bias)
        self.decoder = Decoder(*shared, de_vocab_size, device, bias)

    def forward(self, src, trg):
        """Encode ``src`` and decode ``trg`` conditioned on the encoder output."""
        context = self.encoder(src)
        return self.decoder(trg, context)
            

In [None]:

# Compute device: the first CUDA GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Adam optimiser settings.
init_lr, adam_eps, weight_decay = 1e-5, 5e-9, 5e-4

# ReduceLROnPlateau settings.
factor, patience = 0.9, 10

# Training-loop settings.
warmup = 100   # the scheduler is only stepped once the epoch index exceeds this
epoch = 500    # total number of epochs passed to run()
clip = 1.0     # max gradient norm handed to clip_grad_norm_

# Starting value for the best validation loss.
inf = float('inf')

In [None]:
def choice_sample_p(data, size):
    """Return a reproducible random subset of ``data`` in its original order.

    Args:
        data: sequence (anything ``np.array`` accepts) to sample from.
        size: number of elements to draw, without replacement.

    Returns:
        ``np.ndarray`` holding the ``size`` selected elements, ordered as they
        appear in ``data``.

    The draw uses a private generator seeded with 2206, so it yields the same
    subset as before but no longer reseeds NumPy's global RNG as a side effect
    (which made every later ``np.random`` call deterministic).
    """
    rng = np.random.RandomState(2206)
    picked = rng.choice(np.arange(len(data)), size = size, replace = False)
    # Sorting the indices restores the original ordering of the elements.
    return np.array(data)[np.sort(picked)]

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as ints, both truncated towards zero.
    """
    delta = end_time - start_time
    minutes = int(delta / 60)
    seconds = int(delta - minutes * 60)
    return minutes, seconds

def get_bleu(hypotheses, reference):
    """Sentence-level BLEU (scaled to 0-100) of one hypothesis against one reference.

    Args:
        hypotheses: token list of the candidate sentence.
        reference: token list of the single reference sentence.

    Returns:
        ``sentence_bleu`` with NLTK's ``method1`` smoothing, multiplied by 100.
    """
    score = sentence_bleu(
        [reference],
        hypotheses,
        smoothing_function=SmoothingFunction().method1,
    )
    return score * 100

def idx_to_word(x, vocab):
    """Turn a sequence of token ids into a space-separated sentence.

    Args:
        x: iterable of integer token ids (list or 1-D tensor).
        vocab: object exposing ``get_itos()`` that returns the id -> token list.

    Returns:
        The tokens joined by single spaces. Every token that contains ``'<'``
        is skipped, which removes special symbols such as ``<sos>``/``<pad>``.
    """
    # Fetch the lookup table once instead of once per token.
    itos = vocab.get_itos()
    return " ".join(word for word in (itos[i] for i in x) if '<' not in word)

In [None]:
# Special vocabulary symbols.
sos_token, eos_token = "<sos>", "<eos>"
pad_token, unk_token = "<pad>", "<unk>"

# Preprocessing and batching limits.
max_length = 40    # cap on tokens per sentence, <sos>/<eos> included
min_freq = 2       # minimum token frequency used when the vocabularies are built
batch_size = 128

def load_file(file_path):
    """Read a corpus file (one sentence per line) into a list of cleaned sentences.

    Each line is split on whitespace, tokens that match ``token in punctuation``
    are dropped, and the remaining tokens are re-joined with single spaces.
    Blank lines inside the file are kept (as ``''``) so that parallel files
    stay aligned line by line.

    Args:
        file_path: path of a UTF-8 text file.

    Returns:
        List of cleaned sentences, one per line of the file.
    """
    # The corpora contain Vietnamese text: decode explicitly as UTF-8 instead
    # of relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.read().split('\n')

    # A final newline leaves one empty trailing entry that is not a sentence.
    if lines and lines[-1] == '':
        lines.pop()

    # NOTE(review): `punctuation` is a string, so this is a substring test -
    # single punctuation marks are removed, but so is any multi-character
    # token that happens to occur inside the string (e.g. "()").
    return [
        ' '.join(token for token in line.split() if token not in punctuation)
        for line in lines
    ]

def tokenizer(sentences, lower, eos_token, sos_token, max_length):
    """Tokenise one sentence on single spaces and wrap it in start/end symbols.

    Args:
        sentences: the sentence text (a single string).
        lower: lower-case the text before HTML entities are unescaped.
        eos_token: symbol appended after the words.
        sos_token: symbol placed before the words.
        max_length: maximum length of the result including both symbols; the
            word list is cut to ``max_length - 2`` entries.

    Returns:
        ``[sos_token, word_1, ..., word_k, eos_token]``.
    """
    text = sentences.lower() if lower else sentences
    words = html.unescape(text).split(' ')
    kept = words[:max_length - 2]
    return [sos_token, *kept, eos_token]

def _tokenize_all(sentences):
    """Tokenise every sentence (lower-cased) with the notebook-wide token settings."""
    return [tokenizer(sent, True, eos_token, sos_token, max_length) for sent in sentences]


def _encode_all(vocab, token_lists):
    """Map every token list to its list of vocabulary ids."""
    return [vocab.lookup_indices(toks) for toks in token_lists]


# --- Training split -------------------------------------------------------
train_en_file = "/kaggle/input/data-machine-translation/data/train/train.en"
train_vi_file = "/kaggle/input/data-machine-translation/data/train/train.vi"

train_en = load_file(train_en_file)
train_vi = load_file(train_vi_file)

train_en_tokens = _tokenize_all(train_en)
train_vi_tokens = _tokenize_all(train_vi)

# --- Test split -----------------------------------------------------------
# NOTE(review): the test and dev files are also named train.en / train.vi -
# confirm that this matches the dataset layout.
test_en_file = "/kaggle/input/data-machine-translation/data/test/train.en"
test_vi_file = "/kaggle/input/data-machine-translation/data/test/train.vi"

test_en = load_file(test_en_file)
test_vi = load_file(test_vi_file)

test_en_tokens = _tokenize_all(test_en)
test_vi_tokens = _tokenize_all(test_vi)

# --- Dev split ------------------------------------------------------------
dev_en_file = "/kaggle/input/data-machine-translation/data/dev/train.en"
dev_vi_file = "/kaggle/input/data-machine-translation/data/dev/train.vi"

dev_en = load_file(dev_en_file)
dev_vi = load_file(dev_vi_file)

dev_en_tokens = _tokenize_all(dev_en)
dev_vi_tokens = _tokenize_all(dev_vi)

# --- Vocabularies (built from the training split only) --------------------
special_tokens = [unk_token, pad_token, sos_token, eos_token]

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_en_tokens, min_freq=min_freq, specials=special_tokens
)
vi_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_vi_tokens, min_freq=min_freq, specials=special_tokens
)

# Both vocabularies must agree on the ids of <unk> and <pad>, because a single
# unk_index / pad_index is shared by the source and the target side below.
assert en_vocab[unk_token] == vi_vocab[unk_token]
assert en_vocab[pad_token] == vi_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

# Out-of-vocabulary tokens are mapped to <unk>.
for _vocab in (en_vocab, vi_vocab):
    _vocab.set_default_index(unk_index)

en_vocab_size = len(en_vocab)
vi_vocab_size = len(vi_vocab)

# --- Token lists -> id lists ----------------------------------------------
train_en_ids = _encode_all(en_vocab, train_en_tokens)
train_vi_ids = _encode_all(vi_vocab, train_vi_tokens)

test_en_ids = _encode_all(en_vocab, test_en_tokens)
test_vi_ids = _encode_all(vi_vocab, test_vi_tokens)

dev_en_ids = _encode_all(en_vocab, dev_en_tokens)
dev_vi_ids = _encode_all(vi_vocab, dev_vi_tokens)

class Bitext(Dataset):
    """Parallel corpus: item ``i`` is the pair ``(src[i], trg[i])``."""

    def __init__(self, src, trg) -> None:
        super().__init__()
        self.src, self.trg = src, trg

    def __len__(self):
        # The length is taken from the source side only.
        return len(self.src)

    def __getitem__(self, index):
        pair = (self.src[index], self.trg[index])
        return pair
    
# One Bitext per split: English ids are the source, Vietnamese ids the target.
train_dataset = Bitext(src=train_en_ids, trg=train_vi_ids)
dev_dataset = Bitext(src=dev_en_ids, trg=dev_vi_ids)
test_dataset = Bitext(src=test_en_ids, trg=test_vi_ids)


In [None]:
def get_collate_fn(pad_index):
    """Build a DataLoader ``collate_fn`` that right-pads id sequences with ``pad_index``.

    The returned function takes a batch of ``(src_ids, trg_ids)`` pairs and
    returns two ``LongTensor`` matrices ``(batch, longest_src)`` and
    ``(batch, longest_trg)``.
    """
    def _pad(sequences):
        as_tensors = [torch.LongTensor(seq) for seq in sequences]
        return nn.utils.rnn.pad_sequence(
            as_tensors, batch_first=True, padding_value=pad_index
        )

    def collate_fn(batch):
        sources, targets = zip(*batch)
        return _pad(sources), _pad(targets)

    return collate_fn

def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a DataLoader whose batches are padded with ``pad_index``.

    Args:
        dataset: dataset yielding ``(src_ids, trg_ids)`` pairs.
        batch_size: number of pairs per batch.
        pad_index: id used to pad shorter sequences.
        shuffle: whether the loader shuffles the data.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
    )

# Training batches are reshuffled on every pass over the data.
train_dataloader = get_data_loader(
    train_dataset, batch_size, pad_index, True
)

# Evaluation loaders keep the dataset order. evaluate() averages per-batch
# BLEU means, so with shuffling the reported score depends on which sentences
# fall into the (smaller) last batch, and the translation demo at the end of
# the notebook would show a different sentence on every run.
test_dataloader = get_data_loader(
    test_dataset, batch_size, pad_index, False
)

dev_dataloader = get_data_loader(
    dev_dataset, batch_size, pad_index, False
)



In [None]:
def count_parameters(model):
    """Return the total number of elements over all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


def initialize_weights(m):
    """Kaiming-uniform initialise the ``weight`` of a module, in place.

    Intended for ``model.apply(initialize_weights)``. Only weights with more
    than one dimension are touched; modules without a ``weight`` attribute, or
    whose ``weight`` is ``None``, are left alone. Biases are not modified.
    """
    weight = getattr(m, 'weight', None)
    if weight is not None and weight.dim() > 1:
        # In-place variant: the un-suffixed nn.init.kaiming_uniform is deprecated.
        nn.init.kaiming_uniform_(weight)

In [None]:
# Model hyper-parameters.
input_size = 128    # embedding dimension
hidden_size = 256   # LSTM layer width
n_layers = 2        # note from the author: to be changed to 4
drop_prob = 0.1

# English is the source (encoder) vocabulary, Vietnamese the target (decoder).
model = Seq2Seq(
    input_size,
    hidden_size,
    n_layers,
    drop_prob,
    en_vocab_size,
    vi_vocab_size,
    device,
)
model = model.to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
# Re-initialise every multi-dimensional weight, then create the optimiser over
# the freshly initialised parameters.
model.apply(initialize_weights)
optimizer = Adam(
    model.parameters(),
    lr=init_lr,
    eps=adam_eps,
    weight_decay=weight_decay,
)

In [None]:
# Learning-rate schedule driven by the value passed to scheduler.step():
# the rate is multiplied by `factor` after `patience` steps without improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=factor,
    patience=patience,
    verbose=True,
)

# Targets equal to pad_index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [None]:
def train(model, iterator, optimizer, criterion, clip, device):
    """Run one teacher-forced training pass over ``iterator``.

    For every ``(src, trg)`` batch the model is fed ``trg`` without its last
    token and is trained to predict ``trg`` without its first token.
    Gradients are clipped to a total norm of ``clip`` before each optimiser
    step, and the progress percentage and batch loss are printed per batch.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    n_batches = len(iterator)
    running_loss = 0
    for step, (src, trg) in enumerate(iterator):
        src, trg = src.to(device), trg.to(device)
        decoder_input = trg[:, :-1]
        gold = trg[:, 1:].contiguous().view(-1)

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        # Flatten (batch, time, vocab) -> (batch * time, vocab) for the loss.
        flat_logits = logits.contiguous().view(-1, logits.shape[-1])

        loss = criterion(flat_logits, gold)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        print('step :', round((step / n_batches) * 100, 2), '% , loss :', batch_loss)

    return running_loss / n_batches

In [None]:
def evaluate(model, iterator, criterion, device):
    """Compute the teacher-forced loss and BLEU of ``model`` over ``iterator``.

    Uses the module-level ``vi_vocab`` to turn ids back into words.

    Args:
        model: callable as ``model(src, trg_input)`` returning
            ``(batch, time, vocab)`` logits.
        iterator: yields ``(src, trg)`` id tensors.
        criterion: loss applied to flattened logits and targets.
        device: device the batches are moved to.

    Returns:
        ``(mean_loss, bleu)`` where ``mean_loss`` is the mean of the per-batch
        losses and ``bleu`` is the mean over batches of each batch's mean
        sentence BLEU. ``bleu`` is NaN if a batch produced no score at all.
    """
    import warnings  # local import so the cell does not depend on the import cell

    model.eval()
    epoch_loss = 0
    batch_bleu = []
    failed, last_error = 0, None
    with torch.no_grad():
        for src, trg in iterator:
            src = src.to(device)
            trg = trg.to(device)

            output = model(src, trg[:, :-1])
            output_reshape = output.contiguous().view(-1, output.shape[-1])
            loss = criterion(output_reshape, trg[:, 1:].contiguous().view(-1))
            epoch_loss += loss.item()

            # Plain Python ids: one transfer per batch instead of one per token.
            # The reference keeps its leading <sos>; idx_to_word drops it along
            # with every other token containing '<'.
            # NOTE(review): predictions at padded target positions are still
            # part of the hypothesis; consider truncating at the reference length.
            references = trg.tolist()
            predictions = output.argmax(-1).tolist()

            total_bleu = []
            for reference_ids, predicted_ids in zip(references, predictions):
                try:
                    reference = idx_to_word(reference_ids, vi_vocab).split()
                    hypothesis = idx_to_word(predicted_ids, vi_vocab).split()
                    total_bleu.append(get_bleu(hypotheses=hypothesis, reference=reference))
                except Exception as exc:
                    # Scoring stays best-effort, but failures are counted and
                    # reported below instead of being silently discarded.
                    failed += 1
                    last_error = exc

            batch_bleu.append(np.mean(total_bleu) if total_bleu else float('nan'))

    if failed:
        warnings.warn(
            f"BLEU could not be computed for {failed} sentence(s); last error: {last_error!r}",
            RuntimeWarning,
        )

    batch_bleu = np.mean(batch_bleu)
    return epoch_loss / len(iterator), batch_bleu

In [None]:
# from util.util import *
def _write_text(path, value):
    """Overwrite ``path`` with ``str(value)``; the file is closed even if writing fails."""
    with open(path, 'w') as handle:
        handle.write(str(value))


def run(model, total_epoch, best_loss, device):
    """Train for ``total_epoch`` epochs, logging metrics, then score the test set.

    Relies on the module-level ``train_dataloader``, ``dev_dataloader``,
    ``test_dataloader``, ``optimizer``, ``criterion``, ``scheduler``, ``clip``
    and ``warmup``.

    Per epoch: one training pass, one evaluation on the dev set, a scheduler
    step once the epoch index exceeds ``warmup``, a checkpoint every 20th
    epoch, and a rewrite of the train-loss / BLEU / valid-loss history files
    (kept for plotting). After the last epoch the test loss and test BLEU are
    printed and written to their own files.

    Args:
        model: the model to train.
        total_epoch: number of epochs.
        best_loss: starting value for the best validation loss.
        device: device the batches are moved to.
    """
    train_losses, test_losses, bleus = [], [], []
    for step in range(total_epoch):
        start_time = time.time()
        train_loss = train(model, train_dataloader, optimizer, criterion, clip, device)
        # Dev-set loss and BLEU are computed after every epoch.
        valid_loss, bleu = evaluate(model, dev_dataloader, criterion, device)
        end_time = time.time()

        if step > warmup:
            scheduler.step(valid_loss)

        train_losses.append(train_loss)
        test_losses.append(valid_loss)
        bleus.append(bleu)
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # The best validation loss is tracked, but no best-model checkpoint is
        # written; only the periodic checkpoint below is saved.
        if valid_loss < best_loss:
            best_loss = valid_loss
        if step % 20 == 0:
            torch.save(model.state_dict(), '/kaggle/working/model-{0}.pt'.format(step))

        # Histories are rewritten in full each epoch so they survive an interruption.
        _write_text('/kaggle/working/train_loss.txt', train_losses)
        _write_text('/kaggle/working/bleu.txt', bleus)
        _write_text('/kaggle/working/valid_loss.txt', test_losses)

        print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\tVal Loss: {valid_loss:.3f} |  Val PPL: {math.exp(valid_loss):7.3f}')
        print(f'\tBLEU Score: {bleu:.3f}')

    # Final loss and BLEU on the test set.
    test_loss, bleu_test = evaluate(model, test_dataloader, criterion, device)
    print(f"\tTest Loss: {test_loss:.3f} | Bleu Score of Test: {bleu_test:.3f}")
    _write_text('/kaggle/working/result_test_loss.txt', test_loss)
    _write_text('/kaggle/working/result_Bleu_Test.txt', bleu_test)


In [None]:
# Full training run followed by a save of the final weights.
# (For a quick smoke test, set `epoch = 2` before running this cell.)
run(model, epoch, inf, device)
torch.save(model.state_dict(), '/kaggle/working/model-official.pt')

In [None]:
# Translation demo: show the model output for the first sentence of the first
# test batch. The decoder is teacher-forced here (it receives trg[:, :-1]), so
# this is a next-token prediction, not free-running decoding.
model.eval()  # switch dropout off so the printed output is deterministic
src, trg = next(iter(test_dataloader))
src = src.to(device)
trg = trg.to(device)
with torch.no_grad():  # inference only: no autograd graph needed
    output = model(src, trg[:, :-1])

trg_words = idx_to_word(trg[0], vi_vocab)
output_words = idx_to_word(output[0].argmax(1), vi_vocab)
print(f"Bản gốc: {trg_words} \n Bản dịch: {output_words}")