In [None]:
import torch
import torch.nn as nn
import pandas as pd
import spacy
import torchtext
from torch.utils.data import Dataset, DataLoader
import numpy as np

from torch import optim
from torch.optim import Adam

import time
import math

import html

from collections import Counter
from string import punctuation
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from itertools import product

# Embeddings

## Positional Embeddings

In [None]:
class PositionalEmbedding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional encoding table.

    Column ``2k`` holds ``sin(pos / 10000 ** (2k / d))`` and column ``2k + 1``
    holds ``cos(pos / 10000 ** (2k / d))`` for every position ``pos`` in
    ``[0, max_len)``.

    Args:
        max_len: number of positions to precompute.
        embedding_dim: width ``d`` of each positional vector.
        device: device on which the table is initially created.
    """

    def __init__(self, max_len, embedding_dim, device):
        super().__init__()

        self.d = embedding_dim

        pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim=1)
        i_even = torch.arange(0, self.d, step=2, device=device).float()
        i_odd = torch.arange(1, self.d, step=2, device=device).float()

        table = torch.zeros(max_len, self.d, device=device)
        table[:, 0::2] = torch.sin(pos / (10000 ** (i_even / self.d)))
        table[:, 1::2] = torch.cos(pos / (10000 ** ((i_odd - 1) / self.d)))

        # Registered as a buffer so that `.to()`, `.cuda()`, `.double()` etc.
        # move/cast the table together with the rest of the model.
        # `persistent=False` keeps it out of the state_dict, so checkpoints
        # keep exactly the same keys as before.
        self.register_buffer("embedd", table, persistent=False)

    def forward(self, X):
        """Return the positional vectors for the first ``N`` positions.

        Args:
            X: tensor of shape ``batch_size * N``; only its second dimension
                (the sequence length) is read.

        Returns:
            Tensor of shape ``N * embedding_dim``.

        Raises:
            ValueError: if ``N`` exceeds the number of precomputed positions.
        """
        _, seq_len = X.size()

        if seq_len > self.embedd.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds the positional table size {self.embedd.size(0)}"
            )

        return self.embedd[:seq_len, :]  # N * d


## Token Embedding

In [None]:
class TokenEmbedding(nn.Embedding):
    """Learned token embedding whose padding row is kept at zero.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        padding_idx: index of the padding token. Defaults to 1, the value that
            was previously hard-coded here.
    """

    def __init__(self, vocab_size, embedding_dim, padding_idx=1):
        super().__init__(vocab_size, embedding_dim, padding_idx=padding_idx)

## Transformer Embedding

In [None]:
class TransformerEmbedding(nn.Module):
    """Token embedding plus sinusoidal position encoding, followed by dropout."""

    def __init__(self, max_len, vocab_size, embedding_dim, drop_prob, device) -> None:
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, embedding_dim)
        self.pos_emb = PositionalEmbedding(max_len, embedding_dim, device)
        self.drop_out = nn.Dropout(drop_prob)

    def forward(self, X):
        """Embed the token ids in ``X`` (``batch_size * N``) and add position vectors."""
        token_vectors = self.tok_emb(X)
        # The positional table only depends on the sequence length of X and is
        # broadcast over the batch dimension by the addition below.
        position_vectors = self.pos_emb(X)
        combined = token_vectors + position_vectors
        return self.drop_out(combined)


# Layers

## FeedForwardNetwork

In [None]:

class FeedForwardNetwork(nn.Module):
    """Two-layer position-wise MLP: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: size of the last dimension of the input.
        hidden_dim: width of the hidden layer.
        output_dim: size of the last dimension of the output.
        drop_out: dropout probability applied to the hidden activations.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, drop_out = 0.3):
        super().__init__()
        self.input_dim, self.hidden_dim, self.output_dim = input_dim, hidden_dim, output_dim

        self.activation = nn.ReLU()
        self.drop_out = nn.Dropout(drop_out)

        self.input_layer = nn.Linear(input_dim, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, X):
        """Apply the MLP to the last dimension of ``X``."""
        hidden = self.input_layer(X)
        hidden = self.activation(hidden)
        hidden = self.drop_out(hidden)
        return self.output_layer(hidden)

## Multihead Attention

In [None]:

class MultiheadAttention(nn.Module):
    """Multi-head scaled dot-product attention with one projection set per head.

    Args:
        num_head: number of attention heads.
        embedding_dim: width ``d`` of the inputs and of the output.
        dim_key: per-head query/key width.
        dim_value: per-head value width.
        device: kept for interface compatibility and stored on ``self.device``;
            masks are now created on the device of the input instead.
        masked: when True, position ``i`` may only attend to key positions
            ``<= i`` (upper-triangular ``-inf`` mask).
    """

    def __init__(self,num_head, embedding_dim, dim_key, dim_value, device, masked = True):
        super().__init__()
        self.h = num_head
        self.d = embedding_dim
        self.d_k = dim_key
        self.d_v = dim_value
        self.masked = masked
        self.device = device

        # All query projections are created first, then keys, then values, so
        # parameter creation order (and therefore seeded initialisation) is stable.
        self.hWQs = nn.ModuleList([nn.Linear(self.d, self.d_k) for _ in range(self.h)])
        self.hWKs = nn.ModuleList([nn.Linear(self.d, self.d_k) for _ in range(self.h)])
        self.hWVs = nn.ModuleList([nn.Linear(self.d, self.d_v) for _ in range(self.h)])

        self.WO = nn.Linear(self.h * self.d_v, self.d)

    def forward(self, X1, X2 = None, X3 = None):
        """Attend from ``X1`` (queries) over ``X2`` (keys) and ``X3`` (values).

        Args:
            X1: query source, ``batch_size * N_q * d``.
            X2: key source, ``batch_size * N_k * d``. Defaults to ``X1``.
            X3: value source, ``batch_size * N_k * d``. Defaults to ``X2``.

        Returns:
            Tensor of shape ``batch_size * N_q * d``.
        """
        if X2 is None:
            X2 = X1
        if X3 is None:
            X3 = X2

        # The mask is identical for every head and every batch element, so it
        # is built once per call as an (N_q, N_k) matrix and broadcast.
        causal_mask = None
        if self.masked:
            causal_mask = torch.triu(
                torch.full(
                    (X1.size(-2), X2.size(-2)),
                    float('-inf'),
                    device=X1.device,
                    dtype=X1.dtype,
                ),
                diagonal=1,
            )

        scale = math.sqrt(self.d_k)

        heads = []
        for w_q, w_k, w_v in zip(self.hWQs, self.hWKs, self.hWVs):
            scores = torch.matmul(w_q(X1), w_k(X2).transpose(-1, -2)) / scale
            if causal_mask is not None:
                # Adding -inf before the softmax zeroes the masked weights.
                scores = scores + causal_mask
            heads.append(torch.matmul(torch.softmax(scores, dim=-1), w_v(X3)))

        return self.WO(torch.cat(heads, dim=-1))

## Norm

In [None]:
class Norm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift.

    Args:
        embedding_dim: size of the normalised (last) dimension.
        eps: constant added to the variance before the square root.
    """

    def __init__(self, embedding_dim, eps = 1e-5):
        super().__init__()
        self.d = embedding_dim
        self.gamma = nn.Parameter(torch.ones(embedding_dim))
        self.beta = nn.Parameter(torch.zeros(embedding_dim))
        self.eps = eps

    def forward(self, X):
        """Normalise each vector along the last axis of ``X``, then scale and shift."""
        centered = X - X.mean(dim=-1, keepdim=True)
        # Biased (population) variance.
        variance = X.var(dim=-1, unbiased=False, keepdim=True)
        std = (variance + self.eps).sqrt()
        return self.gamma * (centered / std) + self.beta

# Block

## Encoder Block

In [None]:
class EncoderBlock(nn.Module):
    """One encoder layer: unmasked self-attention and a feed-forward network,
    each followed by dropout, a residual connection and normalisation.

    Args:
        num_head: number of attention heads.
        embedding_dim: model width ``d``.
        dim_key: per-head query/key width.
        dim_value: per-head value width.
        feed_forward_dim: hidden width of the feed-forward network.
        device: forwarded to the attention layer.
        drop_prob: dropout probability used for both residual branches and,
            now, also inside the feed-forward network.
    """

    def __init__(self,num_head, embedding_dim, dim_key, dim_value, feed_forward_dim, device, drop_prob = 0.3):
        super().__init__()
        self.h = num_head
        self.d = embedding_dim
        self.d_ff = feed_forward_dim
        self.d_k = dim_key
        self.d_v = dim_value

        self.MultiheadAttention = MultiheadAttention(
            self.h, self.d, self.d_k, self.d_v, device,False
        )
        self.Norm1 = Norm(self.d)
        self.drop_out1 = nn.Dropout(p=drop_prob)

        # Forward drop_prob explicitly: previously the FFN silently fell back
        # to its own default (0.3) regardless of the configured value.
        self.FFN = FeedForwardNetwork(
            self.d,self.d_ff,self.d, drop_prob
        )
        self.Norm2 = Norm(self.d)
        self.drop_out2 = nn.Dropout(p=drop_prob)

    def forward(self, X):
        """Transform ``X`` (``batch_size * N * d``) into a tensor of the same shape."""
        # Self-attention sub-layer (post-norm residual).
        attended = self.drop_out1(self.MultiheadAttention(X))
        attended = self.Norm1(X + attended)

        # Feed-forward sub-layer (post-norm residual).
        transformed = self.drop_out2(self.FFN(attended))
        H = self.Norm2(transformed + attended)

        return H
        

## Decoder Block

In [None]:

class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention over the
    encoder output and a feed-forward network, each followed by dropout, a
    residual connection and normalisation.

    Args:
        num_head: number of attention heads.
        embedding_dim: model width ``d``.
        dim_key: per-head query/key width.
        dim_value: per-head value width.
        feed_forward_dim: hidden width of the feed-forward network.
        device: forwarded to the attention layers.
        drop_prob: dropout probability used for the three residual branches
            and, now, also inside the feed-forward network.
    """

    def __init__(self,num_head, embedding_dim, dim_key, dim_value, feed_forward_dim, device, drop_prob = 0.3):
        super().__init__()
        self.h = num_head
        self.d = embedding_dim
        self.d_ff = feed_forward_dim
        self.d_k = dim_key
        self.d_v = dim_value

        # Causal (masked) self-attention over the target prefix.
        self.SelfAttentionLayer = MultiheadAttention(self.h, self.d, self.d_k, self.d_v, device)
        self.Norm1 = Norm(self.d)
        self.drop_out1 = nn.Dropout(drop_prob)

        # Unmasked attention from the target positions over the encoder output.
        self.CrossAttentionLayer = MultiheadAttention(self.h, self.d, self.d_k, self.d_v, device, False)
        self.Norm2 = Norm(self.d)
        self.drop_out2 = nn.Dropout(drop_prob)

        # Forward drop_prob explicitly: previously the FFN silently fell back
        # to its own default (0.3) regardless of the configured value.
        self.FFN = FeedForwardNetwork(self.d, self.d_ff, self.d, drop_prob)
        self.Norm3 = Norm(self.d)
        self.drop_out3 = nn.Dropout(drop_prob)

    def forward(self, X, H_enc):
        """Run the decoder layer.

        Args:
            X: embedded target prefix, ``batch_size * N_trg * d``.
            H_enc: encoder output used as keys and values for cross-attention.

        Returns:
            Tensor with the same shape as ``X``.

        Raises:
            ValueError: if ``H_enc`` is None.
        """
        if H_enc is None:
            raise ValueError("In Decoder Transformer, H_enc must be not None")

        self_attended = self.drop_out1(self.SelfAttentionLayer(X))
        self_attended = self.Norm1(self_attended + X)

        cross_attended = self.drop_out2(self.CrossAttentionLayer(self_attended, H_enc, H_enc))
        cross_attended = self.Norm2(cross_attended + self_attended)

        transformed = self.drop_out3(self.FFN(cross_attended))
        out = self.Norm3(transformed + cross_attended)

        return out
        

# Models

## Encoder

In [None]:

class Encoder(nn.Module):
    """Source-side stack: token/position embedding followed by ``n_blocks`` encoder blocks."""

    def __init__(self, num_head, embedding_dim, dim_key, dim_value, feed_forward_dim, n_blocks, vocab_size, max_len, device, drop_prob = 0.3):
        super().__init__()
        self.h = num_head
        self.d = embedding_dim
        self.d_ff = feed_forward_dim
        self.d_k = dim_key
        self.d_v = dim_value
        self.n_blocks = n_blocks
        self.emb = TransformerEmbedding(max_len, vocab_size, self.d, drop_prob, device)

        self.list_transformer_blocks = nn.ModuleList([
            EncoderBlock(self.h, self.d, self.d_k, self.d_v, self.d_ff, device, drop_prob)
            for _ in range(self.n_blocks)
        ])

    def forward(self, X):
        """Encode token ids ``X`` (``batch_size * N``) into contextual vectors."""
        hidden = self.emb(X)
        for encoder_block in self.list_transformer_blocks:
            hidden = encoder_block(hidden)
        return hidden
        

## Decoder

In [None]:
class Decoder(nn.Module):
    """Target-side stack: embedding, ``n_blocks`` decoder blocks and a final
    linear projection onto the target vocabulary."""

    def __init__(self, num_head, embedding_dim, dim_key, dim_value, feed_forward_dim, n_blocks, vocab_size, max_len, device, drop_prob = 0.3):
        super().__init__()
        self.h = num_head
        self.d = embedding_dim
        self.d_ff = feed_forward_dim
        self.d_k = dim_key
        self.d_v = dim_value
        self.n_blocks = n_blocks
        self.output_dim = vocab_size

        self.embedding = TransformerEmbedding(max_len, vocab_size, self.d, drop_prob, device)
        self.list_decoder_blocks = nn.ModuleList([
            DecoderBlock(self.h, self.d, self.d_k, self.d_v, self.d_ff, device, drop_prob)
            for _ in range(self.n_blocks)
        ])

        self.linear = nn.Linear(self.d, self.output_dim)

    def forward(self, X, H_enc):
        """Return unnormalised vocabulary scores for every position of ``X``.

        Args:
            X: target token ids, ``batch_size * N_trg``.
            H_enc: encoder output attended to by every decoder block.
        """
        hidden = self.embedding(X)
        for decoder_block in self.list_decoder_blocks:
            hidden = decoder_block(hidden, H_enc)
        return self.linear(hidden)

## Transformer

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Args:
        num_head: number of attention heads in every attention layer.
        embedding_dim: model width ``d``.
        dim_key: per-head query/key width.
        dim_value: per-head value width.
        feed_forward_dim: hidden width of the feed-forward networks.
        n_blocks: number of encoder blocks and of decoder blocks.
        enc_vocab_size: source vocabulary size.
        dec_vocab_size: target vocabulary size.
        max_len: number of positions in the positional tables.
        device: device forwarded to the sub-modules.
        drop_prob: dropout probability forwarded to the sub-modules.
    """

    def __init__(
        self, num_head, embedding_dim, dim_key, dim_value, feed_forward_dim, n_blocks, enc_vocab_size, dec_vocab_size, max_len, device, drop_prob = 0.3
    ) -> None:
        super().__init__()
        # Remembered so that `predict` never decodes past the positional table.
        self.max_len = max_len
        self.encoder = Encoder(
            num_head, embedding_dim, dim_key, dim_value, feed_forward_dim, n_blocks, enc_vocab_size, max_len, device, drop_prob
        )
        self.decoder = Decoder(
            num_head, embedding_dim, dim_key, dim_value, feed_forward_dim, n_blocks, dec_vocab_size, max_len, device, drop_prob
        )

    def forward(self, source, target = None):
        """Teacher-forced pass.

        Args:
            source: source token ids, ``batch_size * N_src``.
            target: target prefix token ids, ``batch_size * N_trg``.

        Returns:
            Vocabulary scores of shape ``batch_size * N_trg * dec_vocab_size``.

        Raises:
            ValueError: if ``target`` is None (use ``predict`` for inference).
        """
        if target is None:
            raise ValueError(
                "Transformer.forward needs `target` for teacher forcing; use `predict` for inference"
            )

        H = self.encoder(source)
        return self.decoder(target, H)

    def predict(self, input, sos_index = 2, eos_index = 3, pad_index = 1, max_new_tokens = None):
        """Greedily decode a translation for every source sentence in ``input``.

        Args:
            input: source token ids, ``batch_size * N_src``.
            sos_index: id of the start-of-sentence token fed as first input.
            eos_index: id of the end-of-sentence token; a sentence stops
                growing once it has been produced.
            pad_index: id written after a sentence has finished.
            max_new_tokens: optional cap on generated tokens; it is always
                limited so the sequence fits in ``max_len`` positions.

        The default ids assume the special tokens were inserted into the
        target vocabulary in the order unk, pad, sos, eos -- verify against
        the vocabulary actually in use.

        Returns:
            LongTensor ``batch_size * L`` starting with ``sos_index``.
        """
        budget = self.max_len - 1  # positions left after the initial <sos>
        if max_new_tokens is not None:
            budget = min(budget, max_new_tokens)

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                memory = self.encoder(input)
                batch_size = input.size(0)
                generated = torch.full(
                    (batch_size, 1), sos_index, dtype=torch.long, device=input.device
                )
                finished = torch.zeros(batch_size, dtype=torch.bool, device=input.device)

                for _ in range(budget):
                    scores = self.decoder(generated, memory)
                    next_token = scores[:, -1, :].argmax(dim=-1)
                    # Sentences that already emitted <eos> are padded from then on.
                    next_token = next_token.masked_fill(finished, pad_index)
                    generated = torch.cat([generated, next_token.unsqueeze(1)], dim=1)
                    finished = finished | (next_token == eos_index)
                    if bool(finished.all()):
                        break
        finally:
            # Restore whichever mode the caller had set.
            self.train(was_training)

        return generated

# Configuration

In [None]:

# Compute device: first CUDA GPU when one is visible, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# ---- model hyper-parameters ----
batch_size = 128     # sentences per mini-batch
max_len = 64         # positions in the positional tables / token budget per sentence
d_model = 512        # embedding width
d_k = 64             # per-head query/key width
d_v = 64             # per-head value width
n_layers = 6         # encoder blocks and decoder blocks
n_heads = 8          # attention heads per layer
ffn_hidden = 2048    # hidden width of the feed-forward networks
drop_prob = 0.1      # dropout probability

# ---- optimisation hyper-parameters ----
init_lr = 1e-5       # Adam learning rate
factor = 0.9         # ReduceLROnPlateau decay factor
adam_eps = 5e-9      # Adam epsilon
patience = 10        # ReduceLROnPlateau patience
warmup = 100         # epochs before the scheduler starts stepping
epoch = 1000         # total training epochs
clip = 1.0           # gradient-norm clipping threshold
weight_decay = 5e-4  # Adam weight decay
inf = math.inf       # initial value for the best validation loss

# Utilities

In [None]:
def choice_sample_p(data, size):
    """Return a reproducible sample of ``size`` distinct items from ``data``.

    The selection is drawn with a fixed seed (2206), so repeated calls return
    the same items. Items are returned in their original order in ``data``.

    A private ``RandomState`` is used instead of reseeding NumPy's global
    generator, so calling this function no longer disturbs other random
    draws in the notebook. The selected items are the same as before.

    Args:
        data: sequence to sample from.
        size: number of items to draw without replacement.

    Returns:
        ``numpy.ndarray`` holding the selected items.
    """
    rng = np.random.RandomState(2206)
    positions = np.arange(len(data))
    sample_p_idx = rng.choice(positions, size=size, replace=False)
    # Boolean mask keeps the chosen items in their original order.
    sample_p_choice = np.isin(positions, sample_p_idx)
    return np.array(data)[sample_p_choice]

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds.

    Returns:
        ``(minutes, seconds)`` as integers.
    """
    total_seconds = end_time - start_time
    minutes = int(total_seconds / 60)
    seconds = int(total_seconds - 60 * minutes)
    return minutes, seconds

def get_bleu(hypotheses, reference):
    """Sentence-level BLEU (scaled to 0-100) of one hypothesis against one reference.

    Args:
        hypotheses: list of hypothesis tokens.
        reference: list of reference tokens.

    Returns:
        Smoothed (``SmoothingFunction().method1``) BLEU score multiplied by 100.
    """
    smoothing = SmoothingFunction().method1
    score = sentence_bleu([reference], hypotheses, smoothing_function=smoothing)
    return score*100

def idx_to_word(x, vocab):
    """Convert a sequence of token ids into a space-separated string.

    Args:
        x: iterable of token ids.
        vocab: object exposing ``get_itos()``, the id-to-token list.

    Returns:
        The tokens joined by single spaces. Every token containing ``'<'`` is
        dropped; this removes markers such as ``<sos>``/``<eos>``/``<pad>``,
        and also any ordinary token that happens to contain ``'<'``.
    """
    # Fetch the id-to-token table once instead of once per token.
    itos = vocab.get_itos()
    words = (itos[i] for i in x)
    return " ".join(word for word in words if '<' not in word)

# Data

In [None]:
# Special vocabulary symbols, used by the tokenizer and as the vocabulary specials.
eos_token = "<eos>"  # appended after the last token of every sentence
sos_token = "<sos>"  # prepended before the first token of every sentence
pad_token = "<pad>"  # its vocabulary index is used as the batch padding value
unk_token = "<unk>"  # its vocabulary index is the default for unknown tokens
max_length = max_len  # per-sentence token budget, <sos>/<eos> included (see tokenizer)
min_freq = 2  # minimum frequency passed to build_vocab_from_iterator

# Read a corpus file (e.g. train.en / train.vi) into a list of sentences.
def load_file(file_path):
    """Load a one-sentence-per-line corpus and drop stand-alone punctuation.

    The file is decoded as UTF-8 explicitly, so the Vietnamese text is read
    correctly regardless of the platform's default encoding.

    Args:
        file_path: path of the text file.

    Returns:
        One string per ``'\\n'``-separated line (a trailing newline therefore
        yields a final empty string). Whitespace-separated tokens that are a
        substring of ``string.punctuation`` are removed, and the remaining
        tokens are re-joined with single spaces.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = file.read().split('\n')

    return [
        ' '.join(token for token in sentence.split() if token not in punctuation)
        for sentence in sentences
    ]

def tokenizer(sentences, lower, eos_token, sos_token, max_length):
    """Split one sentence on single spaces and wrap it in start/end markers.

    Args:
        sentences: the sentence string to tokenize.
        lower: lower-case the text first when truthy.
        eos_token: marker appended after the last token.
        sos_token: marker prepended before the first token.
        max_length: the sentence body is cut to ``max_length - 2`` tokens so
            that the markers fit within ``max_length``.

    Returns:
        List of tokens: ``sos_token``, the (HTML-unescaped) words, ``eos_token``.
    """
    text = sentences.lower() if lower else sentences
    words = html.unescape(text).split(' ')
    body = words[:(max_length - 2)]
    return [sos_token, *body, eos_token]

def _tokenize_corpus(sentences):
    """Lower-case and tokenize every sentence with the notebook-wide markers."""
    return [tokenizer(sent, True, eos_token, sos_token, max_length) for sent in sentences]


def _encode_corpus(vocab, token_lists):
    """Map every token list to its list of vocabulary indices."""
    return [vocab.lookup_indices(toks) for toks in token_lists]


# ---- training split ----
train_en_file = "/kaggle/input/data-translation-133k/data/train/train.en"
train_vi_file = "/kaggle/input/data-translation-133k/data/train/train.vi"

train_en = load_file(train_en_file)
train_vi = load_file(train_vi_file)

train_en_tokens = _tokenize_corpus(train_en)
train_vi_tokens = _tokenize_corpus(train_vi)

# ---- test split ----
test_en_file = "/kaggle/input/data-translation-133k/data/test/test.en"
test_vi_file = "/kaggle/input/data-translation-133k/data/test/test.vi"

test_en = load_file(test_en_file)
test_vi = load_file(test_vi_file)

test_en_tokens = _tokenize_corpus(test_en)
test_vi_tokens = _tokenize_corpus(test_vi)

# ---- dev split ----
dev_en_file = "/kaggle/input/data-translation-133k/data/dev/dev.en"
dev_vi_file = "/kaggle/input/data-translation-133k/data/dev/dev.vi"

dev_en = load_file(dev_en_file)
dev_vi = load_file(dev_vi_file)

dev_en_tokens = _tokenize_corpus(dev_en)
dev_vi_tokens = _tokenize_corpus(dev_vi)

# ---- vocabularies (built from the training split only) ----
special_tokens = [unk_token, pad_token, sos_token, eos_token]

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_en_tokens, min_freq=min_freq, specials=special_tokens
)
vi_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_vi_tokens, min_freq=min_freq, specials=special_tokens
)

# Both languages must agree on the unknown/padding indices because a single
# pair of indices is used for both vocabularies below.
assert en_vocab[unk_token] == vi_vocab[unk_token]
assert en_vocab[pad_token] == vi_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

# Tokens missing from a vocabulary map to <unk>.
en_vocab.set_default_index(unk_index)
vi_vocab.set_default_index(unk_index)

en_vocab_size = len(en_vocab)
vi_vocab_size = len(vi_vocab)

# ---- token ids per split ----
train_en_ids = _encode_corpus(en_vocab, train_en_tokens)
train_vi_ids = _encode_corpus(vi_vocab, train_vi_tokens)

test_en_ids = _encode_corpus(en_vocab, test_en_tokens)
test_vi_ids = _encode_corpus(vi_vocab, test_vi_tokens)

dev_en_ids = _encode_corpus(en_vocab, dev_en_tokens)
dev_vi_ids = _encode_corpus(vi_vocab, dev_vi_tokens)

class Bitext(Dataset):
    """Parallel corpus: item ``i`` is the pair ``(src[i], trg[i])``."""

    def __init__(self, src, trg) -> None:
        super().__init__()
        self.src, self.trg = src, trg

    def __len__(self):
        # The length is defined by the source side.
        return len(self.src)

    def __getitem__(self, index):
        source_item = self.src[index]
        target_item = self.trg[index]
        return source_item, target_item
    
# One parallel dataset per split; English ids are the source, Vietnamese ids the target.
train_dataset = Bitext(train_en_ids, train_vi_ids)
test_dataset = Bitext(test_en_ids, test_vi_ids)
dev_dataset = Bitext(dev_en_ids, dev_vi_ids)


# DataLoader

In [None]:
def get_collate_fn(pad_index):
    """Build a collate function that pads variable-length id sequences.

    Args:
        pad_index: value used to right-pad shorter sequences.

    Returns:
        A function mapping a list of ``(src_ids, trg_ids)`` pairs to two
        LongTensors of shape ``batch_size * longest_length``.
    """
    def _pad(sequences):
        tensors = [torch.LongTensor(toks) for toks in sequences]
        return nn.utils.rnn.pad_sequence(tensors, batch_first=True, padding_value=pad_index)

    def collate_fn(batch):
        sources, targets = zip(*batch)
        # Source and target are padded independently, each to its own longest length.
        return _pad(sources), _pad(targets)

    return collate_fn

def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a DataLoader whose batches are padded with ``pad_index``.

    Args:
        dataset: dataset yielding ``(src_ids, trg_ids)`` pairs.
        batch_size: number of pairs per batch.
        pad_index: padding value handed to the collate function.
        shuffle: whether the loader reshuffles the data on every pass.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
    )

# Only the training data is shuffled. The dev/test loaders keep the corpus
# order so that evaluation batches, and the sample printed at the end of the
# notebook, are the same on every run.
train_dataloader = get_data_loader(
    train_dataset, batch_size, pad_index, True
)

test_dataloader = get_data_loader(
    test_dataset, batch_size, pad_index, False
)

dev_dataloader = get_data_loader(
    dev_dataset, batch_size, pad_index, False
)



# Run

In [None]:
def count_parameters(model):
    """Return the total number of scalar parameters in ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


def initialize_weights(m):
    """Kaiming-uniform initialise the ``weight`` of a module, in place.

    Intended for ``model.apply(initialize_weights)``. Modules without a
    ``weight``, with ``weight`` set to None, or whose weight has fewer than
    two dimensions are left untouched.

    For embeddings with a ``padding_idx`` the padding row is reset to zero
    afterwards; re-initialising the whole table would otherwise overwrite the
    zero vector that ``nn.Embedding`` sets up for the padding token.
    """
    weight = getattr(m, 'weight', None)
    if weight is None or weight.dim() <= 1:
        return

    # In-place variant; the un-suffixed `kaiming_uniform` is deprecated.
    nn.init.kaiming_uniform_(weight)

    if isinstance(m, nn.Embedding) and m.padding_idx is not None:
        with torch.no_grad():
            weight[m.padding_idx].zero_()

In [None]:
# Hyper-parameters come from the configuration cell, vocabulary sizes from the data cell.
model_config = dict(
    num_head=n_heads,
    embedding_dim=d_model,
    dim_key=d_k,
    dim_value=d_v,
    feed_forward_dim=ffn_hidden,
    n_blocks=n_layers,
    enc_vocab_size=en_vocab_size,
    dec_vocab_size=vi_vocab_size,
    max_len=max_len,
    device=device,
    drop_prob=drop_prob,
)
model = Transformer(**model_config)
model = model.to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
# Re-initialise every weight matrix, then hand the parameters to Adam.
model.apply(initialize_weights)
optimizer = Adam(
    model.parameters(),
    lr=init_lr,
    eps=adam_eps,
    weight_decay=weight_decay,
)

In [None]:
# Multiply the learning rate by `factor` once the monitored loss has not
# improved for `patience` scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=factor,
    patience=patience,
    verbose=True,
)

# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [None]:
def train(model, iterator, optimizer, criterion, clip, device):
    """Run one training epoch with teacher forcing.

    Args:
        model: callable as ``model(src, trg_prefix)`` returning per-position scores.
        iterator: sized iterable of ``(src, trg)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss applied to flattened scores and flattened targets.
        clip: maximum gradient norm.
        device: device the batches are moved to.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    n_batches = len(iterator)
    running_loss = 0
    for step_idx, (src, trg) in enumerate(iterator):
        src, trg = src.to(device), trg.to(device)

        # The model sees the target without its last token and is scored
        # against the target shifted left by one position.
        decoder_input = trg[:, :-1]
        gold = trg[:, 1:].contiguous().view(-1)

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        flat_logits = logits.contiguous().view(-1, logits.shape[-1])

        loss = criterion(flat_logits, gold)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        print('step :', round((step_idx / n_batches) * 100, 2), '% , loss :', batch_loss)

    return running_loss / n_batches

In [None]:
def evaluate(model, iterator, criterion, device):
    """Compute the mean loss and a mean sentence-level BLEU over ``iterator``.

    Both figures are teacher-forced: the model receives the reference prefix
    and its arg-max prediction at each position forms the hypothesis.

    Args:
        model: callable as ``model(src, trg_prefix)`` returning per-position scores.
        iterator: sized iterable of ``(src, trg)`` batches.
        criterion: loss applied to flattened scores and flattened targets. Its
            ``ignore_index`` (when present) also decides which positions are
            left out of the BLEU hypothesis.
        device: device the batches are moved to.

    Returns:
        ``(mean batch loss, mean of the per-batch mean BLEU scores)``. The
        BLEU value is NaN when no sentence could be scored.
    """
    model.eval()
    epoch_loss = 0
    batch_bleu = []
    ignore_index = getattr(criterion, 'ignore_index', None)

    with torch.no_grad():
        for src, trg in iterator:
            src = src.to(device)
            trg = trg.to(device)

            output = model(src, trg[:, :-1])
            gold = trg[:, 1:]

            output_reshape = output.contiguous().view(-1, output.shape[-1])
            loss = criterion(output_reshape, gold.contiguous().view(-1))
            epoch_loss += loss.item()

            # Positions whose gold token is ignored by the loss (padding) are
            # untrained, so they are excluded from the hypothesis.
            if ignore_index is None:
                keep = torch.ones_like(gold, dtype=torch.bool)
            else:
                keep = gold != ignore_index

            # Move everything to the CPU once instead of indexing device
            # tensors element by element.
            predictions = output.argmax(dim=-1).cpu()
            references = trg.cpu()
            keep = keep.cpu()

            total_bleu = []
            for j in range(references.size(0)):
                try:
                    trg_words = idx_to_word(references[j].tolist(), vi_vocab)
                    output_words = idx_to_word(predictions[j][keep[j]].tolist(), vi_vocab)
                    ble = get_bleu(hypotheses=output_words.split(), reference=trg_words.split())
                    total_bleu.append(ble)
                except Exception:
                    # Best effort: a sentence that cannot be scored is skipped.
                    # `Exception` (not a bare except) lets Ctrl-C through.
                    continue

            if total_bleu:
                batch_bleu.append(np.mean(total_bleu))

    batch_bleu = np.mean(batch_bleu) if batch_bleu else float('nan')
    return epoch_loss / len(iterator), batch_bleu

In [None]:
# from util.util import *
def run(model, total_epoch, best_loss, device, output_dir='/kaggle/working'):
    """Train for ``total_epoch`` epochs, then evaluate on the test set.

    Relies on the notebook-level ``train_dataloader``, ``dev_dataloader``,
    ``test_dataloader``, ``optimizer``, ``criterion``, ``scheduler``, ``clip``
    and ``warmup``.

    After every epoch the loss/BLEU histories are rewritten to text files in
    ``output_dir``; every 20th epoch the model weights are saved there too.

    Args:
        model: model to train.
        total_epoch: number of epochs.
        best_loss: initial best validation loss (tracked, but currently not
            used to save a "best" checkpoint).
        device: device the batches are moved to.
        output_dir: directory receiving checkpoints and history files.

    Returns:
        ``(train_losses, valid_losses, bleus)``, one entry per epoch.
    """
    def _dump(filename, value):
        # Context manager guarantees the handle is closed even on error.
        with open(f'{output_dir}/{filename}', 'w') as f:
            f.write(str(value))

    train_losses, test_losses, bleus = [], [], []
    for step in range(total_epoch):
        start_time = time.time()
        train_loss = train(model, train_dataloader, optimizer, criterion, clip, device)
        # Validation loss and BLEU are computed after every epoch.
        valid_loss, bleu = evaluate(model, dev_dataloader, criterion, device)
        end_time = time.time()

        # The plateau scheduler only starts once the warm-up epochs are over.
        if step > warmup:
            scheduler.step(valid_loss)

        train_losses.append(train_loss)
        test_losses.append(valid_loss)
        bleus.append(bleu)
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_loss:
            best_loss = valid_loss
        if step % 20 == 0:
            torch.save(model.state_dict(), f'{output_dir}/model-{step}.pt')

        # Histories are rewritten in full each epoch so they can be plotted
        # while training is still running.
        _dump('train_loss.txt', train_losses)
        _dump('bleu.txt', bleus)
        _dump('valid_loss.txt', test_losses)

        print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\tVal Loss: {valid_loss:.3f} |  Val PPL: {math.exp(valid_loss):7.3f}')
        print(f'\tBLEU Score: {bleu:.3f}')

    # Final loss and BLEU on the held-out test set.
    test_loss, bleu_test = evaluate(model, test_dataloader, criterion, device)
    print(f"\tTest Loss: {test_loss:.3f} | Bleu Score of Test: {bleu_test:.3f}")
    _dump('result_test_loss.txt', test_loss)
    _dump('result_Bleu_Test.txt', bleu_test)

    return train_losses, test_losses, bleus


In [None]:
# Uncomment to override the configured number of epochs for a quick smoke run.
# epoch = 1
# Train for `epoch` epochs, starting from an infinite best validation loss.
run(model = model,total_epoch = epoch, best_loss=inf, device = device)
# Persist the final weights.
torch.save(model.state_dict(), '/kaggle/working/model-official.pt')
# Disabled snippets for resuming from a previously saved checkpoint:
# model = torch.load('result/model-180.pt')
# model.load_state_dict(torch.load('result/model-180.pt'))
# epoch = 1
# run(model = model,total_epoch = epoch, best_loss=inf, device = device)
# model.load_state_dict(model_state_dict)

In [None]:
# model = torch.load('result/model-180.pt')
# model.load_state_dict(torch.load('/kaggle/input/model-transformer-translation/model-980.pt'))

In [None]:
# Translate: show the reference and the model output for the first sentence of
# one test batch. The output is teacher-forced (the model is fed the reference
# prefix), so it is an optimistic view of translation quality.
model.eval()  # disable dropout for inference
batch = next(iter(test_dataloader))
src, trg = batch
src = src.to(device)
trg = trg.to(device)
with torch.no_grad():  # no autograd graph is needed for a forward-only pass
    output = model(src, trg[:, :-1])

trg_words = idx_to_word(trg[0], vi_vocab)
output_words = output[0].argmax(1)
output_words = idx_to_word(output_words, vi_vocab)
print(f"Bản gốc: {trg_words} \n Bản dịch: {output_words}")