In [1]:
# ## Download en_core_web_lg ##
# !python -m spacy download en_core_web_lg
# !pip install pyvi

In [2]:
import torch, requests, copy, torchtext, pickle, spacy, html, math,time , numpy as np, pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy import data
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
from torch.autograd import Variable
from pyvi import ViTokenizer
from nltk.corpus import wordnet
en_t = spacy.load('en_core_web_lg')
vi_t = ViTokenizer

def tokenize_en(sentence):
    return [token.text for token in en_t.tokenizer(sentence)]
def tokenize_vi(sentence):
#     return sentence.split()
    return vi_t.tokenize(sentence).split()

In [3]:
LOADMODEL = True ### (False) for Train model / (True) for Test pretrained model ###

EPOCH = 10
BATCHSIZE = 2500
LR = 0.001
DEVICE ="cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

KT=5 ## if KT = 1 => GREEDYSEARCH KT > 1 BEAMSEARCH ##
MAX_LEN=200
TRANSLATE_LEN = 50


if LOADMODEL == True:
    MODELNAME = "IWSLT15-TRANSFORMER.model"
else:
    MODELNAME = "IWSLT15-TRANSFORMER.model"

cuda


## **PREPARE DATASET** ##

In [4]:
## Get dataset from server and load to LIST ##
# url="https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/"
# train_en = [line for line in requests.get(url+"train.en").text.splitlines()]
# train_vi = [line for line in requests.get(url+"train.vi").text.splitlines()]
# test_en = [line for line in requests.get(url+"tst2013.en").text.splitlines()]
# test_vi = [line for line in requests.get(url+"tst2013.vi").text.splitlines()]

# Get dataset from local
f = open("iwslt15/train.en",encoding="utf8")
train_en = [line for line in f]
f.close()
f = open("iwslt15/train.vi",encoding="utf8")
train_vi = [line for line in f]
f.close()
f = open("iwslt15/tst2013.en",encoding="utf8")
test_en = [line for line in f]
f.close()
f = open("iwslt15/tst2013.vi",encoding="utf8")
test_vi = [line for line in f]
f.close()

In [5]:
## Convert LIST to DATAFRAME ##

def create_df(data_en,data_vi):
    raw_data = {"English":[line for line in data_en],"Vietnamese":[line for line in data_vi]}
    df = pd.DataFrame(raw_data)
    df['EN_LEN']=df['English'].str.count(" ")
    df['VI_LEN']=df['Vietnamese'].str.count(" ")
    df = df.query("VI_LEN < %s & EN_LEN < %s"%(MAX_LEN,MAX_LEN)) # Get sentence with len < 200 
    #df = df.query("EN_LEN < VI_LEN *1.5 & EN_LEN*1.5 > VI_LEN")
    return df

train = create_df(train_en,train_vi)
test = create_df(test_en,test_vi)

train.to_csv("train.csv",index=False,encoding="utf-8")
test.to_csv("test.csv",index=False,encoding="utf-8")

In [6]:
EN_TEXT = Field(tokenize=tokenize_en,lower=True)
VI_TEXT = Field(tokenize=tokenize_vi,init_token="<cls>",eos_token="<eos>",lower=True)

In [7]:
data_fields=[('English',EN_TEXT),("Vietnamese",VI_TEXT)]
train_data,test_data = TabularDataset.splits(path="./",train='train.csv',validation='test.csv', format='csv', fields = data_fields)

In [8]:
## CREATE VOCAB ##
EN_TEXT.build_vocab(train_data,vectors = "glove.6B.100d", min_freq=3)
VI_TEXT.build_vocab(train_data, min_freq=3)

print(f"Number tokens in English vocabulary: {len(EN_TEXT.vocab)}")
print(f"Number tokens in Vietnamese vocabulary: {len(VI_TEXT.vocab)}")

Number tokens in English vocabulary: 21892
Number tokens in Vietnamese vocabulary: 15709


In [9]:
print(vars(test_data.examples[1]))

{'English': ['when', 'i', 'was', 'little', ',', 'i', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'i', 'grew', 'up', 'singing', 'a', 'song', 'called', '&', 'quot', ';', 'nothing', 'to', 'envy', '.', '&', 'quot', ';'], 'Vietnamese': ['khi', 'tôi', 'còn', 'nhỏ', ',', 'tôi', 'nghĩ', 'rằng', 'bắctriều', 'tiên', 'là', 'đất_nước', 'tốt', 'nhất', 'trên', 'thế_giới', 'và', 'tôi', 'thường', 'hát', 'bài', '&', 'quot', ';', 'chúng_ta', 'chẳng', 'có', 'gì', 'phải', 'ghen_tị', '.', '&', 'quot', ';']}


## **CREATE BATCH AND MASK** ##

In [10]:
## CREATE BATCH NO PADDING ##

class AutoIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d,random_shuffler):
                for p in data.batch(d,self.batch_size * 100):
                    p_batch = data.batch(sorted(p, key= self.sort_key),self.batch_size,self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b

            self.batches = pool(self.data(),self.random_shuffler)
        else:
            self.batches=[]
            for b in data.batch(self.data(),self.batch_size,self.batch_size_fn):
                self.batches.append(sorted(b,key=self.sort_key))

global max_text_in_batch, max_label_in_batch

def batch_size_fn(new, count,sofar):
    global max_text_in_batch, max_label_in_batch
    if count == 1:
        max_text_in_batch = 0
        max_label_in_batch = 0
    max_text_in_batch = max(max_text_in_batch, len(new.English))
    max_label_in_batch = max(max_label_in_batch,len(new.Vietnamese)+2)

    text_elements = count * max_text_in_batch
    label_elements = count * max_label_in_batch

    return max(text_elements,label_elements)

In [11]:
## CREATE MASK ##

def nopeak_mask(size):
    np_mask = np.triu(np.ones((1,size,size)),k=1).astype('uint8')
    np_mask = Variable(torch.from_numpy(np_mask)==0)
    return np_mask.to(DEVICE)

def create_masks(src,trg):
    en_mask = (src != EN_TEXT.vocab.stoi['<pad>']).unsqueeze(-2)

    if trg is not None:
        vi_mask = (trg != VI_TEXT.vocab.stoi['<pad>']).unsqueeze(-2)

        size = trg.size(1)
        np_mask = nopeak_mask(size)
        vi_mask = vi_mask & np_mask
    else:
        vi_mask = None

    return en_mask, vi_mask

In [12]:
train_iter = AutoIterator(train_data,batch_size=BATCHSIZE,sort_key= lambda x: (len(x.English),len(x.Vietnamese)),device=DEVICE,
                          batch_size_fn=batch_size_fn,train=True,shuffle=True)

In [13]:
## GET NUMBER OF BATCH ##
for c,batch in enumerate(train_iter):
    pass
TRAIN_LEN =c
print(TRAIN_LEN)

1316


## **EMBEDDING** ##

In [14]:
class Embedder(nn.Module):
    def __init__(self,vocab_size, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embed = nn.Embedding(vocab_size,hidden_dim)

    def forward(self,x):
#         print(type(x))
#         print(x.is_cuda)
        return self.embed(x)

class PositionalEncoder(nn.Module):
    def __init__(self,hidden_dim, max_seq_len = 800, dropout =0.1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_seq_len, hidden_dim)

        for pos in range(max_seq_len):
            for i in range(0, hidden_dim, 2):
                pe[pos,i] = math.sin(pos / (10000 ** ((2*i)/hidden_dim)))
                pe[pos,i+1] = math.cos(pos / (10000 **((2*(i+1))/hidden_dim)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe',pe)

    def forward(self,x):
        x = x * math.sqrt(self.hidden_dim)
        seq_len = x.size(1)
        pe = Variable(self.pe[:,:seq_len],requires_grad=False)
        
        x = x + pe
        return self.dropout(x)

## **SUBLAYERS** ##

In [15]:
## Normalize ##
class Norm(nn.Module):
    def __init__(self, hidden_dim, eps = 1e-6):
        super().__init__()
        self.size = hidden_dim

        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))

        self.eps = eps

    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1,keepdim=True))/(x.std(dim=-1,keepdim=True)+self.eps) + self.bias
        return norm

In [16]:
## Attention ##
def attention(q,k,v,d_k,mask=None,dropout=None):
    scores = torch.matmul(q,k.transpose(-2,-1))/math.sqrt(d_k)
    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)

    scores = F.softmax(scores,dim=-1)

    if dropout is not None:
        scores = dropout(scores)
    output = torch.matmul(scores,v)
    return output

In [17]:
## MultiHeadAttention ##
class MultiHeadAttention(nn.Module):
    def __init__(self,heads, hidden_dim,dropout =0.1):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.d_k = hidden_dim // heads
        self.h = heads

        self.q_linear = nn.Linear(hidden_dim,hidden_dim)
        self.v_linear = nn.Linear(hidden_dim,hidden_dim)
        self.k_linear = nn.Linear(hidden_dim,hidden_dim)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_dim,hidden_dim)

    def forward(self, q, k, v , mask = None):
        bs = q.size(0)

        q  = self.q_linear(q).view(bs,-1,self.h ,self.d_k)
        q = q.transpose(1,2)

        v  = self.v_linear(v).view(bs,-1,self.h ,self.d_k)
        v = v.transpose(1,2)

        k  = self.k_linear(k).view(bs,-1,self.h ,self.d_k)
        k = k.transpose(1,2)

        scores = attention(q,k,v,self.d_k,mask,self.dropout)

        concat = scores.transpose(1,2).contiguous().view(bs,-1,self.hidden_dim)

        output = self.out(concat)

        return output

In [18]:
class FeedForward(nn.Module):
    def __init__(self,hidden_dim, hidden_dim_ff=2048,dropout =0.1):
        super().__init__()

        self.l1 = nn.Linear(hidden_dim,hidden_dim_ff)
        self.dropout = nn.Dropout(dropout)
        self.l2 = nn.Linear(hidden_dim_ff,hidden_dim)

    def forward(self,x):
        x = self.dropout(F.relu(self.l1(x)))
        x = self.l2(x)
        return x

## **ENCODER LAYER AND DECODER LAYER** ##

In [19]:
## ENCODER LAYER ##

class EncoderLayer(nn.Module):
    def __init__(self, hidden_dim, heads, dropout =0.1):
        super().__init__()
        self.norm = Norm(hidden_dim)
        self.attn = MultiHeadAttention(heads,hidden_dim,dropout=dropout)
        self.ff = FeedForward(hidden_dim,dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x,mask):
        x_norm = self.norm(x)
        x = x + self.dropout(self.attn(x_norm,x_norm,x_norm,mask))
        x_norm = self.norm(x)
        x = x + self.dropout(self.ff(x_norm))

        return x

In [20]:
## CREATE LAYERS ##
def clone_module(module,N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [21]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, hidden_dim, N , heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, hidden_dim)
        self.pe = PositionalEncoder(hidden_dim,dropout=dropout)
        self.layers = clone_module(EncoderLayer(hidden_dim,heads,dropout),N)
        self.norm = Norm(hidden_dim)

    def forward(self,en,mask):
        x = self.embed(en)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x,mask)
        
        return self.norm(x)

In [22]:
## DECODER LAYER ##

class DecoderLayer(nn.Module):
    def __init__(self, hidden_dim, heads ,dropout=0.1):
        super().__init__()
        self.norm = Norm(hidden_dim)
        self.dropout= nn.Dropout(dropout)
        self.attn = MultiHeadAttention(heads,hidden_dim,dropout=dropout)
        self.ff = FeedForward(hidden_dim,dropout=dropout)

    def forward(self,x,emb_outputs,en_mask,vi_mask):
        x_norm = self.norm(x)
        x = x + self.dropout(self.attn(x_norm,x_norm,x_norm,vi_mask))
        x_norm = self.norm(x)
        x = x + self.dropout(self.attn(x_norm,emb_outputs,emb_outputs,en_mask))
        x_norm = self.norm(x)
        x = x+self.dropout(self.ff(x_norm))

        return x

In [23]:
class Decoder(nn.Module):
    def __init__(self,vocab_size, hidden_dim, N ,heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, hidden_dim)
        self.pe = PositionalEncoder(hidden_dim,dropout=dropout)
        self.layers = clone_module(DecoderLayer(hidden_dim,heads,dropout),N)
        self.norm = Norm(hidden_dim)
    
    def forward(self,trg,emb_outputs,en_mask,vi_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x,emb_outputs,en_mask,vi_mask)
        
        return self.norm(x)

## **TRANSFORMER** ##

In [24]:
class Transformer(nn.Module):
    def __init__(self, en_vocab,vi_vocab, hidden_dim, N, heads ,dropout):
        super().__init__()
        self.encoder = Encoder(en_vocab,hidden_dim,N,heads,dropout)
        self.decoder = Decoder(vi_vocab,hidden_dim,N,heads,dropout)
        self.out = nn.Linear(hidden_dim,vi_vocab)

    def forward(self,en,vi,en_mask,vi_mask):
        enc_outputs = self.encoder(en,en_mask)

        dec_output = self.decoder(vi,enc_outputs,en_mask,vi_mask)

        output = self.out(dec_output)
        return output

In [25]:
## GET MODEL ##
def get_model(en_vocab, vi_vocab, hidden_dim,n_layers,heads,dropout,load=False):
    model = Transformer(en_vocab,vi_vocab,hidden_dim,n_layers,heads,dropout)
    if load==True:
        print("Model loaded")
        model=torch.load(MODELNAME)
    else:        
        for p in model.parameters():
            if p.dim()>1:
                nn.init.xavier_uniform_(p)
    return model.to(DEVICE)

## **TRAIN MODEL AND TEST MODEL** ##

In [26]:
# GET LEN VOCAB #
en_vocab = len(EN_TEXT.vocab)
vi_vocab = len(VI_TEXT.vocab)

In [27]:
## SETUP MODEL ##

HIDDEN_DIM = 512
HEADS = 8
LAYERS = 3
DROPOUT = 0.1

model = get_model(en_vocab,vi_vocab,HIDDEN_DIM,LAYERS,HEADS,DROPOUT,load=LOADMODEL)

Model loaded


In [28]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

if LOADMODEL == False:  ## RESET WEIGHT OF MODEL BEFORE TRAIN ##          
    model.apply(init_weights)

## **TRAIN MODEL AND TEST MODEL** ##

In [29]:
###########################
## IMPROVE TEACHING RATE ##
###########################

class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler):

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 T_max: int,
                 eta_min: float = 0.,
                 last_epoch: int = -1,
                 factor: float = 1.) -> None:

        self.T_max = T_max
        self.eta_min = eta_min
        self.factor = factor
        self._last_restart: int = 0
        self._cycle_counter: int = 0
        self._cycle_factor: float = 1.
        self._updated_cycle_len: int = T_max
        self._initialized: bool = False
        super(CosineWithRestarts, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        if not self._initialized:
            self._initialized = True
            return self.base_lrs

        step = self.last_epoch + 1
        self._cycle_counter = step - self._last_restart

        lrs = [
            (
                self.eta_min + ((lr - self.eta_min) / 2) *
                (
                    np.cos(
                        np.pi *
                        ((self._cycle_counter) % self._updated_cycle_len) /
                        self._updated_cycle_len
                    ) + 1
                )
            ) for lr in self.base_lrs
        ]

        if self._cycle_counter % self._updated_cycle_len == 0:
            self._cycle_factor *= self.factor
            self._cycle_counter = 0
            self._updated_cycle_len = int(self._cycle_factor * self.T_max)
            self._last_restart = step

        return lrs

In [30]:
## TRAIN MODEL ##
global TTIME,LOSS
def train_model(model):
    best_loss =99999999
    model.train()
    start = time.time()
    optimizer = optim.Adam(model.parameters(),lr=LR,betas=(0.9,0.98),eps = 1e-9)
    sched = CosineWithRestarts(optimizer, T_max=TRAIN_LEN)
    global TTIME,pred_temp,LOSS
    LOSS=[]
    for epoch in range(EPOCH):
        total_loss = 0

        for i , batch in enumerate(train_iter):
            
            ben = batch.English.transpose(0,1)
            bvi = batch.Vietnamese.transpose(0,1)
            bvi_input = bvi[:,:-1]
            en_mask,vi_mask = create_masks(ben,bvi_input)
            
            preds = model(ben,bvi_input,en_mask,vi_mask)
            pred_temp = preds
            ys = bvi[:,1:].contiguous().view(-1)
            optimizer.zero_grad()
            loss = F.cross_entropy(preds.view(-1,preds.size(-1)),ys,ignore_index=VI_TEXT.vocab.stoi['<pad>'])
            loss.backward()
            optimizer.step()
            sched.step()
            total_loss +=loss.item()
            if (i)%100 == 0:
                p = int(100*(i+1)/((TRAIN_LEN//100)*100))
                avg_loss = total_loss/100
                print(">>>Time %3d min : proc [%s%s] %3d%% : step %6d  : loss = %.5f "\
                      %((time.time()-start)//60,"".join("#"*(p//5)),"".join('_'*(20-(p//5))),p,i,avg_loss,))
        print("Epoch",epoch + 1,": loss",total_loss/TRAIN_LEN)
        LOSS.append(total_loss/TRAIN_LEN)
        torch.save(model, MODELNAME)
    TTIME=(time.time()-start)//60

#### **BEAM SEARCH** ####

In [31]:
def init_vars(src, model, SRC, TRG):
    
    init_tok = TRG.vocab.stoi['<cls>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    e_output = model.encoder(src, src_mask)
    
    outputs = torch.LongTensor([[init_tok]]).to(DEVICE)
    
    trg_mask = nopeak_mask(1)
    
    out = model.out(model.decoder(outputs,e_output, src_mask, trg_mask))
    out = F.softmax(out, dim=-1)

    probs, ix = out[:, -1].data.topk(KT)
    log_scores = torch.Tensor([math.log(prob) for prob in probs.data[0]]).unsqueeze(0)
    
    outputs = torch.zeros(KT, TRANSLATE_LEN).long().to(DEVICE)

    outputs[:, 0] = init_tok
    outputs[:, 1] = ix[0]
    
    e_outputs = torch.zeros(KT, e_output.size(-2),e_output.size(-1)).to(DEVICE)

    e_outputs[:, :] = e_output[0]
    
    return outputs, e_outputs, log_scores

In [32]:
def k_best_outputs(outputs, out, log_scores, i, k):
    
    probs, ix = out[:, -1].data.topk(k)
    log_probs = torch.Tensor([math.log(p) for p in probs.data.view(-1)]).view(k, -1) + log_scores.transpose(0,1)
    k_probs, k_ix = log_probs.view(-1).topk(k)
    
    row = k_ix // k
    col = k_ix % k

    outputs[:, :i] = outputs[row, :i]
    outputs[:, i] = ix[row, col]

    log_scores = k_probs.unsqueeze(0)
    
    return outputs, log_scores

In [33]:
def beam_search(src, model, SRC, TRG):    

    outputs, e_outputs, log_scores = init_vars(src, model, SRC, TRG)
    eos_tok = TRG.vocab.stoi['<eos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    ind = None
    for i in range(2, TRANSLATE_LEN):
    
        trg_mask = nopeak_mask(i)

        out = model.out(model.decoder(outputs[:,:i],e_outputs, src_mask, trg_mask))

        out = F.softmax(out, dim=-1)
    
        outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, KT)
        
        ones = (outputs==eos_tok).nonzero() # Occurrences of end symbols for all input sentences.
        sentence_lengths = torch.zeros(len(outputs), dtype=torch.long).to(DEVICE)
        for vec in ones:
            i = vec[0]
            if sentence_lengths[i]==0: # First end symbol has not been found yet
                sentence_lengths[i] = vec[1] # Position of first end symbol

        num_finished_sentences = len([s for s in sentence_lengths if s > 0])

        if num_finished_sentences == KT:
            alpha = 0.7
            div = 1/(sentence_lengths.type_as(log_scores)**alpha)
            _, ind = torch.max(log_scores * div, 1)
            ind = ind.data[0]
            break
    
    if ind is None:
        #length = (outputs[0]==eos_tok).nonzero()[0]
        length = len(outputs[0])
        return [TRG.vocab.itos[tok] for tok in outputs[0,1:length]]
    
    else:
        length = (outputs[ind]==eos_tok).nonzero()[0]
        return [TRG.vocab.itos[tok] for tok in outputs[ind][1:length]]

#### **TRANSLATE** ####

In [34]:
def get_synonym(word,en):
    syns = wordnet.synsets(word)
    for s in syns:
        for l in s.lemmas():
            if en.vocab.stoi[l.name()] != 0:
                return en.vocab.stoi[l.name()]
    return 0

In [35]:
## CONVERT ID TO WORD ##
def translate_sentence(sentence,model,en,vi):
    
    src_idx = [2]
    
    for token in sentence:
        if en.vocab.stoi[token] !=0:
            src_idx.append(en.vocab.stoi[token])
        else:
            src_idx.append(get_synonym(token,en))
    src_idx.append(3)
    sentence = Variable(torch.LongTensor([src_idx]).to(DEVICE))
    
    sentence = beam_search(sentence,model,en,vi)
    return sentence

In [36]:
## TEST MODEL ##
def test_model(model,en,vi):
    model.eval()
    refs = []
    preds = []
    with torch.no_grad():
        for i, batch in enumerate(test_data):
            if i == 0:
                continue
            ben = batch.English
            bvi = batch.Vietnamese
            pred = translate_sentence(ben,model,en,vi)
            print("EN_INPUT",ben)
            print("NMT_PRED",pred)
            print("VI_INPUT",bvi)
            refs.append([bvi])
            preds.append(pred)
    return refs,preds

In [37]:
%%time
if LOADMODEL==False:    
    train_model(model)

Wall time: 0 ns


In [38]:
## TEST ##
ref,pred=test_model(model,EN_TEXT,VI_TEXT)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


EN_INPUT ['when', 'i', 'was', 'little', ',', 'i', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'i', 'grew', 'up', 'singing', 'a', 'song', 'called', '&', 'quot', ';', 'nothing', 'to', 'envy', '.', '&', 'quot', ';']
NMT_PRED ['khi', 'tôi', 'còn', 'nhỏ', ',', 'tôi', 'nghĩ', 'đất_nước', 'tôi', 'là', 'nước', 'tốt', 'nhất', 'trên', 'hành_tinh', ',', 'và', 'tôi', 'lớn', 'lên', 'hát', '&', 'quot', ';', 'không', 'có', 'gì', 'cả', '.', '&', 'quot', ';']
VI_INPUT ['khi', 'tôi', 'còn', 'nhỏ', ',', 'tôi', 'nghĩ', 'rằng', 'bắctriều', 'tiên', 'là', 'đất_nước', 'tốt', 'nhất', 'trên', 'thế_giới', 'và', 'tôi', 'thường', 'hát', 'bài', '&', 'quot', ';', 'chúng_ta', 'chẳng', 'có', 'gì', 'phải', 'ghen_tị', '.', '&', 'quot', ';']
EN_INPUT ['and', 'i', 'was', 'very', 'proud', '.']
NMT_PRED ['và', 'tôi', 'rất', 'tự_hào', '.']
VI_INPUT ['tôi', 'đã', 'rất', 'tự_hào', 'về', 'đất_nước', 'tôi', '.']
EN_INPUT ['in', 'school', ',', 'we', 'spent', 'a', 'lot', 'of', 'time', 'stud

In [39]:
## CALCULATE BLEU ##
bleu= torchtext.data.metrics.bleu_score(pred,ref)
print("total:",len(test_data))
print("bleu: %.2f %%"%(bleu*100))

total: 1269
bleu: 19.21 %


In [40]:
## SHOW TRANSFORMER TRAIN LOSS ##

if LOADMODEL == False:
    te=[i for i in  LOSS]
    import matplotlib.pyplot as plt
    plt.rcParams.update({'font.size': 14})
    fig,ax = plt.subplots()
    fig.set_figwidth(10)
    fig.set_figheight(6)
    fig.patch.set_facecolor('xkcd:white')
    ax.plot([i for i in range(EPOCH)],te,label='Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title("Transformer Train Loss")
    ax.legend()