In [1]:
# Device string referenced by the model code and the evaluation loop below.
device = 'cuda'

# Model hyper-parameters (forwarded to Transformer(...) when the model is built).
batch_size = 164  # batch size handed to loader.make_iter
max_len = 100  # number of rows in the positional-encoding table
d_model = 256  # embedding / hidden width
n_layers = 2  # number of encoder layers and of decoder layers
n_heads = 8  # attention heads per multi-head attention module
ffn_hidden = 512  # hidden width of the position-wise feed-forward network
drop_prob = 0.1  # dropout probability

# Optimizer / training-schedule settings.
# NOTE(review): none of the values below are read in the visible part of this
# notebook (only evaluation is shown) — presumably carried over from the
# training script; confirm before removing.
init_lr = 1e-3
factor = 0.6
adam_eps = 5e-9
patience = 7
warmup = 100
epoch = 500
clip = 1.0
weight_decay = 5e-4
inf = float('inf')

import torch
from torch import nn, optim
from torch.optim import Adam
import torch.utils.data as data
import math
from collections import Counter
import collections
import numpy as np
import copy
import time
import spacy
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import time

# Wall-clock timestamp taken when this cell starts executing.
# NOTE(review): not read anywhere in the visible part of the notebook.
start_time = time.time()

class DataLoader:
    """Prepare torchtext fields, dataset splits, vocabularies and iterators.

    The loader handles an English/Vietnamese sentence-pair corpus in either
    direction. ``ext`` selects the direction: ``('.en', '.vi')`` makes English
    the source language, ``('.vi', '.en')`` makes Vietnamese the source.

    Attributes:
        source: Field for the source language, created by ``make_dataset``.
        target: Field for the target language, created by ``make_dataset``.
    """
    source: Field = None
    target: Field = None

    # Language-pair orderings that make_dataset knows how to build fields for.
    _SUPPORTED_EXTS = (('.vi', '.en'), ('.en', '.vi'))

    def __init__(self, ext, tokenize_en, tokenize_vi, init_token, eos_token):
        """Store the configuration; no data is read until ``make_dataset``.

        Args:
            ext: ``(source_ext, target_ext)`` tuple of file extensions.
            tokenize_en: Callable turning an English string into tokens.
            tokenize_vi: Callable turning a Vietnamese string into tokens.
            init_token: Token string used as ``init_token`` of both fields.
            eos_token: Token string used as ``eos_token`` of both fields.
        """
        self.ext = ext
        self.tokenize_en = tokenize_en
        self.tokenize_vi = tokenize_vi
        self.init_token = init_token
        self.eos_token = eos_token
        print('dataset initializing start')

    def _make_field(self, tokenize):
        """Build a lower-casing, batch-first Field around ``tokenize``."""
        return Field(tokenize=tokenize, init_token=self.init_token, eos_token=self.eos_token,
                     lower=True, batch_first=True)

    def make_dataset(self):
        """Create the source/target fields and load the three dataset splits.

        Returns:
            ``(train_data, valid_data, test_data)`` as returned by
            ``Multi30k.splits``.

        Raises:
            ValueError: If ``self.ext`` is not one of the supported orderings.
                Previously both fields silently stayed ``None`` and the failure
                surfaced later inside torchtext.
        """
        if self.ext not in self._SUPPORTED_EXTS:
            raise ValueError(
                f"unsupported ext {self.ext!r}; expected one of {self._SUPPORTED_EXTS}")

        tokenizers = {'.vi': self.tokenize_vi, '.en': self.tokenize_en}
        source_ext, target_ext = self.ext
        self.source = self._make_field(tokenizers[source_ext])
        self.target = self._make_field(tokenizers[target_ext])

        # NOTE(review): Multi30k is used as the loader for '.en'/'.vi' files —
        # presumably the corpus files were placed in its data directory; confirm.
        train_data, valid_data, test_data = Multi30k.splits(exts=self.ext, fields=(self.source, self.target))
        return train_data, valid_data, test_data

    def build_vocab(self, train_data, min_freq):
        """Build both vocabularies from ``train_data`` using ``min_freq``."""
        self.source.build_vocab(train_data, min_freq=min_freq)
        self.target.build_vocab(train_data, min_freq=min_freq)

    def make_iter(self, train, validate, test, batch_size):
        """Wrap the three splits in ``BucketIterator`` objects.

        Returns:
            ``(train_iterator, valid_iterator, test_iterator)``.
        """
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train, validate, test),
                                                                              batch_size=batch_size)
        print('dataset initializing done')
        return train_iterator, valid_iterator, test_iterator

from pyvi import ViTokenizer
import spacy
from spacy.language import Language
from spacy.tokens import Doc

# Custom tokenizer function
def custom_tokenizer(nlp, text):
    """Segment ``text`` with pyvi and wrap the pieces in a spaCy ``Doc``.

    ``ViTokenizer.tokenize`` returns a string which is split on whitespace;
    the resulting pieces become the words of a ``Doc`` that shares
    ``nlp.vocab``.
    """
    segmented = ViTokenizer.tokenize(text)
    return Doc(nlp.vocab, words=segmented.split())

class Tokenizer:
    """Holds one spaCy pipeline per language and exposes token-list helpers."""

    def __init__(self):
        # Blank pipeline whose tokenizer is replaced by the pyvi-based
        # function, bound to the pipeline so it receives it as ``nlp``.
        vietnamese = spacy.blank("en")
        vietnamese.tokenizer = custom_tokenizer.__get__(vietnamese)
        self.spacy_vi = vietnamese

        self.spacy_en = spacy.load('en_core_web_sm')

    def tokenize_vi(self, text):
        """Return the token texts produced by the Vietnamese tokenizer."""
        pieces = []
        for token in self.spacy_vi.tokenizer(text):
            pieces.append(token.text)
        return pieces

    def tokenize_en(self, text):
        """Return the token texts produced by the English tokenizer."""
        pieces = []
        for token in self.spacy_en.tokenizer(text):
            pieces.append(token.text)
        return pieces


# In[4]:

# Build the tokenizers and a loader configured with English as the source
# language and Vietnamese as the target (ext=('.en', '.vi')).
tokenizer = Tokenizer()
loader = DataLoader(ext=('.en', '.vi'),
                    tokenize_en=tokenizer.tokenize_en,
                    tokenize_vi=tokenizer.tokenize_vi,
                    init_token='<sos>',
                    eos_token='<eos>')

# Load the splits, build both vocabularies from the training split only,
# then wrap all three splits in batch iterators.
train, valid, test = loader.make_dataset()
loader.build_vocab(train_data=train, min_freq=1)
train_iter, valid_iter, test_iter = loader.make_iter(train, valid, test,
                                                     batch_size=batch_size)

# Indices of the special tokens; passed to the Transformer constructor below.
src_pad_idx = loader.source.vocab.stoi['<pad>']
trg_pad_idx = loader.target.vocab.stoi['<pad>']
trg_sos_idx = loader.target.vocab.stoi['<sos>']

# Vocabulary sizes; passed to the Transformer as enc_voc_size / dec_voc_size.
enc_voc_size = len(loader.source.vocab)
dec_voc_size = len(loader.target.vocab)

dataset initializing start
dataset initializing done


In [2]:
class ScaleDotProductAttention(nn.Module):
    """Scaled dot-product attention over 4-D ``(batch, head, length, d)`` inputs."""

    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, e=1e-12):
        """Return ``(attended_values, attention_weights)``.

        Positions where ``mask == 0`` get a score of -10000 before the softmax.
        ``e`` is accepted but not used.
        """
        # Unpacking into four names requires k to be 4-D; only the last
        # dimension (per-head width) is used.
        _, _, _, d_tensor = k.size()
        scale = math.sqrt(d_tensor)
        logits = (q @ k.transpose(2, 3)) / scale

        if mask is not None:
            logits = logits.masked_fill(mask == 0, -10000)

        weights = self.softmax(logits)
        return weights @ v, weights

class MultiHeadAttention(nn.Module):
    """Multi-head attention: project q/k/v, attend per head, merge and project.

    Args:
        d_model: Width of the input and output features.
        n_head: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``n_head``.
    """

    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        if d_model % n_head != 0:
            # Otherwise split() would fail later inside .view() with an
            # unrelated-looking shape error.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_head ({n_head})")
        self.n_head = n_head  # Number of attention heads
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)  # Query transformation
        self.w_k = nn.Linear(d_model, d_model)  # Key transformation
        self.w_v = nn.Linear(d_model, d_model)  # Value transformation
        self.w_concat = nn.Linear(d_model, d_model)  # Output projection after merging heads

    def split(self, tensor):
        """Reshape ``(batch, length, d_model)`` to ``(batch, n_head, length, d_model // n_head)``."""
        batch_size, length, d_model = tensor.size()
        d_tensor = d_model // self.n_head
        # The result stays on the input's device; no dependence on a global.
        return tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)

    def concat(self, tensor):
        """Inverse of ``split``: merge heads back into ``(batch, length, d_model)``."""
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor
        # contiguous() is required because transpose() returns a strided view.
        return tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)

    def forward(self, q, k, v, mask=None):
        """Apply attention and return a tensor with the same shape as ``q``.

        ``mask`` is forwarded unchanged to ``ScaleDotProductAttention``.
        The attention weights are computed but not returned.
        """
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
        q, k, v = self.split(q), self.split(k), self.split(v)

        out, attention = self.attention(q, k, v, mask=mask)

        out = self.concat(out)
        out = self.w_concat(out)
        return out


# In[7]:


class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Map ``x`` from ``d_model`` features to ``hidden`` and back to ``d_model``."""
        hidden = self.relu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)


# In[8]:


class PositionalEncoding(nn.Module):
    """Fixed sinusoidal position table.

    Even feature columns hold ``sin(pos / 10000 ** (2i / d_model))`` and odd
    columns hold the matching cosine. The table is a plain tensor attribute
    (not a parameter or buffer), so it does not appear in ``state_dict()`` and
    existing checkpoints keep loading.

    Args:
        d_model: Number of feature columns.
        max_len: Number of positions (rows) in the table.
    """

    def __init__(self, d_model, max_len):
        super(PositionalEncoding, self).__init__()

        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        _2i = torch.arange(0, d_model, step=2).float()
        angle = pos / (10000 ** (_2i / d_model))

        # Built on the default device and moved lazily in forward(), so the
        # module no longer depends on a notebook-level `device` variable.
        encoding = torch.zeros(max_len, d_model)
        encoding[:, 0::2] = torch.sin(angle)
        # For odd d_model there is one fewer cosine column than sine columns.
        encoding[:, 1::2] = torch.cos(angle[:, : d_model // 2])
        encoding.requires_grad = False
        self.encoding = encoding

    def forward(self, x):
        """Return the first ``seq_len`` rows of the table on ``x``'s device.

        Args:
            x: 2-D tensor of shape ``(batch_size, seq_len)``; only its shape
                and device are used.

        Returns:
            Tensor of shape ``(seq_len, d_model)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the table length ``max_len``.
        """
        batch_size, seq_len = x.size()
        if seq_len > self.encoding.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.encoding.size(0)}")
        if self.encoding.device != x.device:
            # Move once and cache, so later calls on that device are free.
            self.encoding = self.encoding.to(x.device)
        return self.encoding[:seq_len, :]


# In[9]:


class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift."""

    def __init__(self, d_model, eps=1e-12):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last axis, then apply ``gamma`` and ``beta``."""
        # Statistics are taken over the last axis; the variance is the biased
        # (population) estimate.
        centered = x - x.mean(-1, keepdim=True)
        variance = x.var(-1, unbiased=False, keepdim=True)

        normalized = centered / torch.sqrt(variance + self.eps)
        return self.gamma * normalized + self.beta

class TokenEmbedding(nn.Embedding):
    """Token embedding table with a configurable padding index.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding width.
        padding_idx: Index whose embedding is initialised to zeros and receives
            no gradient. Defaults to 1, the value that was previously
            hard-coded. NOTE(review): presumably 1 because that is the '<pad>'
            index of the vocabularies used here — confirm against the vocab.
    """

    def __init__(self, vocab_size, d_model, padding_idx=1):
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=padding_idx)

class TransformerEmbedding(nn.Module):
    """Sum of token embedding and positional encoding, followed by dropout."""

    def __init__(self, vocab_size, d_model, max_len, drop_prob):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, max_len)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Embed the index tensor ``x`` and add the position table to it."""
        combined = self.tok_emb(x) + self.pos_emb(x)
        return self.drop_out(combined)


# In[10]:


class EncoderLayer(nn.Module):
    """Encoder block: self-attention and feed-forward, each with residual + norm."""

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_head)
        self.ffn = PositionwiseFeedForward(d_model, ffn_hidden, drop_prob)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, src_mask):
        """Run one encoder block over ``x``, passing ``src_mask`` to the attention."""
        # Sub-layer 1: self-attention -> dropout -> add residual -> normalise.
        residual = x
        attended = self.dropout1(self.attention(x, x, x, src_mask))
        x = self.norm1(attended + residual)

        # Sub-layer 2: feed-forward -> dropout -> add residual -> normalise.
        residual = x
        transformed = self.dropout2(self.ffn(x))
        return self.norm2(transformed + residual)

class Encoder(nn.Module):
    """Embedding followed by a stack of ``n_layers`` encoder layers."""

    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob):
        super().__init__()
        self.emb = TransformerEmbedding(vocab_size=enc_voc_size,
                                        d_model=d_model,
                                        max_len=max_len,
                                        drop_prob=drop_prob)

        blocks = []
        for _ in range(n_layers):
            blocks.append(EncoderLayer(d_model=d_model,
                                       ffn_hidden=ffn_hidden,
                                       n_head=n_head,
                                       drop_prob=drop_prob))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, src_mask):
        """Embed ``x`` and pass it through every layer with the same ``src_mask``."""
        hidden = self.emb(x)
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return hidden


# In[11]:


class DecoderLayer(nn.Module):
    """Decoder block: self-attention, optional encoder-decoder attention, feed-forward."""

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model, n_head)
        self.enc_dec_attention = MultiHeadAttention(d_model, n_head)
        self.ffn = PositionwiseFeedForward(d_model, ffn_hidden, drop_prob)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.dropout2 = nn.Dropout(p=drop_prob)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, trg_mask, src_mask):
        """Run one decoder block.

        ``trg_mask`` is given to the self-attention and ``src_mask`` to the
        encoder-decoder attention. When ``enc`` is ``None`` the
        encoder-decoder sub-layer is skipped entirely.
        """
        # Sub-layer 1: self-attention over the decoder input.
        residual = dec
        hidden = self.dropout1(self.self_attention(dec, dec, dec, trg_mask))
        hidden = self.norm1(hidden + residual)

        # Sub-layer 2: attention over the encoder output, if one was supplied.
        if enc is not None:
            residual = hidden
            hidden = self.enc_dec_attention(q=hidden, k=enc, v=enc, mask=src_mask)
            hidden = self.dropout2(hidden)
            hidden = self.norm2(hidden + residual)

        # Sub-layer 3: position-wise feed-forward.
        residual = hidden
        hidden = self.dropout3(self.ffn(hidden))
        return self.norm3(hidden + residual)

class Decoder(nn.Module):
    """Embedding, a stack of decoder layers, and a projection to vocabulary size."""

    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob):
        super().__init__()
        self.emb = TransformerEmbedding(vocab_size=dec_voc_size,
                                        d_model=d_model,
                                        max_len=max_len,
                                        drop_prob=drop_prob)

        blocks = []
        for _ in range(n_layers):
            blocks.append(DecoderLayer(d_model=d_model,
                                       ffn_hidden=ffn_hidden,
                                       n_head=n_head,
                                       drop_prob=drop_prob))
        self.layers = nn.ModuleList(blocks)

        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Decode ``trg`` against ``enc_src`` and return per-position vocabulary scores."""
        hidden = self.emb(trg)
        for block in self.layers:
            hidden = block(hidden, enc_src, trg_mask, src_mask)
        return self.linear(hidden)


# In[12]:


class Transformer(nn.Module):
    """Encoder-decoder Transformer that builds its own padding and causal masks.

    Args:
        src_pad_idx: Padding index in source sequences (masked in attention).
        trg_pad_idx: Padding index in target sequences (masked in attention).
        trg_sos_idx: Start-of-sequence index of the target vocabulary; stored
            on the module but not read by any method of this class.
        enc_voc_size: Source vocabulary size.
        dec_voc_size: Target vocabulary size.
        d_model, n_head, max_len, ffn_hidden, n_layers, drop_prob: Forwarded
            unchanged to both ``Encoder`` and ``Decoder``.
    """

    def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, enc_voc_size, dec_voc_size, d_model, n_head, max_len,
                 ffn_hidden, n_layers, drop_prob):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.trg_sos_idx = trg_sos_idx
        self.encoder = Encoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               enc_voc_size=enc_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers)

        self.decoder = Decoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               dec_voc_size=dec_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers)

    def forward(self, src, trg):
        """Encode ``src``, decode ``trg`` against it, and return the decoder output."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output

    def make_src_mask(self, src):
        """Return a ``(batch, 1, 1, src_len)`` mask that is false at padding positions.

        The mask is derived from ``src`` and therefore already lives on the
        same device; no explicit transfer is needed.
        """
        return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        """Return a ``(batch, 1, trg_len, trg_len)`` boolean mask.

        Entry ``[b, 0, i, j]`` is true only when query position ``i`` is not
        padding and ``j <= i`` (lower-triangular, i.e. no look-ahead).
        """
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(3)
        trg_len = trg.shape[1]
        # Built directly on trg's device as a bool tensor, instead of the
        # legacy CPU-only ByteTensor followed by a transfer to a global device.
        trg_sub_mask = torch.ones(trg_len, trg_len, device=trg.device).tril().bool()
        return trg_pad_mask & trg_sub_mask


# In[13]:


def get_bleu(pred_seq, label_seq, k = 4):
    """Compute a BLEU score for one space-separated prediction/label pair.

    The score is a brevity penalty multiplied, for every n-gram order ``n``
    from 1 to ``min(k, len(pred))``, by the clipped n-gram precision raised to
    the power ``0.5 ** n``.
    """
    pred_tokens = pred_seq.split(' ')
    label_tokens = label_seq.split(' ')
    len_pred, len_label = len(pred_tokens), len(label_tokens)

    # Brevity penalty: below 1 only when the prediction is shorter than the label.
    score = math.exp(min(0, 1 - len_label / len_pred))

    for n in range(1, min(k, len_pred) + 1):
        # How many times each label n-gram is still available for matching.
        remaining = collections.Counter(
            ' '.join(label_tokens[start: start + n])
            for start in range(len_label - n + 1)
        )

        num_matches = 0
        for start in range(len_pred - n + 1):
            gram = ' '.join(pred_tokens[start: start + n])
            if remaining[gram] > 0:
                num_matches += 1
                remaining[gram] -= 1

        score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n))
    return score


def idx_to_word(x, vocab):
    """Turn a sequence of vocabulary indices into a space-joined string.

    Each index is looked up in ``vocab.itos``; any word containing ``'<'`` is
    dropped from the result.
    """
    looked_up = (vocab.itos[index] for index in x)
    return " ".join(word for word in looked_up if '<' not in word)


# In[14]:


def count_parameters(model):
    """Return the total element count of all parameters with ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


def initialize_weights(m):
    """Kaiming-uniform initialise ``m.weight`` in place when it has more than one dimension.

    Intended for use with ``model.apply(initialize_weights)``. Modules without
    a ``weight`` attribute, with ``weight`` set to ``None``, or with a 1-D
    weight are left untouched.
    """
    weight = getattr(m, 'weight', None)
    if isinstance(weight, torch.Tensor) and weight.dim() > 1:
        # kaiming_uniform_ is the in-place replacement for the deprecated
        # nn.init.kaiming_uniform.
        nn.init.kaiming_uniform_(weight.data)



# Rebuild the architecture with the same hyper-parameters as the saved
# checkpoint, so that every key of the state dict matches.
loaded_model = Transformer(src_pad_idx=src_pad_idx,
                    trg_pad_idx=trg_pad_idx,
                    trg_sos_idx=trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=enc_voc_size,
                    dec_voc_size=dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_head=n_heads,
                    n_layers=n_layers,
                    drop_prob=drop_prob).to(device)
print(f'The model has {count_parameters(loaded_model):,} trainable parameters')


# NOTE(review): absolute, machine-specific path — consider moving it to a
# configurable location. torch.load unpickles the file, so only load
# checkpoints from a trusted source.
model_path = '/home/k64t/person-reid/MFSvi/models/best_bleu_retrain.pth'
# map_location remaps the stored tensors onto `device`, so the checkpoint also
# loads when it was saved from a different device than the one in use now.
loaded_model.load_state_dict(torch.load(model_path, map_location=device))

The model has 30,542,231 trainable parameters


<All keys matched successfully>

In [3]:
def evaluate(model, iterator, criterion, verbose=True):
    """Compute the mean loss and mean BLEU of ``model`` over ``iterator``.

    The decoder is fed the reference target shifted right (``trg[:, :-1]``),
    and the printed "Translator" text is the per-position argmax of that
    output, not a free-running decode.

    Args:
        model: Callable as ``model(src, trg_in)`` returning a tensor whose
            last dimension is scored against the target indices.
        iterator: Iterable of batches exposing ``.src`` and ``.trg``; must
            support ``len()``.
        criterion: Loss applied to the flattened output and flattened target.
        verbose: When true (default, the previous behaviour), print source,
            reference and predicted sentence for every example.

    Returns:
        ``(mean_loss, mean_bleu)`` where ``mean_loss`` is the summed batch
        loss divided by ``len(iterator)`` and ``mean_bleu`` is the mean over
        batches of the per-batch mean sentence BLEU (0.0 if no sentence was
        scored).
    """
    model.eval()
    epoch_loss = 0
    batch_bleu = []
    with torch.no_grad():
        for batch in iterator:
            src = batch.src.to(device)
            trg = batch.trg.to(device)

            output = model(src, trg[:, :-1])
            output_reshape = output.contiguous().view(-1, output.shape[-1])
            trg_flat = trg[:, 1:].contiguous().view(-1)

            loss = criterion(output_reshape, trg_flat)
            epoch_loss += loss.item()

            sentence_bleu = []
            # Iterate over the rows actually present in this batch. The final
            # batch is usually smaller than the configured batch size, which
            # previously raised IndexError that a bare `except` swallowed.
            for j in range(output.shape[0]):
                trg_words = idx_to_word(batch.trg[j], loader.target.vocab)
                output_words = output[j].max(dim=1)[1]
                output_words = idx_to_word(output_words, loader.target.vocab)

                if verbose:
                    sentences = idx_to_word(batch.src[j], loader.source.vocab)
                    print("Sentence : " + sentences)
                    print("Original : " + trg_words)
                    print("Translator : " + output_words)
                    print('-' * 30)

                sentence_bleu.append(get_bleu(output_words, trg_words))

            if sentence_bleu:
                batch_bleu.append(sum(sentence_bleu) / len(sentence_bleu))

            torch.cuda.empty_cache()

    mean_bleu = sum(batch_bleu) / len(batch_bleu) if batch_bleu else 0.0
    return epoch_loss / len(iterator), mean_bleu

# The loss is computed against target-side indices, so the index to ignore is
# the target vocabulary's padding index (previously src_pad_idx was passed,
# which only works when both vocabularies assign '<pad>' the same index).
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
test_loss, bleu = evaluate(loaded_model, test_iter, criterion)
print(test_loss, bleu)

Sentence : as you can imagine , the town had been devastated .
Original : có_thể_hình_dung , thị_trấn đã bị tàn_phá .
Translator : như rằng thành_phố đã bị tàn_phá . . . . . .
------------------------------
Sentence : who in the room has a mobile phone with you ?
Original : những_ai trong khán_phòng có điện_thoại_di_động ?
Translator : ai trong phòng có một với ? ? ? ? ?
------------------------------
Sentence : i wasn ' t a citizen of that country .
Original : tôi không phải công_dân nước đó .
Translator : tôi không phải là của đó . . . .
------------------------------
Sentence : see , i have a legacy in south central .
Original : tôi có di_sản ở vùng nam_trung .
Translator : tôi có một ở nam miền . . . . . .
------------------------------
Sentence : why in the hell would they not okay this ?
Original : làm_sao họ có_thể không đồng_tình chứ ?
Translator : tại_sao trong không không được gì ? ? ?
------------------------------
Sentence : i ' m always the one taking the picture .
Origina