In [1]:
# We should use GPU to train this model! Thanks
# I recommend Kaggle or Google Colab and of course both are free but have limitations
# MyTransformer.py
# import library
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy

# ======================================== Module ========================================
def get_device():
    return torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def Masking(encoder_batch, decoder_batch, max_length_seq):
    NEG_INFTY = -1e9
    num_sentences = len(encoder_batch)
    look_ahead_mask = torch.full([max_length_seq, max_length_seq] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_length_seq, max_length_seq] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_length_seq, max_length_seq] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_length_seq, max_length_seq] , False)

    for idx in range(num_sentences):
      encoder_sentence_length, decoder_sentence_length = len(encoder_batch[idx]), len(decoder_batch[idx])
      encoder_chars_to_padding_mask = np.arange(encoder_sentence_length + 1, max_length_seq)
      decoder_chars_to_padding_mask = np.arange(decoder_sentence_length + 1, max_length_seq)
      encoder_padding_mask[idx, :, encoder_chars_to_padding_mask] = True
      encoder_padding_mask[idx, encoder_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, decoder_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, decoder_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, encoder_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, decoder_chars_to_padding_mask, :] = True
      

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0).to(get_device())
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0).to(get_device())
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0).to(get_device())
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask
     

class TokenEmbedding(nn.Module):
    
    def __init__(self, vocab_size, d_model):
        """
        Token Embedding is used for converting a word / token into a embedding numeric vector space.
        
        :param vocab_size: Number of words / token in vocabulary
        :param d_model: The embedding dimension
        
        Example: With 1000 words in vocabulary and our embedding dimension is 512, the Token Embedding layer will be 1000x512
        """
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """
        :param x: the word or sequence of words
        :return: the numerical representation of the input
        
        Example:
        Input: (Batch_size, Sequence of words) - (30x100)
        Output: (Batch_size, Sequence of words, d_model) - (30x100x512)
        """
        x = self.embedding_layer(x)
        return x.to(get_device())

# Or just Simple
# token_embedding = nn.Embedding(vocab_size, d_model)

class PositionalEncoding(nn.Module):
    
    def __init__(self, d_model, max_sequence_length, dropout=0.1):
        """
        Positional Encoding layer for adding positional information to token embeddings.
        
        :param d_model: The embedding dimension.
        :param max_sequence_length: The maximum length of the input sequences.
        :param dropout: Dropout rate.
        """
        super(PositionalEncoding,self).__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        PE = self.get()
        self.register_buffer('PE', PE)
        
    def get(self):
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i/self.d_model)
        position = (torch.arange(self.max_sequence_length)
                          .reshape(self.max_sequence_length, 1))
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        PE = PE.unsqueeze(0)
        return PE


    def forward(self):
        return self.dropout(self.PE)
    
class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, num_heads=8, cross=False):
        """
        Multi-Head Attention
        :param d_model: the embedding dimension
        :param num_heads: the number of heads, default equals 8
        :param cross: True for Multi-Head Cross Attention, False for Multi-Head Attention only
        
        # note: The embedding dimension must be divided by the number of heads
        """
        super(MultiHeadAttention,self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.cross = cross

        # query, key value layer
        if self.cross: # Multi-Head Cross Attention
            self.kv_layer = nn.Linear(d_model , 2 * d_model)
            self.q_layer = nn.Linear(d_model , d_model)
        else:
            self.qkv_layer = nn.Linear(d_model , 3 * d_model)
        
        
        # method 1: old, cost alot
        # self.query = nn.Linear(self.head_dim, self.head_dim, bias=False)
        # self.key = nn.Linear(self.head_dim, self.head_dim, bias=False)
        # self.value = nn.Linear(self.head_dim, self.head_dim, bias=False) 

        # method 2: the fewer linear layers the better the cost
        
        
        # Linear Layer in Multi-Head Attention
        self.linear_layer = nn.Linear(d_model, d_model)

    def scaled_dot_product(self, q, k, v, mask=None):
        d_k = q.size()[-1]
        scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
        if mask is not None:
            scaled = scaled.permute(1, 0, 2, 3) + mask
            scaled = scaled.permute(1, 0, 2, 3)
        attention = F.softmax(scaled, dim=-1)
        values = torch.matmul(attention, v)
        return values, attention
    
    def forward(self, x, mask=None):
        """
        Perform forward pass of the multi-head attention mechanism.

        :param x: if cross is True then x is a dictionary including  'encoder_output' and 'w'.
        :param mask: Optional mask tensor
        
        :return: Output tensor of shape (batch_size, length_seq, d_model)

        """

        # For MultiHead Cross Attention
        if self.cross:
            encoder_output = x['encoder_output']
            w = x['w']
            batch_size, length_seq, d_model = w.size()
            kv = self.kv_layer(w)
            q = self.q_layer(encoder_output)
            kv = kv.reshape(batch_size, length_seq, self.num_heads, 2 * self.head_dim)
            q = q.reshape(batch_size, length_seq, self.num_heads, self.head_dim)
            kv = kv.permute(0, 2, 1, 3)
            q = q.permute(0, 2, 1, 3)
            k, v = kv.chunk(2, dim=-1)
            values, attention = self.scaled_dot_product(q, k, v, mask) # mask is not required in Cross Attention
            values = values.permute(0, 2, 1, 3).reshape(batch_size, length_seq, self.num_heads * self.head_dim)
            out = self.linear_layer(values)
            return out

        # For MultiHead Attention
        batch_size, length_seq, d_model = x.size()
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size, length_seq, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        values, attention = self.scaled_dot_product(q, k, v, mask)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, length_seq, self.num_heads * self.head_dim)
        out = self.linear_layer(values)
        return out
    
class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape=parameters_shape
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (inputs - mean) / std
        out = self.gamma * y  + self.beta
        return out

# Or using nn.LayerNorm(d_model)

class PositionwiseFeedForward(nn.Module):

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

# feed_forward = nn.Sequential(
#     nn.Linear(d_model, expansion_factor * d_model),  # e.g: 512x(4*512) -> (512, 2048)
#     nn.ReLU(),  # ReLU activation function
#     nn.Linear(d_model * expansion_factor, d_model),  # e.g: 4*512)x512 -> (2048, 512)
# )

def replicate(block, N=6) -> nn.ModuleList:
    """
    Method to replicate the existing block to N set of blocks
    :param block: class inherited from nn.Module, mainly it is the encoder or decoder part of the architecture
    :param N: the number of stack, in the original paper they used 6
    :return: a set of N blocks
    """
    block_stack = nn.ModuleList([copy.deepcopy(block) for _ in range(N)])
    return block_stack

class Preprocessing(nn.Module):

    def __init__(self, max_length_seq, d_model, language_to_index, start_token, end_token, pad_token, dropout=0.1):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.language_to_index = language_to_index
        self.max_length_seq = max_length_seq
        self.start_token = start_token
        self.end_token = end_token
        self.pad_token = pad_token

        # Layer
        self.token_embedding = TokenEmbedding(self.vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_length_seq, dropout)
        self.dropout = nn.Dropout(dropout)

    
    
    def batch_tokens(self, batch, start_token:bool, end_token:bool):
        def tokenize(sentence, start_token:bool, end_token:bool):
            encode_char = [self.language_to_index[token] for token in list(sentence)]
            if start_token:
                encode_char.insert(0, self.language_to_index[self.start_token])
            if end_token:
                encode_char.append(self.language_to_index[self.end_token])
            for _ in range(len(encode_char), self.max_length_seq):
                encode_char.append(self.language_to_index[self.pad_token])
            return torch.tensor(encode_char)
        
        tokens = []
        for i in range(len(batch)):
            tokens.append(tokenize(batch[i], start_token, end_token))
        tokens = torch.stack(tokens)
        return tokens

    def forward(self, x, start_token:bool, end_token:bool): 
        x = self.batch_tokens(x, start_token, end_token)
        x = self.token_embedding(x.to(get_device()))
        pos = self.positional_encoding().to(get_device())
        x = self.dropout(x + pos)
        return x
    
class TransformerBlock(nn.Module):

    def __init__(self,
                 d_model=512,
                 num_heads=8,
                 ff_hidden=300,
                 dropout=0.1,
                 options='encoder'
                ):
        """
        The Transformer Block used in the encoder and decoder as well

        :param d_model: the embedding dimension
        :param num_heads: the number of heads
        :param ff_hidden: The output dimension of the feed forward layer
        :param dropout: probability dropout (between 0 and 1)
        :param options: The choice between 'encoder' and 'decoder'
        """
        super(TransformerBlock, self).__init__()
    
        self.options = options
        
        # For both 2 options: encoder and decoder
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm_for_attention = LayerNormalization(parameters_shape=[d_model])
        self.dropout_attention = nn.Dropout(dropout)

        
        
        # For decoder
        if self.options=='decoder':
            self.cross_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads, cross=True)
            self.norm_for_cross_attention = LayerNormalization(parameters_shape=[d_model])
            self.dropout2 = nn.Dropout(dropout)
        elif self.options!='encoder':
            raise Exception(f"Unknown option {options}")

        # For both 2 options: encoder and decoder
        self.ff = PositionwiseFeedForward(d_model=d_model, hidden=ff_hidden, drop_prob=dropout)
        self.norm_for_ff = LayerNormalization(parameters_shape=[d_model])
        self.dropout_for_ff = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # For decoder
        if self.options == 'decoder':
            encoder_output = x['encoder_output']
            w = x['w']
            w_residual = w.clone()
            w = self.attention(w,mask['self_attention_mask'])
            w = self.dropout_attention(w)
            w = self.norm_for_attention(w + w_residual)

            w_residual = w.clone()
            w = self.cross_attention({'encoder_output':encoder_output,'w':w},mask['cross_attention_mask'])
            w = self.dropout2(w)
            w = self.norm_for_cross_attention(w + w_residual)

            w_residual = w.clone()
            w = self.ff(w)
            w = self.dropout_for_ff(w)
            w = self.norm_for_ff(w + w_residual)
            return w
        else:
        # For encoder
            x_residual = x.clone()
            x = self.attention(x, mask)
            x = self.dropout_attention(x)
            x = self.norm_for_attention(x + x_residual)

            x_residual = x.clone()
            x = self.ff(x)
            x = self.dropout_for_ff(x)
            x = self.norm_for_ff(x + x_residual)
            return x
        
class Encoder(nn.Module):

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 start_token, 
                 end_token, 
                 pad_token
                ):
        """
        The Encoder part of the Transformer architecture
        """
        super().__init__()

        # Layer
        self.input_preprocessing = Preprocessing(max_length_seq, d_model, language_to_index, start_token, end_token, pad_token, dropout)
        
        # Transformer Blocks
        self.transformer_blocks = replicate(TransformerBlock(d_model, num_heads, ff_hidden, dropout, options="encoder"),num_blocks)

    def forward(self, x, self_attention_mask, start_token:bool, end_token:bool):
        # Input Pre-processing: Token Embedding + Positional Encoding
        out = self.input_preprocessing(x, start_token, end_token)

        # Go to Transformer Blocks (Encode)
        for block in self.transformer_blocks:
            out = block(out, self_attention_mask)

        return out
    
class Decoder(nn.Module):

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 start_token, 
                 end_token, 
                 pad_token
                ):
        """
        The Decoder part of the Transformer architecture

        """
        super().__init__()
        
         # Layer
        self.output_preprocessing = Preprocessing(max_length_seq, d_model, language_to_index, start_token, end_token, pad_token, dropout)
        
        # Transformer Blocks
        self.transformer_blocks = replicate(TransformerBlock(d_model, num_heads, ff_hidden, dropout, options="decoder"),num_blocks)

    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token:bool, end_token:bool): 
        # x is output, y is output from encoder
        # Output Pre-processing: Token Embedding + Positional Encoding
        x = self.output_preprocessing(x, start_token, end_token)

        # Go to Transformer Blocks (Decode)
        encode_decode = {'encoder_output': y,'w':x}
        mask = {'self_attention_mask': self_attention_mask,'cross_attention_mask': cross_attention_mask}
        for block in self.transformer_blocks:
            encode_decode['w'] = x
            x = block(encode_decode, mask)
        return x
    
class Transformer(nn.Module):

    def __init__(self,
                 d_model,
                 ff_hidden,
                 num_heads,
                 dropout,
                 num_blocks,
                 max_length_seq,
                 language_to_index,
                 target_language_to_index,
                 start_token, 
                 end_token, 
                 pad_token
                ):
        super().__init__()

        # Device
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        
        # Encoder
        self.encoder = Encoder(
            d_model=d_model,
            ff_hidden=ff_hidden,
            num_heads=num_heads,
            dropout=dropout,
            num_blocks=num_blocks,
            max_length_seq=max_length_seq,
            language_to_index=language_to_index,
            start_token=start_token,
            end_token=end_token,
            pad_token=pad_token
        )

        # Decoder
        self.decoder = Decoder(
            d_model=d_model,
            ff_hidden=ff_hidden,
            num_heads=num_heads,
            dropout=dropout,
            num_blocks=num_blocks,
            max_length_seq=max_length_seq,
            language_to_index=target_language_to_index,
            start_token=start_token,
            end_token=end_token,
            pad_token=pad_token
        )

        # Linear Layer
        self.linear = nn.Linear(d_model, len(target_language_to_index))

        # Softmax
        

    def forward(self,
                x,
                y,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                encoder_start_token=False,
                encoder_end_token=False,
                decoder_start_token=False,
                decoder_end_token=False):
        encoder_output = self.encoder(x, encoder_self_attention_mask, encoder_start_token, encoder_end_token)
        out = self.decoder(y, encoder_output, decoder_self_attention_mask, decoder_cross_attention_mask, decoder_start_token, decoder_end_token)
        out = self.linear(out)
        return out

# Transformer Model - Translator English to Vietnamese - Training

- Import library

In [2]:
import torch
import numpy as np
import warnings
import pandas as pd
import time
from torch.utils.data import Dataset, DataLoader
from torch import nn
from datasets import load_dataset
# from MyTransformer import Transformer, Masking

warnings.filterwarnings('ignore')

- Load Dataset from Hugging Face

In [3]:
dataset = load_dataset("kaitchup/opus-Vietnamese-to-English")
dataset

Downloading readme:   0%|          | 0.00/559 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/138k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/56.5M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/992248 [00:00<?, ? examples/s]

DatasetDict({
    validation: Dataset({
        features: ['text'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['text'],
        num_rows: 992248
    })
})

In [4]:
dataset['train']['text'][0:5]

['Cái gì đó? ###>What is it?',
 "Con nghĩ chúng ta nên đến mái ấm. ###>I thought we would go to the children's home.",
 'Có điều gì cô muốn nói với chồng mình không? ###>Is there something you want to tell your husband?',
 'Thầy của ngươi muốn săn chúng ta, thiêu chúng ta, ăn tim chúng ta. ###>Your master wants to hunt us, burn us, eat our hearts.',
 'Haylàkẻ yếuđuối? ###>Or too weak to see this through?']

In [5]:
sentences_train = list(map(lambda x: x.split('###>'), dataset['train']['text']))
vietnam_sentences_train = list(map(lambda x : x[0], sentences_train))
english_sentences_train = list(map(lambda x : x[1], sentences_train))
len(vietnam_sentences_train), len(english_sentences_train)

(992248, 992248)

In [6]:
sentences_valid = list(map(lambda x: x.split('###>'), dataset['validation']['text']))
vietnam_sentences_valid = list(map(lambda x : x[0], sentences_valid))
english_sentences_valid = list(map(lambda x : x[1], sentences_valid))
len(vietnam_sentences_valid), len(english_sentences_valid)

(2000, 2000)

In [7]:
vietnam_sentences_valid[0:4]

['Anh cũng làm việc cho hắn ta? ',
 'Xin lỡi, hôm nay tôi thấy khó chịu Tối qua tôi đã gặp ác mộng ',
 'Em không cho mụ vinh hạnh đó đâu. ',
 '- Bỏ nó vào túi. ']

In [8]:
english_sentences_valid[0:4]

['You can act as him, too?',
 "I'm sorry. I am nervous today. I had bad dreams.",
 "I wouldn't give her that pleasure. It's up to you.",
 '- Leave that in this bag.']

- Setup vocabulary

In [9]:
START_TOKEN = '<start>'
PADDING_TOKEN = '<pad>'
END_TOKEN = '<end>'

In [10]:
vietnamese_characters = [ ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ',
    'a', 'á', 'à', 'ả', 'ã', 'ạ', 'ă', 'ắ', 'ằ', 'ẳ', 'ẵ', 'ặ', 'â', 'ấ', 'ầ', 'ẩ', 'ẫ', 'ậ',
    'b', 'c', 'd', 'đ', 'e', 'é', 'è', 'ẻ', 'ẽ', 'ẹ', 'ê', 'ế', 'ề', 'ể', 'ễ', 'ệ', 
    'g', 'h', 'i', 'í', 'ì', 'ỉ', 'ĩ', 'ị', 'k', 'l', 'm', 'n', 'o', 'ó', 'ò', 'ỏ', 'õ', 'ọ', 
    'ô', 'ố', 'ồ', 'ổ', 'ỗ', 'ộ', 'ơ', 'ớ', 'ờ', 'ở', 'ỡ', 'ợ', 'p', 'q', 'r', 's', 't', 'u', 
    'ú', 'ù', 'ủ', 'ũ', 'ụ', 'ư', 'ứ', 'ừ', 'ử', 'ữ', 'ự', 'v', 'x', 'y', 'ý', 'ỳ', 'ỷ', 'ỹ', 'ỵ','z','w','f','j'
]

vietnamese_vocabulary = list(set([START_TOKEN] + vietnamese_characters + [char.upper() for char in vietnamese_characters] + [PADDING_TOKEN, END_TOKEN]))
len(vietnamese_vocabulary)

221

In [11]:
english_vocabulary = [ START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                      PADDING_TOKEN, END_TOKEN
]
len(english_vocabulary)

87

- Check vocabulary

In [12]:
def Check_character(sentences,vocabulary):
    missing_character = []
    amount_sentences = 0
    for sentence in sentences:
        check = False
        for c in list(set(sentence)):
            if c not in vocabulary and c not in missing_character:
                missing_character.append(c)
                check = True
        if check:
            amount_sentences += 1
    if len(missing_character) == 0:
        print("Suitable vocabulary!")
        return None
    print(f"Find {missing_character} in vocabulary!")
    return amount_sentences

In [13]:
vietnam_wrong_sentences = Check_character(vietnam_sentences_train,vietnamese_vocabulary)
english_wrong_sentences = Check_character(english_sentences_train,english_vocabulary)

Find ['♫', '̀', '́', '̉', '♪', '̣', '^', '̃', '{', '\\', '}', '»', '«', '̀', '́', '£', '–', 'ð', ';', '@', '[', ']', 'Μ', '\xad', '¡', '°', '×', '§', '\x81', 'Æ', '\x99', '\x8b', '½', '´', '\xa0', 'ª', 'º', '³', 'Ð', '_', 'ß', 'Û', '王', '校', '長', '¶', 'Ü', '¢', '甩', '他', '的', '隊', '防', '開', '了', '員', '守', '歌', '影', '明', '以', '拍', '做', '可', '手', '星', '還', '電', 'ō', '是', '傑', '嘛', '阿', '對', '就', '們', '事', '不', '咱', 'ü', 'Ñ', '¹', 'γ', '’', 'Ë', 'ï', '≤', 'Ä', '\x91', 'ñ', '¯', 'ο', 'ë', 'ä', 'λ', 'ç', '「', '」', '©', 'Ç', '~', 'Þ', 'Η', '®', '合', '嗎', '跟', '照', '我', '¿', '那', '叫', '武', '在', '裡', '學', '功', '夫', '—', '，', '振', '李', '非', '格', '赵', '铎', 'ö', '吧', '噯', '沒', 'Ε', 'Τ', 'Α', '\x9f', 'ħ', 'ā', 'ī', '走', '·', '江', '湖', '永', '啊', '出', '退', 'і', 'ѕ', 'х', 'Ѕ', '有', '沖', '你', '快', '\u202d', '當', '生', '意', '敗', '年', '失', '蕭', '愛', '加', '嵐', '油', '¥', '系', '民', '派', '陈', '谈', '贤', '军', '阀', '国', '庆', 'å', 'µ', '拳', '來', '下', '神', '一', '都', '很', '天', '第', '原', '久', '前', '師', '父', '\x90',

Lots of characters like symbols, words in other languages. So we will try to remove all sentences which have unknown characters. If the amount of removed sentences are not so many, we can apply this. If so many sentences are removed, we should appy another ways like adding tag 'unknown'.

In [14]:
print(f'Train sentences: {len(vietnam_sentences_train)} (vietnam), {len(english_sentences_train)} (english)')
print(f'wrong train sentences: {vietnam_wrong_sentences} (vietnam), {english_wrong_sentences} (english)')

Train sentences: 992248 (vietnam), 992248 (english)
wrong train sentences: 277 (vietnam), 169 (english)


The number of removed sentences is much smaller than the total number of sentences so we can remove them.

In [15]:
def is_valid_sentence(sentence,vocabulary):
    for c in list(set(sentence)):
        if c not in vocabulary:
            return False
    return True

In [16]:
vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_train)):
    if is_valid_sentence(vietnam_sentences_train[i], vietnamese_vocabulary) and is_valid_sentence(english_sentences_train[i], english_vocabulary):
        vn_temp.append(vietnam_sentences_train[i])
        eng_temp.append(english_sentences_train[i])
vietnam_sentences_train = vn_temp
english_sentences_train = eng_temp

vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_valid)):
    if is_valid_sentence(vietnam_sentences_valid[i], vietnamese_vocabulary) and is_valid_sentence(english_sentences_valid[i], english_vocabulary):
        vn_temp.append(vietnam_sentences_valid[i])
        eng_temp.append(english_sentences_valid[i])
vietnam_sentences_valid = vn_temp
english_sentences_valid = eng_temp


# vietnam_wrong_sentences = Check_character(vietnam_sentences_train,vietnamese_vocabulary)
# english_wrong_sentences = Check_character(english_sentences_train,english_vocabulary)
# vietnam_wrong_sentences = Check_character(vietnam_sentences_valid,vietnamese_vocabulary)
# english_wrong_sentences = Check_character(english_sentences_valid,english_vocabulary)

In [17]:
index_to_vietnamese = {k:v for k,v in enumerate(vietnamese_vocabulary)}
vietnamese_to_index = {v:k for k,v in enumerate(vietnamese_vocabulary)}
index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}

- Check Length

In [18]:
df_train = pd.DataFrame({
    'vietnamese_train_length': [len(sentence) for sentence in vietnam_sentences_train],
    'english_train_length': [len(sentence) for sentence in english_sentences_train],
})

df_valid = pd.DataFrame({
    'vietnamese_valid_length': [len(sentence) for sentence in vietnam_sentences_valid],
    'english_valid_length': [len(sentence) for sentence in english_sentences_valid],
})

In [19]:
df_train.describe()

Unnamed: 0,vietnamese_train_length,english_train_length
count,952120.0,952120.0
mean,32.334417,30.988058
std,21.854748,22.082578
min,2.0,1.0
25%,17.0,15.0
50%,27.0,26.0
75%,42.0,40.0
max,274.0,416.0


In [20]:
df_valid.describe()

Unnamed: 0,vietnamese_valid_length,english_valid_length
count,1890.0,1890.0
mean,39.059788,39.02381
std,26.22471,26.75412
min,3.0,3.0
25%,22.0,22.0
50%,33.0,33.0
75%,49.0,49.0
max,190.0,188.0


In [21]:
print( f"{97}th percentile length English: {np.percentile(df_train['english_train_length'].tolist(), 97)}" )
print( f"{97}th percentile length Vietnam: {np.percentile(df_train['vietnamese_train_length'], 97)}" )

97th percentile length English: 86.0
97th percentile length Vietnam: 87.0


In [22]:
MAX_LENGTH = 100

In [23]:
vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_train)):
    if len(vietnam_sentences_train[i]) < MAX_LENGTH - 1 and len(english_sentences_train[i]) < MAX_LENGTH - 1:
        vn_temp.append(vietnam_sentences_train[i])
        eng_temp.append(english_sentences_train[i])
vietnam_sentences_train = vn_temp
english_sentences_train = eng_temp

vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_valid)):
    if len(vietnam_sentences_valid[i]) < MAX_LENGTH - 1 and len(english_sentences_valid[i]) < MAX_LENGTH - 1:
        vn_temp.append(vietnam_sentences_valid[i])
        eng_temp.append(english_sentences_valid[i])
vietnam_sentences_valid = vn_temp
english_sentences_valid = eng_temp

In [24]:
# Save data for another training (save time, for hugging face error)
import os

folder = 'data'
if not os.path.exists(folder):
    os.mkdir(folder)

with open("./data/vietnamese_train.txt", "w",encoding='utf-8') as file:
    for sentence in vietnam_sentences_train:
        file.write(f"{sentence}\n")
with open("./data/vietnamese_valid.txt", "w",encoding='utf-8') as file:
    for sentence in vietnam_sentences_valid:
        file.write(f"{sentence}\n")
with open("./data/english_train.txt", "w",encoding='utf-8') as file:
    for sentence in english_sentences_train:
        file.write(f"{sentence}\n")
with open("./data/english_valid.txt", "w",encoding='utf-8') as file:
    for sentence in english_sentences_valid:
        file.write(f"{sentence}\n")

- Setup DataLoader

In [25]:
class TextDataset(Dataset):

    def __init__(self, english_sentences, vietnam_sentences):
        self.english_sentences = english_sentences
        self.vietnam_sentences = vietnam_sentences

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        return self.english_sentences[idx], self.vietnam_sentences[idx]

In [26]:
data_train = TextDataset(english_sentences_train,vietnam_sentences_train)
print(len(data_train))
data_train[1]

929189


("I thought we would go to the children's home.",
 'Con nghĩ chúng ta nên đến mái ấm. ')

In [27]:
data_valid = TextDataset(english_sentences_valid,vietnam_sentences_valid)
print(len(data_valid))
data_valid[1]

1787


("I'm sorry. I am nervous today. I had bad dreams.",
 'Xin lỡi, hôm nay tôi thấy khó chịu Tối qua tôi đã gặp ác mộng ')

In [28]:
BATCH_SIZE = 30

train_loader = DataLoader(data_train, BATCH_SIZE)
valid_loader = DataLoader(data_valid, BATCH_SIZE)
iterator = iter(train_loader)

In [29]:
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 1:
        break

[('What is it?', "I thought we would go to the children's home.", 'Is there something you want to tell your husband?', 'Your master wants to hunt us, burn us, eat our hearts.', 'Or too weak to see this through?', 'OK.', 'The rest are kept in a secure glass viewing boot at the top of the tower.', "No, I haven't.", "Don't touch me.", 'Come in.', 'We\'re talking Falkor in "The Neverending Story."', "- He's an engineer.", 'Your job is finished.', 'Smart, Nikki.', "They're trying to save the life of a dear friend.", 'Yeah, a book doctor for Megan Vernoff.', "- I'll get the doctor up here.", 'Why dwell on it?', "You're a good lad.", 'Sure.', "I'm trying to protect your train.", "Hey, all I know is you're not safe here anymore.", "We mustn't mourn those who give their lives today.", "We're not murderers, in spite of what this undertaker says.", 'And her smile.', 'But still, there is a way for this deluded Queen to find some redemption.', 'Coulson: Did agent Ward give you anything?', "I'm sorr

- Setup model

In [30]:
model = Transformer(d_model=512,
                    ff_hidden=2048,
                    num_heads=8,
                    dropout=0.1,
                    num_blocks=1,
                    max_length_seq=MAX_LENGTH,
                    language_to_index=english_to_index,
                    target_language_to_index=vietnamese_to_index,
                    start_token=START_TOKEN,
                    end_token=END_TOKEN,
                    pad_token=PADDING_TOKEN
                   )

In [31]:
model

Transformer(
  (encoder): Encoder(
    (input_preprocessing): Preprocessing(
      (token_embedding): TokenEmbedding(
        (embedding_layer): Embedding(87, 512)
      )
      (positional_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer_blocks): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm_for_attention): LayerNormalization()
        (dropout_attention): Dropout(p=0.1, inplace=False)
        (ff): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (n

In [32]:
# Loss Function
criterian = nn.CrossEntropyLoss(ignore_index=vietnamese_to_index[PADDING_TOKEN],
                                reduction='none')

# Initialize weight
for params in model.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

# optimize
optim = torch.optim.Adam(model.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [33]:
# validation
def validation_translator(model, valid_dataloader):
    iterator = iter(valid_dataloader)
    valid_loss = []
    with torch.no_grad():
        for batch_num, batch in enumerate(iterator):
            model.eval()
            language_input = batch[0]
            language_output = batch[1]
            
            # Get mask
            encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = Masking(language_input, language_output, MAX_LENGTH)
    
            # Predict
            predictions = model(language_input,
                                language_output,
                                encoder_self_attention_mask,
                                decoder_self_attention_mask,
                                decoder_cross_attention_mask,
                                encoder_start_token=False,
                                encoder_end_token=False,
                                decoder_start_token=True,
                                decoder_end_token=True)
    
            # Loss
            Truelabels_tokens = model.decoder.output_preprocessing.batch_tokens(batch=language_output,start_token=False,end_token=True)
    
            loss = criterian(
                predictions.view(-1, len(vietnamese_to_index)),
                Truelabels_tokens.view(-1).to(device)
            ).to(device)
            ignore_pad = torch.where(Truelabels_tokens.view(-1) == vietnamese_to_index[PADDING_TOKEN], False, True)
            loss = loss.sum() / ignore_pad.sum()
            valid_loss.append(loss.item())
    return sum(valid_loss) / len(valid_loss)

- Training

In [34]:
model.train()
model.to(device)
loss_train = []
loss_valid = []
history = {}
epochs = 5

for epoch in range(1,epochs+1):
    print(f'Epoch {epoch} ' + '-' * (80 - len(str(epoch))))
    
    # Training
    start = time.time()
    count = 0
    per = 0
    iterator = iter(train_loader)
    length_iter = len(iterator)
    for batch_num, batch in enumerate(iterator):
        # Training mode
        model.train()
        # Reset Gradient from Backward Pass
        optim.zero_grad()

        # Get input/output to encoder/decoder
        language_input = batch[0]
        language_output = batch[1]
        # Get mask
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = Masking(language_input, language_output, MAX_LENGTH)

        # Predict
        predictions = model(language_input,
                            language_output,
                            encoder_self_attention_mask,
                            decoder_self_attention_mask,
                            decoder_cross_attention_mask,
                            encoder_start_token=False,
                            encoder_end_token=False,
                            decoder_start_token=True,
                            decoder_end_token=True)

        # Loss
        Truelabels_tokens = model.decoder.output_preprocessing.batch_tokens(batch=language_output,start_token=False,end_token=True)

        loss = criterian(
            predictions.view(-1, len(vietnamese_to_index)),
            Truelabels_tokens.view(-1).to(device)
        ).to(device)
        ignore_pad = torch.where(Truelabels_tokens.view(-1) == vietnamese_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / ignore_pad.sum()

        # Backward and Optimize
        loss.backward()
        optim.step()

        # Each 10%, model will valid 
        if count == length_iter // 10:
            per += 1
            print(f"{per * 10}% Training Progress: time: {round(time.time() - start,2)} seconds - loss: {loss.item()}")
            start= time.time()
            print(f"- English Input: {language_input[0]}")
            print(f"- Vietnamese True Output: {language_output[0]}")
            # Get Sentence of predictions
            sentence_predict = ""
            for idx in torch.argmax(predictions[0], axis=1):
                id = int(idx)
                if id == vietnamese_to_index[END_TOKEN]:
                    break
                sentence_predict += index_to_vietnamese[id]
            print(f"- Vietnamese Predict: {sentence_predict}")
            valid_loss = validation_translator(model=model, valid_dataloader=valid_loader)
            print(f"- Validation loss: time: {round(time.time() - start,2)} seconds - loss: {valid_loss}",end="\n\n")
            # History
            loss_train.append(loss.item())
            loss_valid.append(valid_loss)
            count = 0
        count += 1   

Epoch 1 -------------------------------------------------------------------------------
10% Training Progress: time: 109.9 seconds - loss: 0.3875023424625397
- English Input: Here's a surveillance file from the DFS in Mexico.
- Vietnamese True Output: Đây là tất cả những gì chúng ta có được từ DFS Mexico. 
- Vietnamese Predict: Đây là tất cả những gì chúng ta có được từ D S Mexn    
- Validation loss: time: 1.32 seconds - loss: 0.2460223543147246

20% Training Progress: time: 109.81 seconds - loss: 0.35357794165611267
- English Input: Y-you know I'm not a terrorist.
- Vietnamese True Output: Cô biết tôi không phải là khủng bố! 
- Vietnamese Predict: Cô biết tôi không phải là khủng tàn 
- Validation loss: time: 1.2 seconds - loss: 0.22229043406744797

30% Training Progress: time: 109.72 seconds - loss: 0.25788751244544983
- English Input: Not till you tell me what the hell you're doing here!
- Vietnamese True Output: Không, đến khi nào con nói với bố con đang làm gì ở đây! 
- Vietnamese

- Build up a Translate Function

In [35]:
def Translate(input_setence):
    model.eval()
    input = (input_setence,)
    output = ("",)
    for index in range(MAX_LENGTH):
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = Masking(input, output, MAX_LENGTH)
        # Predict
        predictions = model(input,
                            output,
                            encoder_self_attention_mask,
                            decoder_self_attention_mask,
                            decoder_cross_attention_mask,
                            encoder_start_token=False,
                            encoder_end_token=False,
                            decoder_start_token=True,
                            decoder_end_token=False)
        next_token_distribution = predictions[0][index]
        next_token_index = torch.argmax(next_token_distribution).item()
        next_token = index_to_vietnamese[next_token_index]
        if next_token == END_TOKEN:
            break
        output = (output[0] + next_token,)
    return output[0]

In [37]:
Translate("Well, uh, how about dead or alive?")

'KK'

- Save model and the weight

In [None]:
torch.save(model, 'translator.pth')
torch.save(model.state_dict(), 'translator_weights.pth')