# Transformer Model - Translator English to Vietnamese - Training

- Import library

In [1]:
import torch
import numpy as np
import warnings
import pandas as pd
import time
from torch.utils.data import Dataset, DataLoader
from torch import nn
from datasets import load_dataset
from MyTransformer import Transformer, Masking

warnings.filterwarnings('ignore')

- Load Dataset from Hugging Face

In [2]:
dataset = load_dataset("kaitchup/opus-Vietnamese-to-English")
dataset

DatasetDict({
    validation: Dataset({
        features: ['text'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['text'],
        num_rows: 992248
    })
})

In [3]:
dataset['train']['text'][0:5]

['Cái gì đó? ###>What is it?',
 "Con nghĩ chúng ta nên đến mái ấm. ###>I thought we would go to the children's home.",
 'Có điều gì cô muốn nói với chồng mình không? ###>Is there something you want to tell your husband?',
 'Thầy của ngươi muốn săn chúng ta, thiêu chúng ta, ăn tim chúng ta. ###>Your master wants to hunt us, burn us, eat our hearts.',
 'Haylàkẻ yếuđuối? ###>Or too weak to see this through?']

In [4]:
sentences_train = list(map(lambda x: x.split('###>'), dataset['train']['text']))
vietnam_sentences_train = list(map(lambda x : x[0], sentences_train))
english_sentences_train = list(map(lambda x : x[1], sentences_train))
len(vietnam_sentences_train), len(english_sentences_train)

(992248, 992248)

In [5]:
sentences_valid = list(map(lambda x: x.split('###>'), dataset['validation']['text']))
vietnam_sentences_valid = list(map(lambda x : x[0], sentences_valid))
english_sentences_valid = list(map(lambda x : x[1], sentences_valid))
len(vietnam_sentences_valid), len(english_sentences_valid)

(2000, 2000)

In [6]:
vietnam_sentences_valid[0:4]

['Anh cũng làm việc cho hắn ta? ',
 'Xin lỡi, hôm nay tôi thấy khó chịu Tối qua tôi đã gặp ác mộng ',
 'Em không cho mụ vinh hạnh đó đâu. ',
 '- Bỏ nó vào túi. ']

In [7]:
english_sentences_valid[0:4]

['You can act as him, too?',
 "I'm sorry. I am nervous today. I had bad dreams.",
 "I wouldn't give her that pleasure. It's up to you.",
 '- Leave that in this bag.']

- Setup vocabulary

In [8]:
START_TOKEN = '<start>'
PADDING_TOKEN = '<pad>'
END_TOKEN = '<end>'

In [9]:
vietnamese_characters = [ ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ',
    'a', 'á', 'à', 'ả', 'ã', 'ạ', 'ă', 'ắ', 'ằ', 'ẳ', 'ẵ', 'ặ', 'â', 'ấ', 'ầ', 'ẩ', 'ẫ', 'ậ',
    'b', 'c', 'd', 'đ', 'e', 'é', 'è', 'ẻ', 'ẽ', 'ẹ', 'ê', 'ế', 'ề', 'ể', 'ễ', 'ệ', 
    'g', 'h', 'i', 'í', 'ì', 'ỉ', 'ĩ', 'ị', 'k', 'l', 'm', 'n', 'o', 'ó', 'ò', 'ỏ', 'õ', 'ọ', 
    'ô', 'ố', 'ồ', 'ổ', 'ỗ', 'ộ', 'ơ', 'ớ', 'ờ', 'ở', 'ỡ', 'ợ', 'p', 'q', 'r', 's', 't', 'u', 
    'ú', 'ù', 'ủ', 'ũ', 'ụ', 'ư', 'ứ', 'ừ', 'ử', 'ữ', 'ự', 'v', 'x', 'y', 'ý', 'ỳ', 'ỷ', 'ỹ', 'ỵ','z','w','f','j'
]

vietnamese_vocabulary = list(set([START_TOKEN] + vietnamese_characters + [char.upper() for char in vietnamese_characters] + [PADDING_TOKEN, END_TOKEN]))
len(vietnamese_vocabulary)

221

In [10]:
english_vocabulary = [ START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                      PADDING_TOKEN, END_TOKEN
]
len(english_vocabulary)

87

- Check vocabulary

In [11]:
def Check_character(sentences,vocabulary):
    missing_character = []
    amount_sentences = 0
    for sentence in sentences:
        check = False
        for c in list(set(sentence)):
            if c not in vocabulary and c not in missing_character:
                missing_character.append(c)
                check = True
        if check:
            amount_sentences += 1
    if len(missing_character) == 0:
        print("Suitable vocabulary!")
        return None
    print(f"Find {missing_character} in vocabulary!")
    return amount_sentences

In [12]:
vietnam_wrong_sentences = Check_character(vietnam_sentences_train,vietnamese_vocabulary)
english_wrong_sentences = Check_character(english_sentences_train,english_vocabulary)

Find ['♫', '̀', '́', '̉', '♪', '̣', '^', '̃', '\\', '}', '{', '»', '«', '̀', '́', '£', '–', 'ð', ';', '@', ']', '[', 'Μ', '°', '×', '\xad', '¡', 'ª', '§', '\x99', '\x81', '\xa0', '\x8b', '³', '´', 'Æ', 'º', '½', 'Ð', '_', 'Û', 'ß', '王', '校', '長', '¶', 'Ü', '¢', '甩', '了', '他', '防', '員', '的', '守', '開', '隊', '星', '影', '電', '可', '明', '做', '手', '以', '歌', '還', '拍', 'ō', '傑', '們', '就', '是', '阿', '咱', '對', '事', '不', '嘛', 'ü', '¹', 'Ñ', 'γ', '’', 'Ë', 'ï', '≤', '\x91', 'Ä', 'ñ', '¯', 'ο', 'ë', 'ä', 'λ', 'ç', '」', '「', '©', 'Ç', '~', 'Þ', 'Η', '®', '合', '嗎', '跟', '我', '照', '¿', '武', '叫', '裡', '學', '夫', '那', '功', '在', '—', '，', '赵', '非', '格', '铎', '李', '振', 'ö', '噯', '沒', '吧', 'Τ', 'Ε', 'Α', '\x9f', 'ī', 'ā', 'ħ', '走', '·', '永', '退', '江', '出', '湖', '啊', 'Ѕ', 'ѕ', 'і', 'х', '沖', '有', '快', '你', '\u202d', '年', '當', '敗', '生', '意', '失', '加', '嵐', '蕭', '愛', '油', '¥', '阀', '谈', '国', '民', '贤', '庆', '派', '陈', '军', '系', 'å', 'µ', '第', '下', '拳', '很', '前', '神', '一', '天', '都', '久', '來', '原', '師', '父', '\x87',

Lots of characters like symbols, words in other languages. So we will try to remove all sentences which have unknown characters. If the amount of removed sentences are not so many, we can apply this. If so many sentences are removed, we should appy another ways like adding tag 'unknown'.

In [13]:
print(f'Train sentences: {len(vietnam_sentences_train)} (vietnam), {len(english_sentences_train)} (english)')
print(f'wrong train sentences: {vietnam_wrong_sentences} (vietnam), {english_wrong_sentences} (english)')

Train sentences: 992248 (vietnam), 992248 (english)
wrong train sentences: 277 (vietnam), 169 (english)


The number of removed sentences is much smaller than the total number of sentences so we can remove them.

In [14]:
def is_valid_sentence(sentence,vocabulary):
    for c in list(set(sentence)):
        if c not in vocabulary:
            return False
    return True

In [15]:
vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_train)):
    if is_valid_sentence(vietnam_sentences_train[i], vietnamese_vocabulary) and is_valid_sentence(english_sentences_train[i], english_vocabulary):
        vn_temp.append(vietnam_sentences_train[i])
        eng_temp.append(english_sentences_train[i])
vietnam_sentences_train = vn_temp
english_sentences_train = eng_temp

vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_valid)):
    if is_valid_sentence(vietnam_sentences_valid[i], vietnamese_vocabulary) and is_valid_sentence(english_sentences_valid[i], english_vocabulary):
        vn_temp.append(vietnam_sentences_valid[i])
        eng_temp.append(english_sentences_valid[i])
vietnam_sentences_valid = vn_temp
english_sentences_valid = eng_temp


# vietnam_wrong_sentences = Check_character(vietnam_sentences_train,vietnamese_vocabulary)
# english_wrong_sentences = Check_character(english_sentences_train,english_vocabulary)
# vietnam_wrong_sentences = Check_character(vietnam_sentences_valid,vietnamese_vocabulary)
# english_wrong_sentences = Check_character(english_sentences_valid,english_vocabulary)

In [16]:
index_to_vietnamese = {k:v for k,v in enumerate(vietnamese_vocabulary)}
vietnamese_to_index = {v:k for k,v in enumerate(vietnamese_vocabulary)}
index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}

- Check Length

In [17]:
df_train = pd.DataFrame({
    'vietnamese_train_length': [len(sentence) for sentence in vietnam_sentences_train],
    'english_train_length': [len(sentence) for sentence in english_sentences_train],
})

df_valid = pd.DataFrame({
    'vietnamese_valid_length': [len(sentence) for sentence in vietnam_sentences_valid],
    'english_valid_length': [len(sentence) for sentence in english_sentences_valid],
})

In [18]:
df_train.describe()

Unnamed: 0,vietnamese_train_length,english_train_length
count,952120.0,952120.0
mean,32.334417,30.988058
std,21.854748,22.082578
min,2.0,1.0
25%,17.0,15.0
50%,27.0,26.0
75%,42.0,40.0
max,274.0,416.0


In [19]:
df_valid.describe()

Unnamed: 0,vietnamese_valid_length,english_valid_length
count,1890.0,1890.0
mean,39.059788,39.02381
std,26.22471,26.75412
min,3.0,3.0
25%,22.0,22.0
50%,33.0,33.0
75%,49.0,49.0
max,190.0,188.0


In [20]:
print( f"{97}th percentile length English: {np.percentile(df_train['english_train_length'].tolist(), 97)}" )
print( f"{97}th percentile length Vietnam: {np.percentile(df_train['vietnamese_train_length'], 97)}" )

97th percentile length English: 86.0
97th percentile length Vietnam: 87.0


In [21]:
MAX_LENGTH = 100

In [22]:
vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_train)):
    if len(vietnam_sentences_train[i]) < MAX_LENGTH - 1 and len(english_sentences_train[i]) < MAX_LENGTH - 1:
        vn_temp.append(vietnam_sentences_train[i])
        eng_temp.append(english_sentences_train[i])
vietnam_sentences_train = vn_temp
english_sentences_train = eng_temp

vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_valid)):
    if len(vietnam_sentences_valid[i]) < MAX_LENGTH - 1 and len(english_sentences_valid[i]) < MAX_LENGTH - 1:
        vn_temp.append(vietnam_sentences_valid[i])
        eng_temp.append(english_sentences_valid[i])
vietnam_sentences_valid = vn_temp
english_sentences_valid = eng_temp

In [23]:
# Save data for another training (save time, for hugging face error)
import os

folder = 'data'
if not os.path.exists(folder):
    os.mkdir(folder)

with open("./data/vietnamese_train.txt", "w",encoding='utf-8') as file:
    for sentence in vietnam_sentences_train:
        file.write(f"{sentence}\n")
with open("./data/vietnamese_valid.txt", "w",encoding='utf-8') as file:
    for sentence in vietnam_sentences_valid:
        file.write(f"{sentence}\n")
with open("./data/english_train.txt", "w",encoding='utf-8') as file:
    for sentence in english_sentences_train:
        file.write(f"{sentence}\n")
with open("./data/english_valid.txt", "w",encoding='utf-8') as file:
    for sentence in english_sentences_valid:
        file.write(f"{sentence}\n")

- Setup DataLoader

In [88]:
class TextDataset(Dataset):

    def __init__(self, english_sentences, vietnam_sentences):
        self.english_sentences = english_sentences
        self.vietnam_sentences = vietnam_sentences

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        return self.english_sentences[idx], self.vietnam_sentences[idx]

In [89]:
data_train = TextDataset(english_sentences_train[:10000],vietnam_sentences_train[:10000])
print(len(data_train))
data_train[1]

10000


("I thought we would go to the children's home.",
 'Con nghĩ chúng ta nên đến mái ấm. ')

In [90]:
data_valid = TextDataset(english_sentences_valid,vietnam_sentences_valid)
print(len(data_valid))
data_valid[1]

1787


("I'm sorry. I am nervous today. I had bad dreams.",
 'Xin lỡi, hôm nay tôi thấy khó chịu Tối qua tôi đã gặp ác mộng ')

In [91]:
BATCH_SIZE = 30

train_loader = DataLoader(data_train, BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(data_valid, BATCH_SIZE)
iterator = iter(train_loader)

In [92]:
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 1:
        break

[('Randa!', "- Nothing, really, I'm--", '- We liberated it.', "It's no good.", "That's the coolest thing that ever happened to me.", 'And credit card statements.', 'Looks like she survived the procedure.', 'How do you know what kind of goddamn day it is?', "You've got...", 'I have never done that before.', 'Awesome.', "I'm here!", 'Am I right?', 'Please stop.', 'Hi.', 'Everything in this world is magic, except to the magician.', 'Kingpin.', 'Even if your intentions are good, it can backfire drastically.', 'That bugger moved so fast,', 'More supporting evidence?', "I'm sitting right here.", 'Can I have your card?', "I'll see you later.", '- Who are you?', "Doesn't matter what will trigger of the bombs.", 'Dancing.', '- Yeah. You look after our place for me, all right?', '- Reddick have to call in sick?', "Ace, let's party!", "You're correct."), ('Randa! ', '- Thật ra thì không có gì. ', '- Nó đã được giải phóng. ', 'Không được. ', 'Đây là chuyện tuyệt với nhất từng xảy ra với tôi. ', 'V

- Setup model

In [93]:
model = Transformer(d_model=512,
                    ff_hidden=2048,
                    num_heads=8,
                    dropout=0.1,
                    num_blocks=1,
                    max_length_seq=MAX_LENGTH,
                    language_to_index=english_to_index,
                    target_language_to_index=vietnamese_to_index,
                    start_token=START_TOKEN,
                    end_token=END_TOKEN,
                    pad_token=PADDING_TOKEN
                   )

In [94]:
model

Transformer(
  (encoder): Encoder(
    (input_preprocessing): Preprocessing(
      (token_embedding): TokenEmbedding(
        (embedding_layer): Embedding(87, 512)
      )
      (positional_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer_blocks): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm_for_attention): LayerNormalization()
        (dropout_attention): Dropout(p=0.1, inplace=False)
        (ff): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (n

In [95]:
# Loss Function
criterian = nn.CrossEntropyLoss(ignore_index=vietnamese_to_index[PADDING_TOKEN],
                                reduction='none')

# Initialize weight
for params in model.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

# optimize
optim = torch.optim.Adam(model.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [96]:
# validation
def validation_translator(model, valid_dataloader):
    iterator = iter(valid_dataloader)
    valid_loss = []
    with torch.no_grad():
        for batch_num, batch in enumerate(iterator):
            model.eval()
            language_input = batch[0]
            language_output = batch[1]
            
            # Get mask
            encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = Masking(language_input, language_output, MAX_LENGTH)
    
            # Predict
            predictions = model(language_input,
                                language_output,
                                encoder_self_attention_mask,
                                decoder_self_attention_mask,
                                decoder_cross_attention_mask,
                                encoder_start_token=False,
                                encoder_end_token=False,
                                decoder_start_token=True,
                                decoder_end_token=True)
    
            # Loss
            Truelabels_tokens = model.decoder.output_preprocessing.batch_tokens(batch=language_output,start_token=False,end_token=True)
    
            loss = criterian(
                predictions.view(-1, len(vietnamese_to_index)),
                Truelabels_tokens.view(-1).to(device)
            ).to(device)
            ignore_pad = torch.where(Truelabels_tokens.view(-1) == vietnamese_to_index[PADDING_TOKEN], False, True)
            loss = loss.sum() / ignore_pad.sum()
            valid_loss.append(loss.item())
    return sum(valid_loss) / len(valid_loss)

- Training

In [98]:
model.train()
model.to(device)
loss_train = []
loss_valid = []
history = {}
epochs = 5

for epoch in range(1,epochs+1):
    print(f'Epoch {epoch} ' + '-' * (80 - len(str(epoch))))
    
    # Training
    start = time.time()
    count = 0
    per = 0
    iterator = iter(train_loader)
    length_iter = len(iterator)
    for batch_num, batch in enumerate(iterator):
        # Training mode
        model.train()
        # Reset Gradient from Backward Pass
        optim.zero_grad()

        # Get input/output to encoder/decoder
        language_input = batch[0]
        language_output = batch[1]
        # Get mask
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = Masking(language_input, language_output, MAX_LENGTH)

        # Predict
        predictions = model(language_input,
                            language_output,
                            encoder_self_attention_mask,
                            decoder_self_attention_mask,
                            decoder_cross_attention_mask,
                            encoder_start_token=False,
                            encoder_end_token=False,
                            decoder_start_token=True,
                            decoder_end_token=True)

        # Loss
        Truelabels_tokens = model.decoder.output_preprocessing.batch_tokens(batch=language_output,start_token=False,end_token=True)

        loss = criterian(
            predictions.view(-1, len(vietnamese_to_index)),
            Truelabels_tokens.view(-1).to(device)
        ).to(device)
        ignore_pad = torch.where(Truelabels_tokens.view(-1) == vietnamese_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / ignore_pad.sum()

        # Backward and Optimize
        loss.backward()
        optim.step()

        # Each 10%, model will valid 
        if count == length_iter // 10:
            per += 1
            print(f"{per * 10}% Training Progress: time: {round(time.time() - start,2)} seconds - loss: {loss.item()}")
            start= time.time()
            print(f"- English Input: {language_input[0]}")
            print(f"- Vietnamese True Output: {language_output[0]}")
            # Get Sentence of predictions
            sentence_predict = ""
            for idx in torch.argmax(predictions[0], axis=1):
                id = int(idx)
                if id == vietnamese_to_index[END_TOKEN]:
                    break
                sentence_predict += index_to_vietnamese[id]
            print(f"- Vietnamese Predict: {sentence_predict}")
            # valid_loss = validation_translator(model=model, valid_dataloader=valid_loader)
            # print(f"- Validation loss: time: {round(time.time() - start,2)} seconds - loss: {valid_loss}",end="\n\n")
            # History
            loss_train.append(loss.item())
            # loss_valid.append(valid_loss)
            count = 0
        count += 1   

Epoch 1 -------------------------------------------------------------------------------
10% Training Progress: time: 18.41 seconds - loss: 3.0740585327148438
- English Input: I, who lived at that time, would know better.
- Vietnamese True Output: Tình cảnh lúc đó chỉ có ta là rõ nhất. 
- Vietnamese Predict: Cô   c    chn ci c i c  n  tn ccn
20% Training Progress: time: 16.9 seconds - loss: 2.970031499862671
- English Input: - Mom.
- Vietnamese True Output: - Mẹ. 
- Vietnamese Predict: T h   
30% Training Progress: time: 17.11 seconds - loss: 2.693114995956421
- English Input: Don't go anywhere.
- Vietnamese True Output: Đừng đi đâu cả. 
- Vietnamese Predict: Cn g ci nnn đh  
40% Training Progress: time: 17.79 seconds - loss: 2.8159708976745605
- English Input: This reminds me of home when I was a kid
- Vietnamese True Output: ở đây làm cho tôi nhớ đến gia đình hồi tôi còn nhỏ 
- Vietnamese Predict: C ti  tn  t   thi c   m n n   t    t   
50% Training Progress: time: 17.04 seconds - los

- Build up a Translate Function

In [116]:
def Translate(input_setence):
    model.eval()
    input = (input_setence,)
    output = ("",)
    for index in range(MAX_LENGTH):
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = Masking(input, output, MAX_LENGTH)
        # Predict
        predictions = model(input,
                            output,
                            encoder_self_attention_mask,
                            decoder_self_attention_mask,
                            decoder_cross_attention_mask,
                            encoder_start_token=False,
                            encoder_end_token=False,
                            decoder_start_token=True,
                            decoder_end_token=False)
        next_token_distribution = predictions[0][index]
        next_token_index = torch.argmax(next_token_distribution)
        next_token = index_to_vietnamese[int(next_token_index)]
        if next_token == END_TOKEN:
            break
        output = (output[0] + next_token,)
    return output[0]

In [125]:
Translate("Hey Bro pleassssase help!")

'OOOOOOO'

- Save model and the weight

In [126]:
torch.save(model, 'translator.pth')
torch.save(model.state_dict(), 'translator_weights.pth')