# Transformer Model - Translator English to Vietnamese - Training

- Import library

In [1]:
import torch
import numpy as np
import warnings
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch import nn
from datasets import load_dataset
from MyTransformer import Transformer

warnings.filterwarnings('ignore')

CUDA is available. PyTorch is using GPU.
Number of GPUs available:  1
GPU name:  NVIDIA GeForce GTX 1650


- Load Dataset from Hugging Face

In [4]:
dataset = load_dataset("kaitchup/opus-Vietnamese-to-English")
dataset

DatasetDict({
    validation: Dataset({
        features: ['text'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['text'],
        num_rows: 992248
    })
})

In [5]:
dataset['train']['text'][0:5]

['Cái gì đó? ###>What is it?',
 "Con nghĩ chúng ta nên đến mái ấm. ###>I thought we would go to the children's home.",
 'Có điều gì cô muốn nói với chồng mình không? ###>Is there something you want to tell your husband?',
 'Thầy của ngươi muốn săn chúng ta, thiêu chúng ta, ăn tim chúng ta. ###>Your master wants to hunt us, burn us, eat our hearts.',
 'Haylàkẻ yếuđuối? ###>Or too weak to see this through?']

In [6]:
sentences_train = list(map(lambda x: x.split('###>'), dataset['train']['text']))
vietnam_sentences_train = list(map(lambda x : x[0], sentences_train))
english_sentences_train = list(map(lambda x : x[1], sentences_train))
len(vietnam_sentences_train), len(english_sentences_train)

(992248, 992248)

In [7]:
sentences_valid = list(map(lambda x: x.split('###>'), dataset['validation']['text']))
vietnam_sentences_valid = list(map(lambda x : x[0], sentences_valid))
english_sentences_valid = list(map(lambda x : x[1], sentences_valid))
len(vietnam_sentences_valid), len(english_sentences_valid)

(2000, 2000)

In [8]:
vietnam_sentences_valid[0:4]

['Anh cũng làm việc cho hắn ta? ',
 'Xin lỡi, hôm nay tôi thấy khó chịu Tối qua tôi đã gặp ác mộng ',
 'Em không cho mụ vinh hạnh đó đâu. ',
 '- Bỏ nó vào túi. ']

In [9]:
english_sentences_valid[0:4]

['You can act as him, too?',
 "I'm sorry. I am nervous today. I had bad dreams.",
 "I wouldn't give her that pleasure. It's up to you.",
 '- Leave that in this bag.']

- Setup vocabulary

In [10]:
START_TOKEN = '<start>'
PADDING_TOKEN = '<pad>'
END_TOKEN = '<end>'

In [11]:
vietnamese_characters = [ ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ',
    'a', 'á', 'à', 'ả', 'ã', 'ạ', 'ă', 'ắ', 'ằ', 'ẳ', 'ẵ', 'ặ', 'â', 'ấ', 'ầ', 'ẩ', 'ẫ', 'ậ',
    'b', 'c', 'd', 'đ', 'e', 'é', 'è', 'ẻ', 'ẽ', 'ẹ', 'ê', 'ế', 'ề', 'ể', 'ễ', 'ệ', 
    'g', 'h', 'i', 'í', 'ì', 'ỉ', 'ĩ', 'ị', 'k', 'l', 'm', 'n', 'o', 'ó', 'ò', 'ỏ', 'õ', 'ọ', 
    'ô', 'ố', 'ồ', 'ổ', 'ỗ', 'ộ', 'ơ', 'ớ', 'ờ', 'ở', 'ỡ', 'ợ', 'p', 'q', 'r', 's', 't', 'u', 
    'ú', 'ù', 'ủ', 'ũ', 'ụ', 'ư', 'ứ', 'ừ', 'ử', 'ữ', 'ự', 'v', 'x', 'y', 'ý', 'ỳ', 'ỷ', 'ỹ', 'ỵ','z','w','f','j'
]

vietnamese_vocabulary = [START_TOKEN] + vietnamese_characters + [char.upper() for char in vietnamese_characters] + [PADDING_TOKEN, END_TOKEN]
len(vietnamese_vocabulary)

253

In [12]:
english_vocabulary = [ START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                      PADDING_TOKEN, END_TOKEN
]
len(english_vocabulary)

87

- Check vocabulary

In [13]:
def Check_character(sentences,vocabulary):
    missing_character = []
    amount_sentences = 0
    for sentence in sentences:
        check = False
        for c in list(set(sentence)):
            if c not in vocabulary and c not in missing_character:
                missing_character.append(c)
                check = True
        if check:
            amount_sentences += 1
    if len(missing_character) == 0:
        print("Suitable vocabulary!")
        return None
    print(f"Find {missing_character} in vocabulary!")
    return amount_sentences

In [12]:
vietnam_wrong_sentences = Check_character(vietnam_sentences_train,vietnamese_vocabulary)
english_wrong_sentences = Check_character(english_sentences_train,english_vocabulary)

Find ['♫', '̀', '́', '̉', '♪', '̣', '^', '̃', '}', '\\', '{', '«', '»', '́', '̀', '£', '–', 'ð', ';', '@', '[', ']', 'Μ', '\xad', '°', '¡', '×', '³', '§', '\x8b', '\x81', '´', 'ª', 'º', '\x99', 'Æ', '\xa0', '½', 'Ð', '_', 'Û', 'ß', '校', '王', '長', '¶', '¢', 'Ü', '隊', '他', '甩', '守', '開', '防', '員', '的', '了', '手', '電', '以', '還', '拍', '影', '星', '可', '做', '歌', '明', 'ō', '嘛', '就', '是', '不', '們', '對', '傑', '咱', '事', '阿', 'ü', '¹', 'Ñ', 'γ', '’', 'Ë', 'ï', '≤', 'Ä', '\x91', 'ñ', '¯', 'ο', 'ë', 'ä', 'λ', 'ç', '」', '「', '©', 'Ç', '~', 'Þ', 'Η', '®', '合', '照', '我', '跟', '嗎', '¿', '叫', '裡', '在', '夫', '學', '武', '功', '那', '—', '，', '振', '非', '李', '格', '赵', '铎', 'ö', '吧', '噯', '沒', 'Α', 'Ε', 'Τ', '\x9f', 'ī', 'ħ', 'ā', '走', '·', '永', '江', '湖', '退', '啊', '出', 'Ѕ', 'ѕ', 'х', 'і', '你', '有', '沖', '快', '\u202d', '生', '年', '敗', '失', '當', '意', '加', '嵐', '油', '愛', '蕭', '¥', '国', '军', '庆', '谈', '贤', '阀', '陈', '民', '派', '系', 'å', 'µ', '天', '久', '原', '神', '拳', '一', '下', '都', '來', '第', '前', '很', '父', '師', '\x87',

Lots of characters like symbols, words in other languages. So we will try to remove all sentences which have unknown characters. If the amount of removed sentences are not so many, we can apply this. If so many sentences are removed, we should appy another ways like adding tag 'unknown'.

In [12]:
print(f'Train sentences: {len(vietnam_sentences_train)} (vietnam), {len(english_sentences_train)} (english)')
print(f'wrong train sentences: {vietnam_wrong_sentences} (vietnam), {english_wrong_sentences} (english)')

Train sentences: 992248 (vietnam), 992248 (english)


NameError: name 'vietnam_wrong_sentences' is not defined

The number of removed sentences is much smaller than the total number of sentences so we can remove them.

In [14]:
def is_valid_sentence(sentence,vocabulary):
    for c in list(set(sentence)):
        if c not in vocabulary:
            return False
    return True

In [15]:
vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_train)):
    if is_valid_sentence(vietnam_sentences_train[i], vietnamese_vocabulary) and is_valid_sentence(english_sentences_train[i], english_vocabulary):
        vn_temp.append(vietnam_sentences_train[i])
        eng_temp.append(english_sentences_train[i])
vietnam_sentences_train = vn_temp
english_sentences_train = eng_temp

vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_valid)):
    if is_valid_sentence(vietnam_sentences_valid[i], vietnamese_vocabulary) and is_valid_sentence(english_sentences_valid[i], english_vocabulary):
        vn_temp.append(vietnam_sentences_valid[i])
        eng_temp.append(english_sentences_valid[i])
vietnam_sentences_valid = vn_temp
english_sentences_valid = eng_temp


# vietnam_wrong_sentences = Check_character(vietnam_sentences_train,vietnamese_vocabulary)
# english_wrong_sentences = Check_character(english_sentences_train,english_vocabulary)
# vietnam_wrong_sentences = Check_character(vietnam_sentences_valid,vietnamese_vocabulary)
# english_wrong_sentences = Check_character(english_sentences_valid,english_vocabulary)

In [16]:
index_to_vietnamese = {k:v for k,v in enumerate(vietnamese_vocabulary)}
vietnamese_to_index = {v:k for k,v in enumerate(vietnamese_vocabulary)}
index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}

- Check Length

In [17]:
df_train = pd.DataFrame({
    'vietnamese_train_length': [len(sentence) for sentence in vietnam_sentences_train],
    'english_train_length': [len(sentence) for sentence in english_sentences_train],
})

df_valid = pd.DataFrame({
    'vietnamese_valid_length': [len(sentence) for sentence in vietnam_sentences_valid],
    'english_valid_length': [len(sentence) for sentence in english_sentences_valid],
})

In [18]:
df_train.describe()

Unnamed: 0,vietnamese_train_length,english_train_length
count,952120.0,952120.0
mean,32.334417,30.988058
std,21.854748,22.082578
min,2.0,1.0
25%,17.0,15.0
50%,27.0,26.0
75%,42.0,40.0
max,274.0,416.0


In [19]:
df_valid.describe()

Unnamed: 0,vietnamese_valid_length,english_valid_length
count,1890.0,1890.0
mean,39.059788,39.02381
std,26.22471,26.75412
min,3.0,3.0
25%,22.0,22.0
50%,33.0,33.0
75%,49.0,49.0
max,190.0,188.0


In [20]:
print( f"{97}th percentile length English: {np.percentile(df_train['english_train_length'].tolist(), 97)}" )
print( f"{97}th percentile length Vietnam: {np.percentile(df_train['vietnamese_train_length'], 97)}" )

97th percentile length English: 86.0
97th percentile length Vietnam: 87.0


In [21]:
MAX_LENGTH = 100

In [22]:
vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_train)):
    if len(vietnam_sentences_train[i]) <= MAX_LENGTH and len(english_sentences_train[i]) <= MAX_LENGTH:
        vn_temp.append(vietnam_sentences_train[i])
        eng_temp.append(english_sentences_train[i])
vietnam_sentences_train = vn_temp
english_sentences_train = eng_temp

vn_temp = []
eng_temp = []
for i in range(0,len(vietnam_sentences_valid)):
    if len(vietnam_sentences_valid[i]) <= MAX_LENGTH and len(english_sentences_valid[i]) <= MAX_LENGTH:
        vn_temp.append(vietnam_sentences_valid[i])
        eng_temp.append(english_sentences_valid[i])
vietnam_sentences_valid = vn_temp
english_sentences_valid = eng_temp

In [23]:
# Save data for another training (save time, for hugging face error)
import os

folder = 'data'
if not os.path.exists(folder):
    os.mkdir(folder)

with open("./data/vietnamese_train.txt", "w",encoding='utf-8') as file:
    for sentence in vietnam_sentences_train:
        file.write(f"{sentence}\n")
with open("./data/vietnamese_valid.txt", "w",encoding='utf-8') as file:
    for sentence in vietnam_sentences_valid:
        file.write(f"{sentence}\n")
with open("./data/english_train.txt", "w",encoding='utf-8') as file:
    for sentence in english_sentences_train:
        file.write(f"{sentence}\n")
with open("./data/english_valid.txt", "w",encoding='utf-8') as file:
    for sentence in english_sentences_valid:
        file.write(f"{sentence}\n")

- Setup DataLoader

In [25]:
def sentence2tokens(sentence:str):
    return [c for c in sentence]

def encode_tokens(tokens, vocab):
    encode_tokens = [vocab[x] for x in tokens]
    return encode_tokens

def preprocessing(sentences, vocab, max_length_seq, pad_char,start_char=None, end_char=None):
    result = []
    for sentence in sentences:
        if start_char != None:
            tokens = [vocab[start_char]] + tokens
        if end_char != None:
            tokens = tokens + [vocab[end_char]]
        for i in range(len(tokens),max_length_seq):
            tokens = tokens + [vocab[pad_char]]
        num_pads = max_length_seq - len(sentence)
        result.append(tokens)
    return torch.tensor(result)

In [22]:
result = preprocessing(vietnam_sentences_valid, vietnamese_to_index, MAX_LENGTH, PADDING_TOKEN,START_TOKEN, END_TOKEN)
result.shape

torch.Size([1790, 102])

In [23]:
class TextDataset(Dataset):

    def __init__(self, english_sentences, vietnam_sentences):
        self.english_sentences = preprocessing(english_sentences, english_to_index, MAX_LENGTH, PADDING_TOKEN, None, None)
        self.vietnam_sentences = preprocessing(vietnam_sentences, vietnamese_to_index, MAX_LENGTH,START_TOKEN, END_TOKEN, PADDING_TOKEN)

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        return self.english_sentences[idx], self.vietnam_sentences[idx]

In [24]:
data_train = TextDataset(english_sentences_train,vietnam_sentences_train)
print(len(data_train))
data_train[1]

931237


(tensor([ 0, 67,  1, 52, 40, 47, 53, 39, 40, 52,  1, 55, 37,  1, 55, 47, 53, 44,
         36,  1, 39, 47,  1, 52, 47,  1, 52, 40, 37,  1, 35, 40, 41, 44, 36, 50,
         37, 46,  8, 51,  1, 40, 47, 45, 37, 15, 85, 85, 85, 85, 85, 85, 85, 85,
         85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85,
         85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85,
         85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 86]),
 tensor([  0, 177,  79,  78, 126,  78,  67,  68,  73, 126,  52,  68, 103,  78,
          67, 126, 101,  33, 126,  78,  61,  78, 126,  54,  62,  78, 126,  77,
          34,  69, 126,  46,  77, 140, 126, 251, 251, 251, 251, 251, 251, 251,
         251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251,
         251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251,
         251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251,
         251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251

In [25]:
data_valid = TextDataset(english_sentences_valid,vietnam_sentences_valid)
print(len(data_valid))
data_valid[1]

1790


(tensor([ 0, 67,  8, 45,  1, 51, 47, 50, 50, 57, 15,  1, 67,  1, 33, 45,  1, 46,
         37, 50, 54, 47, 53, 51,  1, 52, 47, 36, 33, 57, 15,  1, 67,  1, 40, 33,
         36,  1, 34, 33, 36,  1, 36, 50, 37, 33, 45, 51, 15, 85, 85, 85, 85, 85,
         85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85,
         85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85,
         85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 86]),
 tensor([  0, 240,  69,  78, 126,  76,  95,  69, 138, 126,  68,  85,  77, 126,
          78,  33, 116, 126, 101,  85,  69, 126, 101,  68,  46, 116, 126,  75,
          68,  80, 126,  52,  68,  74, 102, 126, 226,  86,  69, 126,  98, 102,
          33, 126, 101,  85,  69, 126,  54,  37, 126,  67,  44,  97, 126,  34,
          52, 126,  77,  90,  78,  67, 126, 251, 251, 251, 251, 251, 251, 251,
         251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251,
         251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251

In [26]:
BATCH_SIZE = 30

train_loader = DataLoader(data_train, BATCH_SIZE)
valid_loader = DataLoader(data_valid, len(data_valid))
iterator = iter(train_loader)

In [27]:
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 1:
        break

[tensor([[ 0, 81, 40,  ..., 85, 85, 86],
        [ 0, 67,  1,  ..., 85, 85, 86],
        [ 0, 67, 51,  ..., 85, 85, 86],
        ...,
        [ 0, 67,  8,  ..., 85, 85, 86],
        [ 0, 67, 52,  ..., 85, 85, 86],
        [ 0, 73, 40,  ..., 85, 85, 86]]), tensor([[  0, 177,  34,  ..., 251, 251, 252],
        [  0, 177,  79,  ..., 251, 251, 252],
        [  0, 177,  80,  ..., 251, 251, 252],
        ...,
        [  0, 226,  85,  ..., 251, 251, 252],
        [  0, 177,  36,  ..., 251, 251, 252],
        [  0, 101,  85,  ..., 251, 251, 252]])]
[tensor([[ 0, 70, 37,  ..., 85, 85, 86],
        [ 0, 83, 47,  ..., 85, 85, 86],
        [ 0, 59, 51,  ..., 85, 85, 86],
        ...,
        [ 0, 59, 46,  ..., 85, 85, 86],
        [ 0, 67,  1,  ..., 85, 85, 86],
        [ 0, 67,  1,  ..., 85, 85, 86]]), tensor([[  0, 193,  37,  ..., 251, 251, 252],
        [  0, 179,  87,  ..., 251, 251, 252],
        [  0, 177,  68,  ..., 251, 251, 252],
        ...,
        [  0, 177,  80,  ..., 251, 251, 252],


- Setup model

In [28]:
model = Transformer(d_model=512,
                    vocab_size=len(english_vocabulary),
                    target_vocab_size=len(vietnamese_vocabulary),
                    max_length_seq=MAX_LENGTH + 2,
                    num_blocks=2,
                    expansion_factor=4,
                    num_heads=8
                   )

In [29]:
model

Transformer(
  (encoder): Encoder(
    (dropout): Dropout(p=0.1, inplace=False)
    (token_emb): TokenEmbedding(
      (embedding_layer): Embedding(87, 512)
    )
    (pos_encode): PositionalEncoding(
      (dropout): Dropout(p=0, inplace=False)
    )
    (transformer_blocks): ModuleList(
      (0-1): 2 x TransformerBlock(
        (multihead_attention): MultiHeadAttention(
          (query): Linear(in_features=64, out_features=64, bias=False)
          (key): Linear(in_features=64, out_features=64, bias=False)
          (value): Linear(in_features=64, out_features=64, bias=False)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)

In [30]:
# Loss Function
criterian = nn.CrossEntropyLoss(ignore_index=vietnamese_to_index[PADDING_TOKEN],
                                reduction='none')

# Initialize weight
for params in model.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

# optimize
optim = torch.optim.Adam(model.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

- Training

In [31]:
model.train()
model.to(device)
loss_train = []
loss_valid = []
history = {}
epochs = 5

for epoch in range(1,epochs+1):
    print(f'Epoch {epoch} ' + '-' * (20- len(str(epoch))))
    iterator = iter(train_loader)
    length_iter = len(iterator)
    # Training
    for batch_num, batch in enumerate(iterator):
        # Training mode
        model.train()
        # Reset Gradient from Backward Pass
        optim.zero_grad()

        # Predict
        language_input = batch[0].to(device)
        language_output = batch[1].to(device)
        predictions = model(language_input, language_output)

        # Loss
        labels = language_output.view(-1).to(device)
        loss = criterian(
            predictions.view(-1, len(vietnamese_vocabulary)),
            labels
        ).to(device)
        ignore_pad = torch.where(labels == vietnamese_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / ignore_pad.sum()

        # Backward and Optimize
        loss.backward()
        optim.step()

        # History
        loss_train.append(loss.item())

    # Validation
    with torch.no_grad():
        iterator = iter(valid_loader)
        for batch_num, batch in enumerate(iterator):
            # Valid model
            model.eval()

            # Predict
            language_input = batch[0].to(device)
            language_output = batch[1].to(device)
            predictions = model(language_input, language_output)

            # Loss
            labels = language_output.view(-1).to(device)
            loss = criterian(
                predictions.view(-1, len(vietnamese_vocabulary)),
                labels
            ).to(device)
            ignore_pad = torch.where(labels == vietnamese_to_index[PADDING_TOKEN], False, True)
            loss = loss.sum() / ignore_pad.sum()

            # History
            loss_valid.append(loss.item())

    # Save history
    history[epoch] = [sum(loss_train) / len(loss_train), sum(loss_valid) / len(loss_valid)]

    # Result Train & Valid
    print(f'{length_iter}/{length_iter}: Training loss: {history[epoch][0]} - Validation loss: {history[epoch][1]}')   

Epoch 1 -------------------


KeyboardInterrupt: 

{1: [1, 2, 3]}