# A3 - Neural Machine Translation (English to Myanmar)

In [63]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext, datasets, math
from tqdm import tqdm # progress bar
import pickle

## 1. ETL: Load Data

In [64]:
dataset = datasets.load_dataset('alt')

In [65]:
dataset

DatasetDict({
    train: Dataset({
        features: ['SNT.URLID', 'SNT.URLID.SNTID', 'url', 'translation'],
        num_rows: 18088
    })
    validation: Dataset({
        features: ['SNT.URLID', 'SNT.URLID.SNTID', 'url', 'translation'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['SNT.URLID', 'SNT.URLID.SNTID', 'url', 'translation'],
        num_rows: 1019
    })
})

In [66]:
dataset['train']

Dataset({
    features: ['SNT.URLID', 'SNT.URLID.SNTID', 'url', 'translation'],
    num_rows: 18088
})

In [67]:
dataset['train'][12]

{'SNT.URLID': '87564',
 'SNT.URLID.SNTID': '13',
 'url': 'http://en.wikinews.org/wiki/Data_for_3_million_UK_driving_candidates_lost',
 'translation': {'bg': 'অক্টোবরে এইচএম রেভিনিউ ২৫ মিলিয়ন লোকের তথ্য হারিয়ে ফেলার পরে এটিই হল ইউকে-তে প্রথম এত বড় তথ্যের ক্ষতি।',
  'en': 'It is the first major loss of data in the UK since information on 25 million people was lost by HM Revenue in October.',
  'en_tok': 'It is the first major loss of data in the UK since information on 25 million people was lost by HM Revenue in October .',
  'fil': 'Ito ang unang malakihang pagkawala ng data sa UK dahil ang impormasyon sa 25 milyong tao ay nawalan ng HM Revenue noong Oktubre.',
  'hi': 'यह ब्रिटेन में डेटा का पहला बड़ा नुकसान है क्योंकि अक्टूबर में HM रेवेन्यू द्वारा 25 मिलियन लोगों की जानकारी गुम हो गई थी।',
  'id': 'Ini adalah kehilangan data yang besar pertama di UK sejak hilangnya informasi tentang 25 juta orang oleh HM Revenue di bulan Oktober.',
  'ja': 'これは、10月に歳入関税庁が2500万人分の情報を失って以来初めてのイギリスでの大きな

In [68]:
dataset['train'][11]['translation']

{'bg': 'গতকাল ১৭৩০ ইউটিসি-তে হাউস অফ্\u200c কমন্\u200cস্\u200c-এ ইউকে-র পরিবহন সচিব রুথ কেলি এই তথ্যগুলি দিয়েছেন।',
 'en': 'Details were given by the UK Transport Secretary, Ruth Kelly, in the House of Commons at 1730 UTC yesterday.',
 'en_tok': 'Details were given by the UK Transport Secretary , Ruth Kelly , in the House of Commons at 1730 UTC yesterday .',
 'fil': 'Ang mga detalye ay ibinigay ng UK transport Secretary, na si Ruth Kelly, sa House of Commons sa ika-17:30 UTC kahapon.',
 'hi': 'कल ब्रिटेन के परिवहन सचिव रूथ केली द्वारा 1730 UTC पर हाउस ऑफ़ कॉमन्स में विवरण दिए गए।',
 'id': 'Detil diberikan oleh Sekretaris Kementerian Transportasi UK, Ruth Kelly, di Dewan Perwakilan Rakyat kemarin 17:30 UTC.',
 'ja': '詳細は昨日UTC17時30分、英国議会でイギリスのルス・ケリー運輸大臣によって伝えられた。',
 'khm': 'ព័ត៌មានលំអិតត្រូវបានផ្តល់ដោយរដ្ឋមន្ត្រីដឹកជញ្ចូន លោករ៉ូថ ខេលលី នៅក្នុងសភានៅម៉ោង1730ម្សិលមិញ។',
 'lo': 'ຂໍ້ມູນໄດ້ຖືກສະໜອງໂດຍ ເລຂາທິການຂົນສົ່ງ ສະຫະລາຊະອານາຈັກ ຣູດ ເຄລີ່ ໃນສະພາຕໍ່າ ທີ່ 1730 UTC ມື້ວານນີ້.',
 'ms': 'Butir

In [69]:
dataset['train'][11]['translation']['en']

'Details were given by the UK Transport Secretary, Ruth Kelly, in the House of Commons at 1730 UTC yesterday.'

In [70]:
dataset['train'][11]['translation']['my']

'အသေးစိတ်များ ကို မနေ့က ၁၇၃၀ ယူတီစီ ၌ အောက်လွှတ်တော် ရှိ ဗြိတိန်နိုင်ငံ ပို့ဆောင်ရေး အတွင်းရေးမှူး ရုသ်ကယ်လီ က ပေးခဲ့သည် ။'

Create a new dataset that keeps only two languages from each translation record: English as the source language and Myanmar as the target language.

In [71]:
# Source language is English, target language is Myanmar
SRC_LANGUAGE, TRG_LANGUAGE = 'en', 'my'
languages = [SRC_LANGUAGE, TRG_LANGUAGE]

# For every split in `dataset`, keep only the two languages above from each
# row's 'translation' mapping.
datasetENMY = {}
for data in dataset:
    datasetENMY[data] = [
        dict((lang, row['translation'][lang]) for lang in languages)
        for row in dataset[data]
    ]

In [72]:
# Report how many sentence pairs each split contains
for data in datasetENMY:
    print(data, ":", len(datasetENMY[data]))

train : 18088
validation : 1000
test : 1019


In [73]:
sample = datasetENMY['train'][100]

In [74]:
sample[SRC_LANGUAGE]

'The TimesOnline reports that the joke fell flat with Jeffrey Turner, who as Chief of Police in Clayton County, Georgia, put Mr Whitton on medical leave when he was shot in the wrist as he tried to foil a robbery earlier this summer.'

In [75]:
sample[TRG_LANGUAGE]

'ယခု နွေရာသီ အစောပိုင်း ကာလ ၌ လုယက်မှု တစ်ခု ကျူးလွန် ရန် ကြိုးစားခဲ့ သောကြောင့် လက်ကောက်၀တ် တွင် သေနတ်ကျည်မှန်ခဲ့သော မစ်စတာ ၀ှစ်တွန် ကို ကလေတွန် ကောင်တီ ၊ ဂျော်ဂျီယာပြည်နယ် မှ ၊ ရဲမှူးကြီး ဂျက်ဖရီ တာနာ မှ ဆေး ခွင့် ပေးတာနှင့် ပတ်သတ်ပြီး ၊ တိုင်းမ်စ်အွန်လိုင်း မှ ဟာသပြက်လုံးတစ်ခု ကို မှတ်တမ်းတင်ရေးသားခဲ့သည် ။'

## 2. Data Preprocessing

### 2.1 Tokenization

In [76]:
# place holders
token_transform = {}
vocab_transform = {}

##### Source language (English) tokenization

In [77]:

from torchtext.data.utils import get_tokenizer
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language = 'en_core_web_sm')

In [78]:
sample[SRC_LANGUAGE]
token_transform[SRC_LANGUAGE](sample[SRC_LANGUAGE])

['The',
 'TimesOnline',
 'reports',
 'that',
 'the',
 'joke',
 'fell',
 'flat',
 'with',
 'Jeffrey',
 'Turner',
 ',',
 'who',
 'as',
 'Chief',
 'of',
 'Police',
 'in',
 'Clayton',
 'County',
 ',',
 'Georgia',
 ',',
 'put',
 'Mr',
 'Whitton',
 'on',
 'medical',
 'leave',
 'when',
 'he',
 'was',
 'shot',
 'in',
 'the',
 'wrist',
 'as',
 'he',
 'tried',
 'to',
 'foil',
 'a',
 'robbery',
 'earlier',
 'this',
 'summer',
 '.']

##### Target language (Myanmar) tokenization

In [80]:
"""

This code is updated version of this: https://gist.github.com/markdtw/e2a4e2ee7cef8ea6aed33bb47a97fba6
Ye Kyaw Thu, LST, NECTEC, Thailand updated followings:
-- added recursion limit
-- changed P_unigram and P_bigram as module level global variable
-- using binary ngram dictionary
--  set N value of this: "def __init__(self, datafile=None, unigram=True, N=102490):"
-- Last Updated: 5 Sept 2021

# References:
- Python implementation of Viterbi algorithm for word segmentation: 
- Updated version of this: https://gist.github.com/markdtw/e2a4e2ee7cef8ea6aed33bb47a97fba6
- A clean-up of this: http://norvig.com/ngrams/ch14.pdf
- For recursion limit: https://www.geeksforgeeks.org/python-handling-recursion-limit/
- A. Viterbi, "Error bounds for convolutional codes and an asymptotically optimum decoding algorithm," in IEEE Transactions on Information Theory, vol. 13, no. 2, pp. 260-269, April 1967, doi: 10.1109/TIT.1967.1054010.

"""

import math
import functools
import sys
import pickle

# viterbi() below is recursive, so the interpreter's recursion limit is
# raised well above its default before any text is segmented.
sys.setrecursionlimit(1_000_000)

# Pickled unigram / bigram dictionaries loaded through read_dict()
uni_dict_bin, bi_dict_bin = './data/unigram-word.bin', './data/bigram-word.bin'

def read_dict(fileDICT):
    """Load and return the object pickled in ``fileDICT``.

    Args:
        fileDICT: path of a binary pickle file. Callers in this notebook
            iterate the result with ``.items()``, so it is presumably a dict
            of n-gram counts -- confirm against the data files.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: if ``fileDICT`` does not exist. A short message is
            printed first, then the original error is re-raised. Previously
            the function fell through to ``return dictionary`` and failed
            with an unrelated UnboundLocalError.
    """
    # SECURITY: pickle.load can execute arbitrary code while unpickling.
    # Only load dictionary files that come from a trusted source.
    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(fileDICT, 'rb') as input_file:
            return pickle.load(input_file)
    except FileNotFoundError:
        print('Dictionary file', fileDICT, ' not found!')
        raise

class ProbDist(dict):
    """Probability distribution estimated from unigram/bigram count data.

    The instance itself is a dict mapping each key to its accumulated count.
    Calling the instance with a key returns ``count / N`` for stored keys and
    a fallback estimate (``unknownprob``) for keys that are not stored.
    """

    def __init__(self, datafile=None, unigram=True, N=102490):
        """Build the distribution, optionally loading counts from a file.

        Args:
            datafile: path handed to ``read_dict()``; the loaded mapping's
                ``(key, count)`` items are accumulated into this dict.
                ``None`` (the default) creates an empty distribution instead
                of trying to open ``None`` as a file.
            unigram: selects the estimate used for unknown keys. True uses
                ``10 / (N * 10**len(key))``, False uses ``1 / N``.
            N: denominator applied to every count and passed to
                ``unknownprob``.
        """
        if datafile is not None:
            for k, c in read_dict(datafile).items():
                self[k] = self.get(k, 0) + c

        if unigram:
            # Shrinks by a factor of 10 per extra character: avoid unknown long words
            self.unknownprob = lambda k, N: 10 / (N*10**len(k))
        else:
            self.unknownprob = lambda k, N: 1 / N

        self.N = N

    def __call__(self, key):
        """Return the probability estimate for ``key``."""
        if key in self:
            return self[key]/self.N
        else:
            return self.unknownprob(key, self.N)
        

# Module-level count distributions looked up by conditionalProb()
P_unigram = ProbDist(datafile=uni_dict_bin, unigram=True)
P_bigram = ProbDist(datafile=bi_dict_bin, unigram=False)


def conditionalProb(word_curr, word_prev):
    """Conditional probability of ``word_curr`` given ``word_prev``.

    When both the bigram ``"<word_prev> <word_curr>"`` and ``word_prev`` have
    stored counts, the result is the bigram count divided by the count of
    ``word_prev``. Otherwise it falls back to ``P_unigram(word_curr)``.
    """
    pair_key = word_prev + ' ' + word_curr
    if pair_key in P_bigram and word_prev in P_unigram:
        return P_bigram[pair_key] / P_unigram[word_prev]
    return P_unigram(word_curr)


@functools.lru_cache(maxsize=2**10)
def viterbi(text, prev='<S>', maxlen=20):
    """Segment ``text`` into the word sequence with the highest score.

    Every prefix of ``text`` up to ``maxlen`` characters is tried as the
    first word. Its score is ``log10(conditionalProb(first_word, prev))``,
    added to the best score of the recursively segmented remainder.

    Args:
        text: string to segment (no separators expected).
        prev: the word preceding ``text``; ``'<S>'`` marks sentence start.
        maxlen: maximum length, in characters, of any single word.

    Returns:
        A tuple ``(score, words)``, where ``score`` is the summed log10
        probability and ``words`` the list of segmented words. Empty
        ``text`` gives ``(0.0, [])``.
    """
    if not text:
        return 0.0, []

    textlen = min(len(text), maxlen)

    candidates = []
    for i in range(textlen):
        first_word, rest = text[:i + 1], text[i + 1:]
        first_prob = math.log10(conditionalProb(first_word, prev))
        # Pass maxlen down explicitly. Otherwise a caller-supplied limit
        # would bound only the first word and the rest would silently use
        # the default.
        rest_prob, rest_words = viterbi(rest, first_word, maxlen)
        candidates.append((first_prob + rest_prob, [first_word] + rest_words))

    # Tuples compare by score first, so max() picks the best-scoring split.
    return max(candidates)

In [81]:
def my_tokenizer(text):
    """Split ``text`` into word tokens using ``viterbi()``.

    Spaces already present in ``text`` are removed first, so the word
    boundaries are decided by ``viterbi()`` alone.

    Args:
        text: sentence to tokenize, or ``None``.

    Returns:
        A new list of token strings. ``None``, an empty string, or a string
        containing only whitespace gives ``[]``.

    Note:
        The words returned by ``viterbi()`` are used directly. The earlier
        implementation joined them with ``'|'`` and split on ``'|'`` again,
        which produced ``['']`` for empty input and empty or missing tokens
        whenever the text itself contained ``'|'``.
    """
    if text is None:
        return []

    compact = text.replace(" ", "").strip()  # remove spaces between words
    if not compact:
        return []

    _, words = viterbi(compact)
    # Return a copy: viterbi() is wrapped in lru_cache, so its result object
    # may be handed out again and must not be mutated by callers.
    return list(words)

In [83]:
sample

{'en': 'The TimesOnline reports that the joke fell flat with Jeffrey Turner, who as Chief of Police in Clayton County, Georgia, put Mr Whitton on medical leave when he was shot in the wrist as he tried to foil a robbery earlier this summer.',
 'my': 'ယခု နွေရာသီ အစောပိုင်း ကာလ ၌ လုယက်မှု တစ်ခု ကျူးလွန် ရန် ကြိုးစားခဲ့ သောကြောင့် လက်ကောက်၀တ် တွင် သေနတ်ကျည်မှန်ခဲ့သော မစ်စတာ ၀ှစ်တွန် ကို ကလေတွန် ကောင်တီ ၊ ဂျော်ဂျီယာပြည်နယ် မှ ၊ ရဲမှူးကြီး ဂျက်ဖရီ တာနာ မှ ဆေး ခွင့် ပေးတာနှင့် ပတ်သတ်ပြီး ၊ တိုင်းမ်စ်အွန်လိုင်း မှ ဟာသပြက်လုံးတစ်ခု ကို မှတ်တမ်းတင်ရေးသားခဲ့သည် ။'}

In [84]:
token_transform[TRG_LANGUAGE] = my_tokenizer

In [86]:
token_transform[TRG_LANGUAGE](sample[TRG_LANGUAGE])

['ယခု',
 'နွေရာသီ',
 'အစောပိုင်း',
 'ကာလ',
 '၌',
 'လုယက်',
 'မှု',
 'တစ်',
 'ခု',
 'ကျူးလွန်',
 'ရန်',
 'ကြိုးစား',
 'ခဲ့',
 'သော',
 'ကြောင့်',
 'လက်',
 'ကောက်',
 '၀',
 'တ်',
 'တွင်',
 'သေနတ်ကျည်',
 'မှန်',
 'ခဲ့',
 'သော',
 'မ',
 'စ်',
 'စ',
 'တာ',
 '၀ှစ်',
 'တွန်',
 'ကို',
 'က',
 'လေ',
 'တွန်',
 'ကောင်',
 'တီ',
 '၊',
 'ဂျော်ဂျီယာ',
 'ပြည်နယ်',
 'မှ',
 '၊',
 'ရဲမှူးကြီး',
 'ဂျက်',
 'ဖရီ',
 'တာ',
 'နာ',
 'မှ',
 'ဆေး',
 'ခွင့်',
 'ပေး',
 'တာ',
 'နှင့်',
 'ပတ်',
 'သတ်',
 'ပြီး',
 '၊',
 'တိုင်းမ်စ်',
 'အွန်လိုင်း',
 'မှ',
 'ဟာသ',
 'ပြက်လုံး',
 'တစ်',
 'ခု',
 'ကို',
 'မှတ်တမ်းတင်',
 'ရေး',
 'သား',
 'ခဲ့',
 'သည်',
 '။']

In [87]:
import copy

# Deep-copy so that later changes to `corpus` cannot alter `datasetENMY`.
corpus = copy.deepcopy(datasetENMY)

### 2.2 Numericalization

In [88]:
# Special symbols, listed in the order of the indices they must occupy so
# that they are inserted into the vocab at the matching positions.
special_symbols = ['<unk>', '<pad>', '<sos>', '<eos>']

# Index of each special symbol = its position in the list above
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = range(len(special_symbols))

In [91]:
def yield_tokens(data, language):
    """Yield one token list per sample in ``data``.

    Args:
        data: iterable of samples (for example ``corpus['train']``); each
            sample is indexed with ``language`` to get its text.
        language: key selecting both the text inside each sample and the
            tokenizer in ``token_transform``.

    Yields:
        The result of ``token_transform[language]`` applied to each text.
    """
    yield from (token_transform[language](entry[language]) for entry in data)

In [92]:
from torchtext.vocab import build_vocab_from_iterator

# Build one torchtext Vocab per language from the training split.
for ln in (SRC_LANGUAGE, TRG_LANGUAGE):
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(corpus['train'], ln),
        min_freq = 2,            # tokens seen fewer than 2 times are left out of the vocab
        specials = special_symbols,
        special_first = True,    # insert the special symbols at the beginning, not the end
    )
    # Set UNK_IDX as the default index, returned when a token is not found.
    # If not set, querying a missing token throws RuntimeError.
    vocab_transform[ln].set_default_index(UNK_IDX)