In [1]:
# import the required project notebooks and libraries
from ipynb.fs.full.batch import *
from ipynb.fs.full.vocab import *
from ipynb.fs.full.encoder_decoder import *
from torch import optim
import pickle
import glob

In [2]:
# Toy corpus: two English and two Bengali sentences taken from news articles.
# The English strings are lower-cased with .lower(); the Bengali strings are used as written.
# NOTE(review): the Bengali sentences do not appear to be translations of the
# English ones (the original note says they were picked at random) — confirm
# that this pairing is only meant to exercise the pipeline.
en_sentence = ["Bangladesh is yet to conduct an epidemiological surveillance which experts say is very important to understand the gravity of the coronavirus situation and will help take any policy decision in this regard".lower(), "They said such surveillance would have allowed health officials to suggest the government about the next course of action like when and how the ongoing shutdown should be relaxed or withdrawn which area is highly vulnerable and what measures to be taken".lower()]
bn_sentence = ["এক প্রশ্নের জবাবে আইনমন্ত্রী বলেন অধ্যাদেশ অনুসারে প্রয়োজনবোধে যেকোনো সময় ডিজিটাল পদ্ধতিতে মামলার শুনানি ও নিষ্পত্তি করতে অডিও ও ভিডিও কনফারেন্সের মাধ্যমে আদালতের কার্যক্রম পরিচালনা করা যাবে", "চলমান করোনাভাইরাস প্রাদুর্ভাবের কারণে গত মার্চ থেকে আপিল বিভাগ ও হাইকোর্ট বিভাগসহ দেশের সব আদালতের নিয়মিত কার্যক্রম বন্ধ রয়েছে চলমান বন্ধের সময় ভার্চুয়াল আদালত পরিচালনার অনুমতি দেওয়ার জন্য একাধিক আইনজীবী প্রধান বিচারপতিকে অনুরোধ করেছিলেন প্রধান বিচারপতি সৈয়দ মাহমুদ হোসেন আপিল"]

In [3]:
# Create one LangVocab per language, feed it every sentence of that language,
# then call sort_dict() on each vocabulary (LangVocab comes from the vocab notebook).
en_vocab = LangVocab('en')
bn_vocab = LangVocab('bn')

# Walk the sentence pairs themselves instead of a hard-coded range(2), so
# sentences added to the lists later are not silently left out of the vocabulary.
for en_text, bn_text in zip(en_sentence, bn_sentence):
    en_vocab.sentence_To_word(en_text)
    bn_vocab.sentence_To_word(bn_text)

en_vocab.sort_dict()
bn_vocab.sort_dict()

In [4]:
# Width of every embedding row built in this notebook.
hidden_size = 300

# Zero-filled embedding tables, one row per vocabulary token; the rows are
# overwritten below (special tokens) and in a later cell (regular words).
en_rows, bn_rows = en_vocab.num_token, bn_vocab.num_token
weights_en = np.zeros((en_rows, hidden_size))
weights_bn = np.zeros((bn_rows, hidden_size))

# Rows 0-3 are the special tokens [_sos, _eos, _unk, _pad]: each gets one
# normally distributed vector, and the same vector is stored in both tables.
for special_row in range(4):
    shared_vector = np.random.normal(scale=0.6, size=(1, hidden_size))
    weights_en[special_row] = shared_vector
    weights_bn[special_row] = shared_vector

In [5]:
# Load the pre-trained word-embedding dictionaries that were pickled to disk.
# Author's notes on the source data: GloVe word embeddings,
# English total tokens 400001, Bengali total tokens 178153.
#
# The dictionaries are deliberately NOT pre-initialised to {}: with an empty
# fallback, a failed load followed by running the next cell would silently
# treat every word as "not found" and random-initialise the whole table.
#
# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only run this cell on files you produced yourself or otherwise trust.
with open('dict_embed_weights/en_dict.txt', 'rb') as en_file:
    en_dict = pickle.load(en_file)
with open('dict_embed_weights/bn/bn_dict.txt', 'rb') as bn_file:
    bn_dict = pickle.load(bn_file)

In [6]:
def fill_pretrained_rows(weights, vocab, pretrained, first_row=4):
    """Fill `weights` in place with pre-trained vectors and count the hits.

    For every index from `first_row` up to `vocab.num_token - 1`, the token is
    read from `vocab.index_To_token` and looked up in `pretrained`. When the
    token is present its vector is copied into the matching row; otherwise the
    row is drawn from a normal distribution with scale 0.6.

    Args:
        weights: 2-D NumPy array with one row per vocabulary index.
        vocab: object exposing `num_token` and `index_To_token`.
        pretrained: mapping from token to its pre-trained vector.
        first_row: first index to fill; rows below it are left untouched
            (the default of 4 skips the special tokens set up earlier).

    Returns:
        Number of tokens that were found in `pretrained`.
    """
    found = 0
    row_width = weights.shape[1]
    for row in range(first_row, vocab.num_token):
        # Resolved outside the try block on purpose: a missing vocabulary
        # index is a vocabulary bug and must not be mistaken for an
        # out-of-vocabulary word.
        token = vocab.index_To_token[row]
        try:
            weights[row] = pretrained[token]
        except KeyError:
            # Word has no pre-trained vector: fall back to random values.
            weights[row] = np.random.normal(scale=0.6, size=(1, row_width))
        else:
            found += 1
    return found


# Count how many vocabulary words have a pre-trained vector, English first and
# then Bengali (the same order in which the random fallbacks are drawn).
words_found_en = fill_pretrained_rows(weights_en, en_vocab, en_dict)
words_found_bn = fill_pretrained_rows(weights_bn, bn_vocab, bn_dict)

print("Number of token in original : " + str(en_vocab.num_token))
print("Number of token found : " + str(words_found_en))
print("Number of token in original : " + str(bn_vocab.num_token))
print("Number of token found : " + str(words_found_bn))

Number of token in original : 63
Number of token found : 59
Number of token in original : 63
Number of token found : 58


In [7]:
# Build the batches. The batch max_length is the larger of the two
# vocabularies' max_length values.
length = max(en_vocab.max_length, bn_vocab.max_length)
batch_size = 1
batch = Batch(batch_size=batch_size, max_length=length, seed=1)
batches = batch.get_batches(
    in_lang=en_vocab,
    tar_lang=bn_vocab,
    in_sentences=en_sentence,
    tar_sentences=bn_sentence,
)

In [8]:
# Split the first batch into its input part (index 0) and target part (index 1).
first_batch = batches[0]
inputs, targets = first_batch[0], first_batch[1]

In [9]:
# Build the encoder and decoder; each model gets its own vocabulary size and
# the index of that vocabulary's '_pad' token.
encoder = LstmEncoder(in_vocab_size = en_vocab.num_token, word_dim=hidden_size, pad_idx = en_vocab.token_To_index['_pad'])
decoder = LstmDecoder(tar_vocab_size = bn_vocab.num_token, word_dim = hidden_size, pad_idx = bn_vocab.token_To_index['_pad'])

# learning rate
lr = 0.001

# Show one embedding row per model before the pre-trained vectors are loaded.
print("Before initialize pre-trained embedding vector encoder : " + str(encoder.word_embeds(torch.LongTensor([11]))))
print("Before initialize pre-trained embedding vector decoder : " + str(decoder.word_embeds(torch.LongTensor([12]))))

# Load the pre-trained embedding tables into the models.
encoder.load_embed_weights(weights_en)
decoder.load_embed_weights(weights_bn)

# Show the same rows again so the effect of the load is visible.
print("After initialize pre-trained embedding vector encoder : " + str(encoder.word_embeds(torch.LongTensor([11]))))
print("After initialize pre-trained embedding vector decoder : " + str(decoder.word_embeds(torch.LongTensor([12]))))

# Create the optimizers only after the embeddings are loaded. An optimizer
# keeps references to the parameter tensors that exist when it is constructed;
# load_embed_weights lives in the encoder_decoder notebook, and if it replaces
# the embedding module or its parameter, optimizers built earlier would keep
# updating the discarded tensors.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=lr)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=lr)

Before initialize pre-trained embedding vector encoder : tensor([[ 1.4147, -0.0854,  0.9175,  0.0181,  0.3739, -0.1630, -0.8463, -0.6528,
         -0.7771, -1.1314, -0.8318, -0.5424, -0.5617,  0.5578, -0.7622,  0.3717,
         -1.3642,  0.1076, -1.5307, -1.0777, -0.6573,  1.7305,  0.8068, -0.4060,
          0.3648,  1.2223, -1.8405, -0.5603, -1.0498, -2.3492, -0.7260, -0.8321,
          1.5374,  1.1163,  0.9164,  0.0442,  0.2131, -0.6654, -0.5569,  0.9620,
         -0.9159,  0.2355,  0.8549, -0.2426,  0.4443,  0.8974, -0.2509, -2.3011,
          0.3318,  0.0437, -0.4704, -0.1118, -0.0834, -2.3521,  0.0058,  0.5663,
          0.3209, -0.0513, -0.7264,  0.0960,  0.6562,  0.2912, -0.5171,  0.2734,
          0.5427, -0.3862,  1.1704, -0.4868,  1.1555,  1.2038, -0.5833,  0.7980,
          0.5164, -0.8668,  0.0736, -0.3150, -0.9301,  1.2166, -1.2124,  0.1488,
          0.5701,  1.7268,  0.0054, -0.2173, -0.4522, -1.1454, -0.0724,  0.9688,
         -1.0323, -0.1418,  0.4780,  1.2733,  1.7928

          1.2915e+00,  1.2496e-01, -2.5026e-01, -8.3615e-02, -3.9094e-01]])
After initialize pre-trained embedding vector decoder : tensor([[ 3.2509e-01, -8.6098e-01, -2.6855e-01, -4.1356e-01,  1.1385e-01,
         -3.1579e-01, -5.6923e-01,  9.3286e-02,  7.3572e-01,  5.4303e-01,
         -8.4033e-01, -5.2228e-01, -1.8866e-01, -5.1262e-01, -4.6871e-01,
          4.7990e-01, -1.7242e-01,  3.9985e-01,  3.2247e-01,  3.3591e-01,
          7.6855e-01, -3.4168e-01,  2.9821e-01, -1.4426e-01,  4.6129e-01,
         -4.1432e-01,  4.9753e-01,  5.1368e-01, -2.3591e-01,  4.2208e-01,
          1.2588e-01, -4.7551e-01,  8.5961e-01, -4.9544e-01, -2.1849e-01,
         -8.0713e-02,  1.5788e-01, -3.3049e-02,  2.0216e-02,  2.5371e-01,
         -2.4025e-01, -2.0854e-01,  3.5700e-01,  2.3532e-01,  2.8633e-02,
         -4.2727e-01, -5.5547e-01,  2.5212e-01,  8.4813e-02,  5.9048e-01,
          2.7492e-01,  2.9421e-01,  1.8115e-01,  3.2985e-01, -7.1027e-02,
          5.8751e-01, -1.9190e-01,  3.4065e-02,  2.478

In [44]:
# One manual training step: encode the batch, greedily decode each sentence,
# average the per-sentence NLL loss, back-propagate and update both models.
encoder.train()
decoder.train()

# initialize encoder hidden state (sized by the number of sentences) and encode
hidden = encoder.initialize_hidden((len(en_sentence)))
e_out, e_hidden = encoder(torch.from_numpy(inputs).long(), hidden)

# print one embedding row per model before the update
print("Before training encoder : " + str(encoder.word_embeds(torch.LongTensor([21]))))
print("Before training decoder : " + str(decoder.word_embeds(torch.LongTensor([23]))))

# Clear gradients left over from any previous step; PyTorch accumulates
# gradients across backward() calls unless they are reset.
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

loss = 0
for i in range(len(en_sentence)):

    # Seed the decoder with the encoder's final (h, c) state for sentence i,
    # reshaped to (1, 1, -1).
    decoder_hidden = (e_hidden[0][0][i].view(1, 1, -1), e_hidden[1][0][i].view(1, 1, -1))

    # Decoding starts from the start-of-sentence token.
    decoder_input = torch.LongTensor([bn_vocab.token_To_index['_sos']])
    output = []
    for step in range(length):
        # Call the module itself rather than .forward(): Module.__call__ runs
        # any registered hooks, which a direct .forward() call skips.
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        output.append(decoder_output.view(1, -1))

        # Greedy decoding: the highest-scoring token becomes the next input.
        # Going through .item() feeds it back as a plain index, so no gradient
        # flows through this choice.
        top_index = decoder_output.topk(1)[1]
        decoder_input = torch.tensor([top_index.item()])

    # Loss of one sentence: all decoding steps against the target indices.
    # presumably decoder_output holds log-probabilities, as nll_loss expects — verify in LstmDecoder
    loss += F.nll_loss(torch.cat(output), torch.from_numpy(targets[i]).long())
loss = loss / len(en_sentence)

# backward gradient
loss.backward()

# update parameters
encoder_optimizer.step()
decoder_optimizer.step()

# Print the same embedding rows after the update.
# NOTE(review): in the saved output the encoder row looks unchanged after the
# step; that would be expected if token 21 is not in this batch or if the
# embedding is frozen — confirm which applies.
print("After training encoder : " + str(encoder.word_embeds(torch.LongTensor([21]))))
print("After training decoder : " + str(decoder.word_embeds(torch.LongTensor([23]))))

Before training encoder : tensor([[-1.9563e-01,  4.8127e-01, -2.4670e-01,  1.2751e-01,  3.0997e-01,
         -5.7585e-02,  1.9660e-01,  2.4947e-01,  1.4405e-01, -2.1063e+00,
          3.0094e-01, -1.1903e-01, -2.7087e-01,  2.6418e-01, -2.8135e-02,
         -9.6218e-03, -6.3701e-02, -3.6602e-02, -3.9539e-01, -1.9931e-01,
         -1.3172e-01,  4.1708e-01,  3.8747e-01, -2.4867e-01, -4.0963e-01,
          3.0117e-01, -6.6700e-02, -4.0928e-01, -1.8198e-01,  4.1716e-01,
          5.0072e-01,  4.1207e-01, -2.6712e-01, -1.8796e-01, -8.7328e-01,
         -4.6190e-01,  1.1255e-01,  1.9181e-01,  4.0320e-01, -1.4160e-01,
         -1.8509e-01, -2.7491e-01, -2.1240e-01,  1.8191e-01, -1.3362e-01,
          2.5592e-01, -1.6499e-01,  9.0990e-02, -1.2049e-01,  3.3701e-01,
          1.2667e-01, -5.2719e-01,  1.3359e-01,  1.6317e-01, -1.3175e-02,
          3.6707e-01,  3.0000e-01,  3.5971e-01,  1.9285e-01,  9.2912e-03,
          9.4887e-02,  3.2849e-01,  1.5767e-01,  2.4928e-01, -1.3388e-01,
         -4.

After training encoder : tensor([[-1.9563e-01,  4.8127e-01, -2.4670e-01,  1.2751e-01,  3.0997e-01,
         -5.7585e-02,  1.9660e-01,  2.4947e-01,  1.4405e-01, -2.1063e+00,
          3.0094e-01, -1.1903e-01, -2.7087e-01,  2.6418e-01, -2.8135e-02,
         -9.6218e-03, -6.3701e-02, -3.6602e-02, -3.9539e-01, -1.9931e-01,
         -1.3172e-01,  4.1708e-01,  3.8747e-01, -2.4867e-01, -4.0963e-01,
          3.0117e-01, -6.6700e-02, -4.0928e-01, -1.8198e-01,  4.1716e-01,
          5.0072e-01,  4.1207e-01, -2.6712e-01, -1.8796e-01, -8.7328e-01,
         -4.6190e-01,  1.1255e-01,  1.9181e-01,  4.0320e-01, -1.4160e-01,
         -1.8509e-01, -2.7491e-01, -2.1240e-01,  1.8191e-01, -1.3362e-01,
          2.5592e-01, -1.6499e-01,  9.0990e-02, -1.2049e-01,  3.3701e-01,
          1.2667e-01, -5.2719e-01,  1.3359e-01,  1.6317e-01, -1.3175e-02,
          3.6707e-01,  3.0000e-01,  3.5971e-01,  1.9285e-01,  9.2912e-03,
          9.4887e-02,  3.2849e-01,  1.5767e-01,  2.4928e-01, -1.3388e-01,
         -4.9