In [None]:
import numpy as np
import torch
import math
from torch import nn
import torch.nn.functional as F

In [None]:
def scaled_dot_product(q, k, v, mask=None):
    """Scaled dot-product attention.

    Scores are ``q @ k^T / sqrt(d_k)``, where ``d_k`` is the size of the last
    dimension of ``q``. When ``mask`` is given it is added to the scores
    before the softmax over the last dimension.

    Returns:
        ``(values, attention)``: the attention-weighted combination of ``v``
        and the attention weights themselves.
    """
    head_dim = q.size(-1)
    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(head_dim)
    if mask is not None:
        # Swap the first two axes so the mask broadcasts against the scores
        # from the right, add it, then restore the original axis order.
        scores = (scores.permute(1, 0, 2, 3) + mask).permute(1, 0, 2, 3)
    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, v), weights

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding.

    Calling the module (it takes no inputs) returns one row per position in
    ``[0, max_sequence_length)``. Columns alternate
    ``sin(pos / 10000^(i / d_model))`` and ``cos(pos / 10000^(i / d_model))``
    for every even index ``i`` below ``d_model``.
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        # Column vector of positions: (max_sequence_length, 1)
        positions = torch.arange(self.max_sequence_length).unsqueeze(1)
        even_indices = torch.arange(0, self.d_model, 2).float()
        wavelengths = torch.pow(10000, even_indices / self.d_model)
        angles = positions / wavelengths
        # Pair each sine with its cosine, then interleave the pairs per row
        interleaved = torch.stack((torch.sin(angles), torch.cos(angles)), dim=2)
        return interleaved.flatten(start_dim=1, end_dim=2)

In [None]:
class SentenceEmbedding(nn.Module):
    """Turn a batch of raw sentences into position-aware embeddings.

    Each sentence is tokenized one character at a time through
    ``language_to_index``, optionally wrapped in the start/end tokens, padded
    with the padding token up to ``max_sequence_length``, embedded, summed
    with the sinusoidal position encoding and passed through dropout.

    All tensors created here are placed on the device that holds the
    embedding weights, so the module keeps working wherever it is moved.
    """

    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token, end_token):
        """Convert a sequence of sentences into a padded index tensor.

        Args:
            batch: sequence of sentences (strings).
            start_token: when truthy, prepend the start-token index.
            end_token: when truthy, append the end-token index.

        Returns:
            Integer tensor with one row per sentence, located on the same
            device as the embedding weights.

        Raises:
            KeyError: if a character is missing from ``language_to_index``.
        """

        def tokenize(sentence):
            indices = [self.language_to_index[token] for token in sentence]
            if start_token:
                indices.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                indices.append(self.language_to_index[self.END_TOKEN])
            # Right-pad up to the fixed sequence length
            missing = self.max_sequence_length - len(indices)
            if missing > 0:
                indices.extend([self.language_to_index[self.PADDING_TOKEN]] * missing)
            return torch.tensor(indices)

        tokenized = torch.stack([tokenize(sentence) for sentence in batch])
        # Follow the module's own device instead of a notebook-level helper,
        # so tokens and embedding weights can never end up on different devices.
        return tokenized.to(self.embedding.weight.device)

    def forward(self, x, start_token, end_token): # sentence
        x = self.batch_tokenize(x, start_token, end_token)
        x = self.embedding(x)
        pos = self.position_encoder().to(x.device)
        x = self.dropout(x + pos)
        return x

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a single fused query/key/value projection."""

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask):
        batch_size, sequence_length, _ = x.size()
        per_head_shape = (batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        # (batch, seq, 3 * d_model) -> (batch, heads, seq, 3 * head_dim)
        projected = self.qkv_layer(x).reshape(per_head_shape).permute(0, 2, 1, 3)
        q, k, v = projected.chunk(3, dim=-1)
        values, _ = scaled_dot_product(q, k, v, mask)
        # Bring the heads back next to the features and concatenate them
        merged = values.permute(0, 2, 1, 3).reshape(
            batch_size, sequence_length, self.num_heads * self.head_dim
        )
        return self.linear_layer(merged)

In [None]:
class LayerNormalization(nn.Module):
    """Layer normalization over the trailing ``len(parameters_shape)`` axes.

    The input is standardised with the (biased) mean and variance taken over
    those axes, then scaled by the learnable ``gamma`` and shifted by the
    learnable ``beta``. ``eps`` is added to the variance before the square
    root.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        # -1, -2, ... : one axis per entry of parameters_shape
        trailing_axes = list(range(-1, -len(self.parameters_shape) - 1, -1))
        mean = inputs.mean(dim=trailing_axes, keepdim=True)
        centered = inputs - mean
        variance = centered.pow(2).mean(dim=trailing_axes, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.eps)
        return self.gamma * normalized + self.beta

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        # Expand to the hidden width, apply the non-linearity and dropout,
        # then project back to d_model.
        expanded = self.dropout(self.relu(self.linear1(x)))
        return self.linear2(expanded)

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, self_attention_mask):
        # Sub-layer 1: self-attention + residual + norm
        skip = x.clone()
        attended = self.dropout1(self.attention(x, mask=self_attention_mask))
        x = self.norm1(attended + skip)

        # Sub-layer 2: feed-forward + residual + norm
        skip = x.clone()
        transformed = self.dropout2(self.ffn(x))
        return self.norm2(transformed + skip)

In [None]:
class SequentialEncoder(nn.Sequential):
    """``nn.Sequential`` variant that passes the attention mask to every layer."""

    def forward(self, *inputs):
        # Expects exactly (x, self_attention_mask)
        hidden, self_attention_mask = inputs
        for layer in self:
            hidden = layer(hidden, self_attention_mask)
        return hidden

In [None]:
class Encoder(nn.Module):
    """Sentence embedding followed by a stack of ``num_layers`` encoder layers."""

    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN,
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(
            max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN
        )
        encoder_layers = [
            EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            for _ in range(num_layers)
        ]
        self.layers = SequentialEncoder(*encoder_layers)

    def forward(self, x, self_attention_mask, start_token, end_token):
        # x is the raw sentence batch; the embedding layer tokenizes it
        embedded = self.sentence_embedding(x, start_token, end_token)
        return self.layers(embedded, self_attention_mask)


In [None]:
class MultiHeadCrossAttention(nn.Module):
    """Multi-head encoder-decoder attention.

    Keys and values are projected from ``x`` and queries from ``y``. The two
    inputs may have different sequence lengths; the output has the sequence
    length of ``y``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model , 2 * d_model)
        self.q_layer = nn.Linear(d_model , d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, y, mask):
        """Attend from ``y`` (queries) over ``x`` (keys and values).

        Args:
            x: tensor of shape (batch, source_length, d_model).
            y: tensor of shape (batch, target_length, d_model).
            mask: additive mask forwarded to ``scaled_dot_product``, or None.

        Returns:
            Tensor of shape (batch, target_length, d_model).
        """
        batch_size, source_length, d_model = x.size()
        # Queries follow y's own length instead of assuming it equals x's
        target_length = y.size(1)

        kv = self.kv_layer(x)
        q = self.q_layer(y)
        kv = kv.reshape(batch_size, source_length, self.num_heads, 2 * self.head_dim)
        q = q.reshape(batch_size, target_length, self.num_heads, self.head_dim)
        kv = kv.permute(0, 2, 1, 3)
        q = q.permute(0, 2, 1, 3)
        k, v = kv.chunk(2, dim=-1)
        # The mask is applied whenever the caller supplies one
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, target_length, d_model)
        out = self.linear_layer(values)
        return out


In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention and feed-forward.

    Every sub-layer output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.encoder_decoder_attention = MultiHeadCrossAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.layer_norm3 = LayerNormalization(parameters_shape=[d_model])
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        # Sub-layer 1: self-attention over the decoder stream
        skip = y.clone()
        attended = self.dropout1(self.self_attention(y, mask=self_attention_mask))
        y = self.layer_norm1(attended + skip)

        # Sub-layer 2: attention over the encoder output x
        skip = y.clone()
        crossed = self.dropout2(self.encoder_decoder_attention(x, y, mask=cross_attention_mask))
        y = self.layer_norm2(crossed + skip)

        # Sub-layer 3: position-wise feed-forward
        skip = y.clone()
        transformed = self.dropout3(self.ffn(y))
        return self.layer_norm3(transformed + skip)


In [None]:
class SequentialDecoder(nn.Sequential):
    """``nn.Sequential`` variant for decoder layers.

    Every layer receives the same encoder output and masks; only the decoder
    stream ``y`` is threaded from one layer to the next.
    """

    def forward(self, *inputs):
        # Expects exactly (x, y, self_attention_mask, cross_attention_mask)
        encoder_output, hidden, self_attention_mask, cross_attention_mask = inputs
        for layer in self:
            hidden = layer(encoder_output, hidden, self_attention_mask, cross_attention_mask)
        return hidden

class Decoder(nn.Module):
    """Target-sentence embedding followed by a stack of ``num_layers`` decoder layers."""

    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN,
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(
            max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN
        )
        decoder_layers = [
            DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            for _ in range(num_layers)
        ]
        self.layers = SequentialDecoder(*decoder_layers)

    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token, end_token):
        # x is the encoder output; y is the raw target sentence batch
        embedded = self.sentence_embedding(y, start_token, end_token)
        return self.layers(x, embedded, self_attention_mask, cross_attention_mask)

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder model mapping English sentences to Bangla token logits.

    The encoder consumes the source batch, the decoder consumes the target
    batch together with the encoder output, and a final linear layer projects
    every decoder position to ``kn_vocab_size`` scores.
    """

    def __init__(self,
                d_model,
                ffn_hidden,
                num_heads,
                drop_prob,
                num_layers,
                max_sequence_length,
                kn_vocab_size,
                english_to_index,
                bangla_to_index,
                START_TOKEN,
                END_TOKEN,
                PADDING_TOKEN
                ):
        super().__init__()
        # Settings shared by both halves of the model
        shared = (d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length)
        special_tokens = (START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.encoder = Encoder(*shared, english_to_index, *special_tokens)
        self.decoder = Decoder(*shared, bangla_to_index, *special_tokens)
        self.linear = nn.Linear(d_model, kn_vocab_size)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def forward(self,
                x,
                y,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=True,
                dec_end_token=False):
        encoded = self.encoder(
            x,
            encoder_self_attention_mask,
            start_token=enc_start_token,
            end_token=enc_end_token,
        )
        decoded = self.decoder(
            encoded,
            y,
            decoder_self_attention_mask,
            decoder_cross_attention_mask,
            start_token=dec_start_token,
            end_token=dec_end_token,
        )
        # Per-position scores over the output vocabulary (not normalised)
        return self.linear(decoded)


In [None]:
# Parallel corpus files; they are read line by line further below and the two
# sides are paired by line index.
english_file = '/content/drive/MyDrive/Colab Notebooks/AdvanceDeepLearning/Dataset/original_corpus.en'
bangla_file = '/content/drive/MyDrive/Colab Notebooks/AdvanceDeepLearning/Dataset/original_corpus.bn'
# Special vocabulary entries. Each is a multi-character string stored as a
# single vocabulary item.
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

# Target-side vocabulary. START_TOKEN is first, PADDING_TOKEN and END_TOKEN are last.
# Sentences are tokenized one character at a time elsewhere in this notebook
# (list(sentence) in SentenceEmbedding.batch_tokenize, set(sentence) in is_valid_tokens),
# so an entry longer than one code point can never equal a single sentence character.
# NOTE(review): 'র' and 'ল' do not appear as standalone entries (only inside conjunct
# entries), and the combining signs (e.g. the hasanta and chandrabindu entries) look as
# if they are stored with a leading zero-width joiner - verify the raw code points.
# Either would explain why the filter below reports only 945 valid sentences out of 100000.
bangla_vocabulary = [START_TOKEN, ' ','|', '!', '¡','"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                      '০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯', ':', '<', '=', '>', '?', 'ˌ',
                      'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'ৠ',
                      'এ', 'ঐ', 'ও', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ',
                      'ড', 'ঢ', 'ণ', 'ত', 'থ',
                      'দ', 'ধ', 'ন', 'প', 'ফ',
                      'ব', 'ভ', 'ম', 'য', 'শ',
                      'ষ', 'স', 'হ', 'ড়', 'ঢ়',
                      'য়', 'ৎ', 'া','ি', 'ু', 'ৃ',
                      'ৢ', 'ে', 'ো', 'ী', 'ূ', 'ৈ', 'ৌ', '‍ঁ', 'ং', 'ঃ',
                      '‍্', '‍্য', '‍‍্র', 'ক্র', 'চ্ছ্ব', 'ক্ষ', 'ন্ট', 'দ্ধ', 'ন্ধ', 'ণ্ঠ', 'ভ্র', 'স্ত', 'হ্ন', 'ন্ত', 'ত্ব', 'ন্ব', 'ক্স', 'ম্ব', 'জ্ঞ', 'ষ্ট', 'ন্ত্র', 'ক্ত',
                      'ষ্ঠ', 'ন্ন', 'ত্ন', 'দ্দ', 'প্ত', 'হ্ম', 'ব্জ', 'ন্দ', 'ন্ড', 'ব্দ','ষ্ণ', PADDING_TOKEN, END_TOKEN]

# Source-side vocabulary. Only lower-case letters are listed because the English
# sentences are lower-cased when the corpus is loaded.
# NOTE(review): ';' is not listed, and the sentence filter further below only checks
# Bangla tokens, so an English sentence containing an unlisted character would raise
# KeyError during tokenization - confirm whether that is intended.
english_vocabulary = [START_TOKEN, ' ', '!', '¡','"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@',
                        '[', '\\', ']', '^', '_', '`',
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                        'y', 'z',
                        '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]


In [None]:
# Lookup tables in both directions (index -> token and token -> index) per language
index_to_bangla = dict(enumerate(bangla_vocabulary))
bangla_to_index = {token: index for index, token in enumerate(bangla_vocabulary)}
index_to_english = dict(enumerate(english_vocabulary))
english_to_index = {token: index for index, token in enumerate(english_vocabulary)}

In [None]:
# Limit Number of sentences
TOTAL_SENTENCES = 100000

# Read both sides of the parallel corpus. The encoding is stated explicitly so
# that decoding the Bangla text does not depend on the platform's default
# locale encoding.
with open(english_file, 'r', encoding='utf-8') as file:
    english_sentences = file.readlines()[:TOTAL_SENTENCES]
with open(bangla_file, 'r', encoding='utf-8') as file:
    bangla_sentences = file.readlines()[:TOTAL_SENTENCES]

# Drop the trailing newline; English is additionally lower-cased
english_sentences = [sentence.rstrip('\n').lower() for sentence in english_sentences]
bangla_sentences = [sentence.rstrip('\n') for sentence in bangla_sentences]

In [None]:
# Preview the first ten (lower-cased) English sentences
english_sentences[:10]

['he turned to look langdon in the eye.',
 'better o . guaranteeing that every individual will be free to do as he wishes',
 '"what do you say i tell you over dinner."',
 'when i was just about to say good-night to the assembly and to leave, a man came after me quickly and introduced himself.',
 'let me pass."',
 'now the camerlegno turned and addressed the remaining guards.',
 '"where is their taxi now?',
 "evidence for religion, commerce and social stratification. most researchers believe that these unprecedented accomplishments were the product of a revolution in sapiens' cognitive abilities.",
 'for decades, palaeontologists and zooarchaeologists - people who search for and study animal remains - have been combing the plains and mountains of the americas in search of the fossilised bones of ancient camels and the petri ed faeces of giant ground sloths.',
 'it represents a kind of school for adults.']

In [None]:
# Preview the first ten Bangla sentences
bangla_sentences[:10]

['সে ঘুরে ল্যাংডনের চোখের দিকে তাকায়।',
 'সব মানুষের মধ্যে সমতা আনতে হলে কারও না কারও স্বাধীনতায় হস্তক্ষেপ করতেই হবে।',
 'যাই জিজ্ঞেস কর না কেন, সব প্রশ্নের জবাব দিব ডিনারের পর।',
 'ঠিক যখন আমি সভাকে শুভরাত জানিয়ে বিদায় নিতে উদ্যত, একজন লোক সত্বর আমার কাছে এসে নিজের পরিচয় দেয়।',
 'যেতে দিন!',
 'এবার ক্যামারলেনগো ঘুরে দাঁড়াল আবার, তাকাল অন্য সৈনিকদের দিকে, জোয়ানগণ, আমি আর কোন প্রাণঘাতি ঘটনা দেখতে চাই না এই সন্ধ্যায়।',
 'তাদের ট্যাক্সি এখন কোথায়?',
 'বেশিরভাগ গবেষকই মনে করেন যে, এতসব গুরুত্বপূর্ণ আবিষ্কারের পেছনে নিশ্চয়ই সেপিয়েন্সদের বুদ্ধিবৃত্তিক দক্ষতার কোনো পরিবর্তন দায়ী।',
 'দশকের পর দশক ধরে এসব প্রাণীর জীবাশ্ম আর দেহাবশেষের খোঁজে দুই আমেরিকার পাহাড় ও সমতলে চষে বেড়াচ্ছেন বিশেষজ্ঞরা। যখনই তাঁরা কোনো কিছু খুঁজে পাচ্ছেন পরম যত্নে সেগুলো পাঠিয়ে দিচ্ছেন গবেষণাগারে।',
 'এটা প্রাপ্ত বয়স্কদের জন্য এক রকমের স্কুলও বলা চলে।']

In [None]:
import numpy as np
PERCENTILE = 97

# Character-length percentile of each side, used to choose a maximum sequence length
bangla_lengths = [len(sentence) for sentence in bangla_sentences]
bangla_cutoff = np.percentile(bangla_lengths, PERCENTILE)
print(f"{PERCENTILE}th percentile length Bangla: {bangla_cutoff}")

english_lengths = [len(sentence) for sentence in english_sentences]
english_cutoff = np.percentile(english_lengths, PERCENTILE)
print(f"{PERCENTILE}th percentile length English: {english_cutoff}")


97th percentile length Bangla: 207.0
97th percentile length English: 228.0


In [None]:
# Fixed sequence length used by the length filter below; it sits above the
# 97th-percentile character lengths printed by the previous cell (207 and 228).
max_sequence_length = 250

def is_valid_tokens(sentence, vocab):
    """Return True when every distinct character of ``sentence`` is in ``vocab``."""
    return all(token in vocab for token in set(sentence))

def is_valid_length(sentence, max_sequence_length):
    """Return True when ``sentence`` still fits after one extra token is added.

    One position is reserved for the end token, so the sentence must be
    strictly shorter than ``max_sequence_length - 1`` characters.
    """
    character_count = len(list(sentence))
    return character_count + 1 < max_sequence_length

# Sets make the per-character membership tests constant-time instead of a
# linear scan through the vocabulary lists.
bangla_vocabulary_set = set(bangla_vocabulary)
english_vocabulary_set = set(english_vocabulary)

# Keep a sentence pair only when BOTH sides fit the sequence length and BOTH
# sides consist solely of known characters. The English side is checked too,
# because tokenization looks every character up in english_to_index and would
# raise KeyError on an unknown one.
valid_sentence_indicies = []
for index in range(len(bangla_sentences)):
    bangla_sentence, english_sentence = bangla_sentences[index], english_sentences[index]
    if is_valid_length(bangla_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(bangla_sentence, bangla_vocabulary_set) \
      and is_valid_tokens(english_sentence, english_vocabulary_set):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(bangla_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 100000
Number of valid sentences: 945


In [None]:
# Keep only the sentence pairs whose index passed the validity filter
kept_pairs = [(bangla_sentences[i], english_sentences[i]) for i in valid_sentence_indicies]
bangla_sentences = [bangla for bangla, _ in kept_pairs]
english_sentences = [english for _, english in kept_pairs]

In [None]:
# Preview the first ten Bangla sentences that survived the validity filter
bangla_sentences[:10]

['যেতে দিন!',
 'কীভাবে?',
 'দেখ!',
 'তিন...',
 'টেনশন?',
 'কখনও!',
 'তাই নাকি?',
 'হুম!',
 'পিশাচ!',
 'দশ মিনিট?']

In [None]:
import torch

# Model and training hyperparameters
d_model = 512
batch_size = 64
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 5
max_sequence_length = 250
bn_vocab_size = len(bangla_vocabulary)

# Build the English -> Bangla model; the output layer covers the Bangla vocabulary
transformer = Transformer(
    d_model=d_model,
    ffn_hidden=ffn_hidden,
    num_heads=num_heads,
    drop_prob=drop_prob,
    num_layers=num_layers,
    max_sequence_length=max_sequence_length,
    kn_vocab_size=bn_vocab_size,
    english_to_index=english_to_index,
    bangla_to_index=bangla_to_index,
    START_TOKEN=START_TOKEN,
    END_TOKEN=END_TOKEN,
    PADDING_TOKEN=PADDING_TOKEN,
)

In [None]:
# Display the module tree of the freshly constructed model
transformer

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(72, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (attention): MultiHeadAt

In [None]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Parallel-corpus dataset yielding ``(english_sentence, bangla_sentence)`` pairs.

    Args:
        english_sentences: sequence of source sentences.
        bangla_sentences: sequence of target sentences, aligned by index with
            ``english_sentences``.

    Raises:
        ValueError: if the two sequences differ in length, since the pairs
            could then no longer be aligned by index.
    """

    def __init__(self, english_sentences, bangla_sentences):
        if len(english_sentences) != len(bangla_sentences):
            raise ValueError(
                "english_sentences and bangla_sentences must have the same length, "
                f"got {len(english_sentences)} and {len(bangla_sentences)}"
            )
        self.english_sentences = english_sentences
        self.bangla_sentences = bangla_sentences

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        return self.english_sentences[idx], self.bangla_sentences[idx]

In [None]:
# Wrap the filtered sentence pairs in a Dataset
dataset = TextDataset(english_sentences, bangla_sentences)

In [None]:
# Number of sentence pairs in the dataset
len(dataset)

945

In [None]:
# Inspect a single (English, Bangla) pair
dataset[2]

('"look!"', 'দেখ!')

In [None]:
# Batches are drawn in dataset order (no shuffling is requested)
train_loader = DataLoader(dataset, batch_size=batch_size)
iterator = iter(train_loader)

In [None]:
# Print the first five batches (batch numbers 0 through 4) as a sanity check
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num >= 4:
        break

[('let me pass."', 'but ... how?"', '"look!"', 'three ...', '"tense?"', 'ever!"', '"really?', '"hum!', '"a fiend!', 'ten minutes?', '"sophie!"', 'see?"', '"is there no one here?"', 'everyone is in danger?', 'now!', '"sophie?"', 'why?"', '"who?"', '"and ... that?"', 'or between china and japan?', '"d\'you know what that means?"', '"meaning?"', "'murder!'", '"what?"', '91 "here you are, holmes!', 'easy?', 'ha!', '"when?"', '"me!"', '"no!"', '"what!', 'why?', '"are you there?"', 's142 to s152 ...', 'here ...', '"no?"', '"what?', 'go!', '"someone\'s coming!"', '"ah!', '"and what?"', '"no, i did not.', '"what!', "what do you want?'", '"fire?', 'what\'s the address?"', '"look!', 'faces.15', "'the a.b.c.?", 'why?', 'trees!', '"oui?"', '"what?"', '"what\'s this?"', 'nuts?', 'when will i reach the corner?', 'are you okay?"', '"you or he?"', 'or of tokay?', 'exactly!', '"what?"', '"no, sir, never!"', '"no?', '"it is?"'), ('যেতে দিন!', 'কীভাবে?', 'দেখ!', 'তিন...', 'টেনশন?', 'কখনও!', 'তাই নাকি?', 

In [None]:
from torch import nn

# Per-token loss (no reduction). Positions whose label is the padding token
# are ignored when computing the loss.
criterian = nn.CrossEntropyLoss(
    ignore_index=bangla_to_index[PADDING_TOKEN],
    reduction='none',
)

# Xavier-initialise every parameter that has more than one dimension
weight_matrices = (p for p in transformer.parameters() if p.dim() > 1)
for params in weight_matrices:
    nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
NEG_INFTY = -1e9

def create_masks(eng_batch, bn_batch):
    """Build the additive attention masks for one batch of sentence pairs.

    A position counts as padding when its index is greater than the
    sentence's character length. Blocked entries hold ``NEG_INFTY`` and all
    other entries hold 0. The sequence length comes from the module-level
    ``max_sequence_length``.

    Returns:
        ``(encoder_self_attention_mask, decoder_self_attention_mask,
        decoder_cross_attention_mask)``, each of shape
        (num_sentences, max_sequence_length, max_sequence_length).
    """
    num_sentences = len(eng_batch)
    positions = torch.arange(max_sequence_length)

    def padding_positions(sentences):
        # (num_sentences, max_sequence_length): True where index > sentence length
        lengths = torch.tensor(
            [len(sentences[idx]) for idx in range(num_sentences)], dtype=torch.long
        )
        return positions.unsqueeze(0) > lengths.unsqueeze(1)

    def blocked_pairs(query_padding, key_padding):
        # Entry [b, i, j] is blocked when query i or key j is padding
        return query_padding.unsqueeze(2) | key_padding.unsqueeze(1)

    source_padding = padding_positions(eng_batch)
    target_padding = padding_positions(bn_batch)

    # Strict upper triangle: position i may not attend to any later position j > i
    look_ahead = positions.unsqueeze(0) > positions.unsqueeze(1)

    encoder_blocked = blocked_pairs(source_padding, source_padding)
    decoder_self_blocked = look_ahead | blocked_pairs(target_padding, target_padding)
    decoder_cross_blocked = blocked_pairs(target_padding, source_padding)

    encoder_self_attention_mask = torch.where(encoder_blocked, NEG_INFTY, 0)
    decoder_self_attention_mask = torch.where(decoder_self_blocked, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_cross_blocked, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [None]:
def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

In [None]:
transformer.train()
transformer.to(device)
total_loss = 0
num_epochs = 30

for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    iterator = iter(train_loader)
    for batch_num, batch in enumerate(iterator):
        # The sample decoding at the end of the loop body switches to eval
        # mode, so training mode is restored at the start of every step.
        transformer.train()
        eng_batch, bn_batch = batch
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, bn_batch)
        optim.zero_grad()
        # Decoder input is <START> + sentence + <END>
        bn_predictions = transformer(eng_batch,
                                     bn_batch,
                                     encoder_self_attention_mask.to(device),
                                     decoder_self_attention_mask.to(device),
                                     decoder_cross_attention_mask.to(device),
                                     enc_start_token=False,
                                     enc_end_token=False,
                                     dec_start_token=True,
                                     dec_end_token=True)
        # Labels are sentence + <END>, i.e. the decoder input shifted left by one
        labels = transformer.decoder.sentence_embedding.batch_tokenize(bn_batch, start_token=False, end_token=True)
        loss = criterian(
            bn_predictions.view(-1, bn_vocab_size).to(device),
            labels.view(-1).to(device)
        ).to(device)
        # Average the per-token loss over the non-padding labels only
        valid_indicies = torch.where(labels.view(-1) == bangla_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / valid_indicies.sum()
        loss.backward()
        optim.step()
        #train_losses.append(loss.item())
        if batch_num % 100 == 0:
            print(f"Iteration {batch_num} : {loss.item()}")
            print(f"English: {eng_batch[0]}")
            print(f"Bangla Translation: {bn_batch[0]}")
            # Teacher-forced prediction for the first sentence of the batch,
            # cut at the first predicted <END>
            bn_sentence_predicted = torch.argmax(bn_predictions[0], axis=1)
            predicted_sentence = ""
            for idx in bn_sentence_predicted:
                if idx == bangla_to_index[END_TOKEN]:
                    break
                predicted_sentence += index_to_bangla[idx.item()]
            print(f"Bangla Prediction: {predicted_sentence}")

            # Greedy decoding of a fixed probe sentence. Gradients are not
            # needed here, so autograd is disabled to avoid building a graph
            # for each of the (up to max_sequence_length) forward passes.
            transformer.eval()
            bn_sentence = ("",)
            eng_sentence = ("should we go to the mall?",)
            with torch.no_grad():
                for word_counter in range(max_sequence_length):
                    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, bn_sentence)
                    predictions = transformer(eng_sentence,
                                              bn_sentence,
                                              encoder_self_attention_mask.to(device),
                                              decoder_self_attention_mask.to(device),
                                              decoder_cross_attention_mask.to(device),
                                              enc_start_token=False,
                                              enc_end_token=False,
                                              dec_start_token=True,
                                              dec_end_token=False)
                    next_token_prob_distribution = predictions[0][word_counter] # not actual probs
                    next_token_index = torch.argmax(next_token_prob_distribution).item()
                    next_token = index_to_bangla[next_token_index]
                    bn_sentence = (bn_sentence[0] + next_token, )
                    if next_token == END_TOKEN:
                        break

            print(f"Evaluation translation (should we go to the mall?) : {bn_sentence}")
            print("-------------------------------------------")

Epoch 0
Iteration 0 : 5.764863014221191
English: let me pass."
Bangla Translation: যেতে দিন!
Bangla Prediction: |>>চ্ছ্বচ্ছ্বপ>চ্ছ্বচ্ছ্ব>জ্ঞচ্ছ্বন্ত্রন্ত্রন্ত্র৬দ্দজ্ঞন্ত্রজ্ঞজ্ঞমন্ত্রজ্ঞ%ন্ত্রজ্ঞজ্ঞন্ত্রচ্ছ্ব|ন্ত্রন্ত্রজ্ঞন্ত্রন্ত্রন্ত্রংউন্ত্রদ্দজ্ঞন্ত্রজ্ঞজ্ঞন্বন্ত্রধজ্ঞজ্ঞন্ত্রজ্ঞন্বস্তজ্ঞন্ত্রন্ত্রন্ত্রন্ত্রন্ত্রদ্দন্ত্রচ্ছ্বজ্ঞজ্ঞজ্ঞচ্ছ্বস্তজ্ঞদ্দস্তচ্ছ্ব%ন্বন্ত্রমন্ত্রন্ত্রজ্ঞমচ্ছ্বজ্ঞদ্দজ্ঞন্তন্বন্ত্রজ্ঞচ্ছ্বজ্ঞন্বন্ত্রজ্ঞ'স্তজ্ঞন্ত্রস্তস্তজ্ঞ+জ্ঞন্ত্রদ্দজ্ঞ>|ন্ত্রদ্দ|ন্ত্রজ্ঞস্তচ্ছ্বস্তজ্ঞজ্ঞন্ত্রন্ত্রচ্ছ্বচ্ছ্বন্ত্রন্বজ্ঞজ্ঞন্ত্রন্ত্রন্ত্রজ্ঞজ্ঞজ্ঞউজ্ঞন্ত্রচ্ছ্বজ্ঞজ্ঞস্তন্ত্রন্ত্রজ্ঞজ্ঞজ্ঞন্ত্রন্ত্র+জ্ঞজ্ঞজ্ঞদ্দন্ত্রজ্ঞজ্ঞ%স্তস্তজ্ঞজ্ঞজ্ঞন্ত্র%ন্ত্রন্ত্রন্ত্রন্ত্রন্বন্ত্রচ্ছ্বন্ত্রন্বন্ত্রন্ত্রস্তন্ত্রজ্ঞজ্ঞন্বদ্দজ্ঞচ্ছ্বন্ত্রদ্দন্ত্রজ্ঞজ্ঞন্ত্রন্ত্রজ্ঞন্ত্রচ্ছ্বন্ত্রজ্ঞজ্ঞদ্দক্রজ্ঞন্ত্রন্ত্রউন্ত্র|(মন্ত্রন্ত্রন্ত্রস্ত|ন্ত্রজ্ঞন্বদ্দমন্ত্রজ্ঞন্ত্রন্ত্রচ্ছ্বস্তন্ত্রজ্ঞদ্দজ্ঞ$চ্ছ্বজ্ঞজ্ঞজ্ঞদ্দজ্ঞজ্ঞন্ত্রজ্ঞন্ত্রন্ত্র$চ্ছ্বস্তন্বজ্ঞন্ত্রন্ত্রজ্ঞচ্ছ্বন্ত্রমন্ত্র$ন্ত্রস্ত
Evaluat

In [None]:
transformer.eval()

def translate(eng_sentence):
    """Greedily translate one English sentence with the global ``transformer``.

    Starting from an empty Bangla string, the model is run repeatedly and the
    highest-scoring token at the current position is appended, until the end
    token is produced or ``max_sequence_length`` tokens have been generated.

    Args:
        eng_sentence: the English sentence as a string.

    Returns:
        The generated Bangla string. When the end token was produced, its
        text (``END_TOKEN``) is the final part of the returned string.
    """
    # Make sure dropout is off even if training ran after this cell was defined
    transformer.eval()
    eng_sentence = (eng_sentence,)
    bn_sentence = ("",)
    # Inference only: skip autograd bookkeeping for every forward pass
    with torch.no_grad():
        for word_counter in range(max_sequence_length):
            encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, bn_sentence)
            predictions = transformer(eng_sentence,
                                      bn_sentence,
                                      encoder_self_attention_mask.to(device),
                                      decoder_self_attention_mask.to(device),
                                      decoder_cross_attention_mask.to(device),
                                      enc_start_token=False,
                                      enc_end_token=False,
                                      dec_start_token=True,
                                      dec_end_token=False)
            # Scores (not probabilities) for the token at the current position
            next_token_prob_distribution = predictions[0][word_counter]
            next_token_index = torch.argmax(next_token_prob_distribution).item()
            next_token = index_to_bangla[next_token_index]
            bn_sentence = (bn_sentence[0] + next_token, )
            if next_token == END_TOKEN:
                break
    return bn_sentence[0]

In [None]:
# Spot-check the model on a short question (greedy decoding via translate)
translation = translate("how are you?")
print(translation)

এটাই কেনে?<END>


In [None]:
# Spot-check: short imperative sentence
translation = translate("let me go")
print(translation)

কেই কে?<END>


In [None]:
# Spot-check: negated imperative sentence
translation = translate("do not come here")
print(translation)

এটাই কেনে?<END>


In [None]:
# Spot-check: declarative sentence
translation = translate("i will go to the market")
print(translation)

এটানে কেনেনে?<END>


In [None]:
# Spot-check: another question
translation = translate("where are you going?")
print(translation)

এটানে কেনে?<END>


In [None]:
# Spot-check: a phrase close to the first training pair ('let me pass."')
translation = translate("let me pass")
print(translation)

এটাই কেনে?<END>
