In [156]:
import torch
from torch import nn

In [157]:
# Hyperparameters shared by the cells below.
batch_size, num_heads = 64, 8   # not used in the visible cells; presumably batch size / attention heads
max_seq_len = 128               # every tokenized sentence is padded to this many positions
d_model = 512                   # width of the embedding and positional-encoding vectors
dropout = 0.1                   # drop probability read by SentenceEmbedding's nn.Dropout

## Positional Encoding

In [158]:
class PostionalEncoding(nn.Module):
    """Fixed (non-learned) sinusoidal positional encoding.

    For position ``pos`` and even column index ``i`` the table holds
    ``sin(pos / 10000 ** (i / d_model))`` in column ``i`` and
    ``cos(pos / 10000 ** (i / d_model))`` in column ``i + 1``.

    The table depends only on the constructor arguments, so it is computed
    once and stored in the non-persistent buffer ``encoding``: it follows
    ``.to(device)`` / ``.cuda()`` but is not written to ``state_dict()``.

    Args:
        max_seq_len: number of positions (rows of the table).
        d_model: encoding width (columns of the table).
    """

    def __init__(self, max_seq_len, d_model):
        super().__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len
        # Previously an unused all-zeros tensor; now the real, cached table.
        self.register_buffer(
            "encoding",
            self._build_table(max_seq_len, d_model),
            persistent=False,
        )

    @staticmethod
    def _build_table(max_seq_len, d_model):
        """Return the ``(max_seq_len, d_model)`` sin/cos table."""
        even_index = torch.arange(0, d_model, 2).float()
        denominator = torch.pow(10000, even_index / d_model)
        position = torch.arange(0, max_seq_len).unsqueeze(1)
        angles = position / denominator  # one column per sin/cos pair
        # Stacking on a new last axis and flattening interleaves the values
        # as sin, cos, sin, cos, ... along the feature dimension.
        interleaved = torch.stack([torch.sin(angles), torch.cos(angles)], dim=2)
        table = torch.flatten(interleaved, start_dim=1, end_dim=2)
        # An odd d_model produces one surplus cos column; drop it so the
        # result is always exactly d_model wide.
        return table[:, :d_model]

    def forward(self):
        """Return the cached ``(max_seq_len, d_model)`` encoding table.

        The same tensor object is returned on every call; callers should not
        modify it in place.
        """
        return self.encoding

In [159]:
# Special tokens shared by both vocabularies.
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

# Character-level Arabic vocabulary: base letters, hamza/alef forms, punctuation,
# Arabic-Indic digits and extended Arabic-script letters/marks.
# Every entry must be unique: the reverse lookup (token -> index) collapses
# duplicates, which would make its size smaller than the largest index.
# The former second 'ئ' was a duplicate and is replaced by the otherwise
# missing letter 'ى', so all other positions are unchanged.
# NOTE(review): this list contains no whitespace character, so a sentence with
# spaces cannot be looked up — confirm whether ' ' should be added.
arabic_voc = [START_TOKEN,'ا','ب','ت','ث','ج','ح','خ','د','ذ','ر','ز','س','ش','ص','ض','ط','ظ','ع','غ','ف','ق','ك','ل','م','ن','ه','و','ي','ء','آ','أ','ؤ','إ','ئ','ى','ة','ـ','،','؛','؟','٠','١','٢','٣','٤','٥','٦','٧','٨','٩','٪','٫','٬','٭','ٮ','ٯ','ٰ','ٱ','ٲ','ٳ','ٴ','ٵ','ٶ','ٷ','ٸ','ٹ','ٺ','ٻ','ټ','ٽ','پ','ٿ','ڀ','ځ','ڂ','ڃ','ڄ','څ','چ','ڇ','ڈ','ډ','ڊ','ڋ','ڌ','ڍ','ڎ','ڏ','ڐ','ڑ','ڒ','ړ','ڔ','ڕ','ږ','ڗ','ژ','ڙ','ښ','ڛ','ڜ','ڝ','ڞ','ڟ','ڠ','ڡ','ڢ','ڣ','ڤ','ڥ','ڦ','ڧ','ڨ','ک','ڪ','ګ','ڬ','ڭ','ڮ','گ','ڰ','ڱ','ڲ','ڳ','ڴ','ڵ','ڶ','ڷ','ڸ','ڹ','ں','ڻ','ڼ','ڽ','ھ','ڿ','ۀ','ہ','ۂ','ۃ','ۄ','ۅ','ۆ','ۇ','ۈ','ۉ','ۊ','ۋ','ی','ۍ','ێ','ۏ','ې','ۑ','ے','ۓ','۔','ە','ۖ','ۗ','ۘ','ۙ','ۚ','ۛ','ۜ','۝','۞','۟','۠','ۡ','ۢ','ۣ','ۤ','ۥ','ۦ','ۧ',PADDING_TOKEN,END_TOKEN]
# Character-level English vocabulary: ASCII letters, digits, punctuation and whitespace.
english_voc = [START_TOKEN, 'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','0','1','2','3','4','5','6','7','8','9','!','"','#','$','%','&',"'",'(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[','\\',']','^','_','`','{','|','}','~',' ','\t','\n','\r','\x0b','\x0c',PADDING_TOKEN,END_TOKEN]

In [160]:
# Build index <-> token lookup tables for each language.
# dict.fromkeys drops repeated tokens while keeping first-seen order, so the
# indices stay contiguous and each pair of tables is an exact inverse of the
# other: len(token_to_index) == largest index + 1. A repeated vocabulary entry
# would otherwise leave the reverse table one entry short, pushing the last
# token's index past the embedding size derived from it.
index_to_english = dict(enumerate(dict.fromkeys(english_voc)))
english_to_index = {token: index for index, token in index_to_english.items()}
index_to_arabic = dict(enumerate(dict.fromkeys(arabic_voc)))
arabic_to_index = {token: index for index, token in index_to_arabic.items()}

## Sentence Embedding

In [161]:
class SentenceEmbedding(nn.Module):
    """Character-level tokenizer, embedding and positional encoding in one module.

    ``forward`` takes a batch of raw strings, converts each character to its
    vocabulary index, pads every sentence to ``max_seq_len``, embeds the
    indices, adds the positional encoding and applies dropout.

    Args:
        language_to_index: mapping from token (single character or special
            token string) to integer index; its length is the vocabulary size.
        max_seq_len: fixed length of every tokenized sentence.
        d_model: embedding width.
        dropout_p: dropout probability. When ``None`` (default) the
            notebook-level ``dropout`` setting is used, as before.
    """

    def __init__(self, language_to_index, max_seq_len, d_model, dropout_p=None):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_seq_len = max_seq_len
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PostionalEncoding(max_seq_len, d_model)
        # An explicit argument removes the hidden dependency on the global
        # `dropout`; the global is only consulted when nothing is passed.
        self.dropout = nn.Dropout(p=dropout if dropout_p is None else dropout_p)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def _tokenize(self, sentence, start_token=True, end_token=True):
        """Convert one sentence into a 1-D index tensor of length ``max_seq_len``.

        Raises:
            KeyError: if a character is not in ``language_to_index``.
        """
        vocab = self.language_to_index
        # Look up every character first so unknown characters are always reported.
        ids = [vocab[char] for char in sentence]
        # Reserve room for the special tokens, then cut over-long sentences so
        # every row has the same length (required by torch.stack and by the
        # positional-encoding addition in forward).
        reserved = int(bool(start_token)) + int(bool(end_token))
        ids = ids[:max(self.max_seq_len - reserved, 0)]
        if start_token:
            ids.insert(0, vocab[self.START_TOKEN])
        if end_token:
            ids.append(vocab[self.END_TOKEN])
        ids = ids[:self.max_seq_len]
        ids.extend([vocab[self.PADDING_TOKEN]] * (self.max_seq_len - len(ids)))
        return torch.tensor(ids, dtype=torch.long)

    def batch_tokenize(self, batch, start_token=True, end_token=True):
        """Tokenize a sequence of sentences.

        Args:
            batch: sequence of strings.
            start_token: prepend the START token to every sentence.
            end_token: append the END token to every sentence.

        Returns:
            Integer tensor of shape ``(len(batch), max_seq_len)``; sentences
            that do not fit are truncated, shorter ones are padded.
        """
        rows = [self._tokenize(sentence, start_token, end_token) for sentence in batch]
        if not rows:
            # torch.stack rejects an empty list; return an empty batch instead.
            return torch.empty((0, self.max_seq_len), dtype=torch.long)
        return torch.stack(rows)

    def forward(self, x, start_token=True, end_token=True):
        """Embed a batch of raw sentences.

        Returns:
            Tensor of shape ``(len(x), max_seq_len, d_model)``.
        """
        # Token ids are created on the CPU; move them (and the positional
        # table) to wherever the embedding weights live.
        device = self.embedding.weight.device
        tokens = self.batch_tokenize(x, start_token=start_token, end_token=end_token).to(device)
        embedded = self.embedding(tokens)
        pos = self.position_encoder().to(device)
        return self.dropout(embedded + pos)

In [162]:
# Smoke test: embed a single English sentence and print the resulting tensor.
# The values differ between runs (random embedding weights, active dropout).
tokenizer = SentenceEmbedding(
    language_to_index=english_to_index,
    max_seq_len=max_seq_len,
    d_model=d_model,
)
print(tokenizer(["hello world"]))

tensor([[[-0.4711,  3.5255,  0.9527,  ...,  0.0000,  0.0000,  0.7886],
         [-0.5839,  0.3850,  1.0791,  ...,  2.7149, -1.7831,  1.4967],
         [ 1.2782, -1.5748, -0.3536,  ...,  0.2524, -0.5312,  0.0000],
         ...,
         [-0.0691,  0.0000,  3.5747,  ...,  0.1444,  0.2052,  0.5213],
         [ 0.0000,  1.0412,  3.4578,  ...,  0.1444,  0.2053,  0.5213],
         [ 1.6961,  0.2505,  0.0000,  ...,  0.1444,  0.2054,  0.5213]]],
       grad_fn=<MulBackward0>)
