In [602]:
import torch
import torch.nn as nn
import numpy as np
import math
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import Dataset, DataLoader

In [603]:
# Global hyperparameters. Several classes below read these directly as
# notebook globals rather than receiving them as constructor arguments.
# Sentences per batch (used as a global in the attention reshapes).
batch_size = 32
# Fixed length every tokenized sentence is padded to.
max_seq_len = 45
# Feature size fed into the attention projection layers.
input_dim = 512
# Model / embedding width.
d_model = 512
# Number of attention heads; each head is d_model // num_heads wide.
num_heads = 8
# Hidden width of the position-wise feed-forward block.
ffn_hidden = 2048
# Number of stacked encoder / decoder layers.
num_layers = 6
# Dropout probability used by the dropout layers below.
dropout_rate = 0.2

## Multihead Self Attention

In [604]:
def scaled_dot_product_attention(q,k,v,mask=None):
    """Compute softmax(q @ k^T / sqrt(d_k)) @ v.

    Args:
        q, k, v: tensors whose last two dimensions are (sequence, features);
            any leading dimensions must be broadcast-compatible.
        mask: controls which key positions each query may attend to.
            * ``None`` / ``False`` - no masking.
            * ``True`` - causal (look-ahead) mask: position i only sees
              positions <= i.
            * bool tensor - ``True`` entries are blocked.
            * float tensor - added to the scores (e.g. 0 / -1e9 masks).
            A 3-D tensor mask combined with 4-D scores is interpreted as
            (batch, seq_q, seq_k) and broadcast over the head dimension.

    Returns:
        Tensor shaped like ``q`` with the last dimension taken from ``v``.
    """
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is True:
        # Build the causal mask once for the trailing (seq_q, seq_k) dims, on
        # the same device as the scores, and let broadcasting cover the rest.
        blocked = torch.ones(scores.shape[-2:], dtype=torch.bool, device=scores.device)
        blocked = torch.triu(blocked, diagonal=1)
        scores = scores.masked_fill(blocked, float('-inf'))
    elif torch.is_tensor(mask):
        # Previously a tensor mask was silently ignored.
        if mask.dim() == 3 and scores.dim() == 4:
            mask = mask.unsqueeze(1)  # insert the head dimension
        mask = mask.to(scores.device)
        if mask.dtype == torch.bool:
            scores = scores.masked_fill(mask, float('-inf'))
        else:
            scores = scores + mask.to(scores.dtype)

    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, v)

class Multihead_Self_Attention(nn.Module):
    """Multi-head self-attention with a fused q/k/v projection.

    Args:
        input_dim: feature size of the incoming tensor.
        d_model: total width of the attention output (split across heads).
        num_heads: number of attention heads; must divide ``d_model``.
    """

    def __init__(self,input_dim, d_model, num_heads):
        super(Multihead_Self_Attention, self).__init__()
        if d_model % num_heads != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by num_heads ({num_heads})")
        self.input_dim = input_dim
        self.model_dim = d_model
        self.num_heads = num_heads
        self.head_dim = self.model_dim // self.num_heads
        self.qkv_layer = nn.Linear(input_dim, 3 * self.model_dim)
        self.concat_layer = nn.Linear(self.model_dim, self.model_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self,x,mask=None):
        """Attend over ``x`` of shape (batch, seq_len, input_dim).

        ``mask`` is forwarded unchanged to ``scaled_dot_product_attention``.
        Returns a tensor of shape (batch, seq_len, d_model).
        """
        # Read the sizes from the input instead of the notebook globals so a
        # short final batch or a different sequence length still works.
        n_batch, seq_len, _ = x.size()
        qkv = self.qkv_layer(x)
        qkv = qkv.view(n_batch, seq_len, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)            # (batch, heads, seq, 3*head_dim)
        q, k, v = qkv.chunk(3, dim=-1)
        attended = scaled_dot_product_attention(q, k, v, mask)
        # Move heads back next to head_dim before merging; reshaping the
        # (batch, heads, seq, head_dim) tensor directly would interleave
        # different heads and positions into the same output row.
        attended = attended.permute(0, 2, 1, 3).reshape(n_batch, seq_len, self.model_dim)
        out = self.concat_layer(attended)
        return out

## Multihead Cross Attention

In [605]:
class Multihead_Cross_Attention(nn.Module):
    """Multi-head attention mixing two input streams ``x`` and ``y``.

    Queries and keys are projected from ``x``; values are projected from ``y``.

    NOTE(review): the conventional encoder-decoder attention takes the query
    from the decoder stream and keys/values from the encoder stream. The
    projection layout here is kept as originally written - confirm whether
    the q/k-from-x, v-from-y arrangement is intended.

    Args:
        input_dim: feature size of both incoming tensors.
        model_dim: total width of the attention output (split across heads).
        num_heads: number of attention heads; must divide ``model_dim``.
    """

    def __init__(self,input_dim, model_dim, num_heads):
        super(Multihead_Cross_Attention, self).__init__()
        if model_dim % num_heads != 0:
            raise ValueError(f"model_dim ({model_dim}) must be divisible by num_heads ({num_heads})")
        self.input_dim = input_dim
        self.model_dim = model_dim
        self.num_heads = num_heads
        self.head_dim = model_dim // num_heads
        self.qk_layer = nn.Linear(input_dim, 2 * model_dim)
        self.v_layer = nn.Linear(input_dim, model_dim)
        self.concat_layer = nn.Linear(model_dim, model_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self,x,y,mask=None):
        """Return attention output of shape (batch, seq_len of x, model_dim).

        ``x`` and ``y`` are (batch, seq_len, input_dim) tensors; because the
        attention weights (built from ``x``) are applied to values from
        ``y``, both must share the same sequence length.
        """
        # Use the tensors' own sizes and this module's head count rather than
        # the notebook-level globals.
        n_batch, x_len, _ = x.size()
        y_len = y.size(1)
        qk = self.qk_layer(x).view(n_batch, x_len, self.num_heads, 2 * self.head_dim)
        v = self.v_layer(y).view(n_batch, y_len, self.num_heads, self.head_dim)
        qk = qk.permute(0, 2, 1, 3)              # (batch, heads, seq, 2*head_dim)
        v = v.permute(0, 2, 1, 3)                # (batch, heads, seq, head_dim)
        q, k = qk.chunk(2, dim=-1)
        attended = scaled_dot_product_attention(q, k, v, mask)
        # Undo the head/sequence swap before merging heads; a direct reshape
        # would interleave different heads and positions.
        attended = attended.permute(0, 2, 1, 3).reshape(n_batch, x_len, self.model_dim)
        out = self.concat_layer(attended)
        return out

## Positional Encoding

In [606]:
class PostionalEncoding(nn.Module):
    """Sinusoidal positional encoding table.

    Even feature columns hold sin(pos / 10000^(i / d_model)) and odd columns
    hold the matching cosine, for pos in [0, max_seq_len).

    Args:
        max_seq_len: number of positions (rows) to generate.
        d_model: number of feature columns.
    """

    def __init__(self,max_seq_len,d_model):
        super(PostionalEncoding,self).__init__()
        self.max_seq_len = max_seq_len
        self.d_model = d_model
        self.encoding = torch.zeros(self.max_seq_len,self.d_model)

    def forward(self,x=None):
        """Return the (max_seq_len, d_model) encoding table.

        Args:
            x: optional tensor; only its device is used, so the table is
               created where it will be added. It may be omitted, which
               keeps argument-less calls working.
        """
        device = x.device if torch.is_tensor(x) else None
        even_index = torch.arange(0, self.d_model, 2, device=device).float()
        denominator = torch.pow(10000, even_index / self.d_model)
        position = torch.arange(0, self.max_seq_len, device=device).unsqueeze(1)
        PE_even = torch.sin(position / denominator)
        PE_odd = torch.cos(position / denominator)
        # Interleave sin/cos columns: [sin0, cos0, sin1, cos1, ...].
        stacked = torch.stack([PE_even, PE_odd], dim=2)
        PE_flatten = torch.flatten(stacked, start_dim=1, end_dim=2)
        # For an odd d_model the interleave yields one extra column; trim it.
        return PE_flatten[:, :self.d_model]

## Normalization Layer

In [607]:
class NormalizationLayer(nn.Module):
    """Layer normalisation over the trailing ``len(parameter_dim)`` axes.

    The input is centred by its mean, divided by (std + eps) using
    ``Tensor.std``'s default estimator, then scaled by the learnable
    ``gamma`` and shifted by the learnable ``beta``.
    """

    def __init__(self, parameter_dim):
        super().__init__()
        self.parameters_shape = parameter_dim
        self.eps = 1e-6
        self.gamma = nn.Parameter(torch.ones(parameter_dim))
        self.beta = nn.Parameter(torch.zeros(parameter_dim))

    def forward(self, x):
        """Normalise ``x`` across its last ``len(parameters_shape)`` axes."""
        n_trailing = len(self.parameters_shape)
        reduce_axes = [-axis for axis in range(1, n_trailing + 1)]
        centred = x - x.mean(reduce_axes, keepdim=True)
        spread = x.std(reduce_axes, keepdim=True) + self.eps
        return self.gamma * centred / spread + self.beta

## Position-wise Feed-Forward

In [608]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output feature size.
        ffn_hidden: width of the hidden layer.
        drop_prob: dropout probability. When omitted, the notebook-level
            ``dropout_rate`` is used, which matches the previous behaviour.
    """

    def __init__(self,d_model,ffn_hidden,drop_prob=None):
        super(FeedForward,self).__init__()
        self.d_model = d_model
        self.ffn_hidden = ffn_hidden
        self.layer1 = nn.Linear(self.d_model,self.ffn_hidden)
        self.layer2 = nn.Linear(self.ffn_hidden,self.d_model)
        # Fall back to the global only when the caller gave no explicit value.
        self.dropout = nn.Dropout(p=dropout_rate if drop_prob is None else drop_prob)

    def forward(self,x):
        """Apply the block to the last dimension of ``x`` (size d_model)."""
        x = self.layer1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.layer2(x)
        return x

## Encoder

In [609]:
class Encoder_Layer(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-blocks.

    Each sub-block is followed by dropout, a residual addition and a
    normalisation layer.

    Args:
        num_heads: attention heads in the self-attention sub-block.
        d_model: feature size of the layer's input and output.
        ffn_hidden: hidden width of the feed-forward sub-block.
        max_seq_len: sequence length passed to the positional encoder.
        dropout_rate: dropout probability after each sub-block.
    """

    def __init__(self,num_heads, d_model, ffn_hidden, max_seq_len, dropout_rate):
        super(Encoder_Layer,self).__init__()
        # The residual addition in forward() requires the attention input and
        # output widths to match, so the input width is d_model. This replaces
        # a read of the notebook-level global `input_dim`.
        self.multihead_attention = Multihead_Self_Attention(d_model,d_model,num_heads)
        # Not used in forward(); kept so the attribute remains available.
        self.pos_encoding = PostionalEncoding(max_seq_len,d_model)
        self.feedforward = FeedForward(d_model,ffn_hidden)
        self.norm1 = NormalizationLayer([d_model])
        self.norm2 = NormalizationLayer([d_model])
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self,x):
        """Return the transformed ``x``; the last dimension stays d_model."""
        # Self-attention sub-block with residual connection.
        residual_x = x
        x = self.multihead_attention(x)
        x = self.dropout1(x)
        x = x + residual_x
        x = self.norm1(x)
        # Feed-forward sub-block with residual connection.
        residual_x = x
        x = self.feedforward(x)
        x = self.dropout2(x)
        x = x + residual_x
        x = self.norm2(x)
        return x

In [610]:
class Ecoder(nn.Module):
    """Stack of ``num_layers`` Encoder_Layer modules applied in sequence."""

    def __init__(self, d_model, ffn_hidden, num_heads, dropout_rate, num_layers):
        super().__init__()
        stack = []
        for _ in range(num_layers):
            # max_seq_len is read from the notebook-level configuration.
            stack.append(Encoder_Layer(num_heads, d_model, ffn_hidden, max_seq_len, dropout_rate))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through every encoder layer in order."""
        return self.layers(x)

## Decoder

In [611]:
class Decoder_Layer(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each sub-block is followed by dropout, a residual addition and a
    normalisation layer.

    Args:
        num_heads: attention heads in both attention sub-blocks.
        d_model: feature size of the layer's inputs and output.
        ffn_hidden: hidden width of the feed-forward sub-block.
        max_seq_len: accepted for signature parity with Encoder_Layer; not
            used inside this class.
        dropout_rate: dropout probability after each sub-block.
    """

    def __init__(self,num_heads, d_model, ffn_hidden, max_seq_len, dropout_rate):
        super(Decoder_Layer,self).__init__()
        # The residual additions in forward() require attention input and
        # output widths to match, so the input width is d_model. This replaces
        # reads of the notebook-level global `input_dim`.
        self.multihead_self_attention = Multihead_Self_Attention(d_model,d_model,num_heads)
        self.multihead_cross_attention = Multihead_Cross_Attention(d_model,d_model,num_heads)
        self.feedforward = FeedForward(d_model,ffn_hidden)
        self.norm1 = NormalizationLayer([d_model])
        self.norm2 = NormalizationLayer([d_model])
        self.norm3 = NormalizationLayer([d_model])
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dropout3 = nn.Dropout(dropout_rate)

    def forward(self, x, y):
        """Transform the decoder stream ``y`` using the encoder output ``x``.

        Returns the updated ``y``.
        """
        # Masked (causal) self-attention over the decoder stream.
        residual_y = y
        y = self.multihead_self_attention(y,mask = True)
        y = self.dropout1(y)
        y = y + residual_y
        y = self.norm1(y)

        # Cross-attention between encoder output and decoder stream.
        residual_y = y
        y = self.multihead_cross_attention(x,y)
        y = self.dropout2(y)
        y = y + residual_y
        y = self.norm2(y)

        # Position-wise feed-forward.
        residual_y = y
        y = self.feedforward(y)
        y = self.dropout3(y)
        y = y + residual_y
        y = self.norm3(y)

        return y

In [612]:
class Sequential_Decoder(nn.Sequential):
    """``nn.Sequential`` variant whose children take two inputs.

    Every child is called as ``child(x, y)``: ``x`` is passed unchanged to
    each child, while ``y`` is replaced by the child's return value.
    """

    def forward(self,*input):
        """Expect exactly two positional inputs ``(x, y)``; return the final ``y``."""
        context, running = input
        for layer in self:
            running = layer(context, running)
        return running

In [613]:
class Decoder(nn.Module):
    """Stack of ``num_layers`` Decoder_Layer modules applied in sequence."""

    def __init__(self, d_model, ffn_hidden, num_heads, dropout_rate, num_layers):
        super().__init__()
        stack = []
        for _ in range(num_layers):
            # max_seq_len is read from the notebook-level configuration.
            stack.append(Decoder_Layer(num_heads, d_model, ffn_hidden, max_seq_len, dropout_rate))
        self.layers = Sequential_Decoder(*stack)

    def forward(self, x, y):
        """Run ``y`` through every decoder layer, passing ``x`` to each one."""
        return self.layers(x, y)

## Tokenization

In [614]:
# Special tokens shared by both vocabularies below.
# Inserted at the front of a tokenized sentence when start_token is requested.
START_TOKEN = '<START>'
# Appended until a tokenized sentence reaches max_seq_len.
PADDING_TOKEN = '<PADDING>'
# Appended after the last character when end_token is requested.
END_TOKEN = '<END>' 

## Character-level Arabic vocabulary: START token, Arabic-script characters and
## digits, ASCII punctuation and whitespace, then the PADDING and END tokens.
## NOTE(review): 'ئ' appears to be listed twice in a row; a reverse
## (character -> index) map keeps only the later index - confirm this is intended.
arabic_voc = [START_TOKEN,'ا','ب','ت','ث','ج','ح','خ','د','ذ','ر','ز','س','ش','ص','ض','ط','ظ','ع','غ','ف','ق','ك','ل','م','ن','ه','و','ي','ء','آ','أ','ؤ','إ','ئ','ئ','ة','ـ','،','؛','؟','٠','١','٢','٣','٤','٥','٦','٧','٨','٩','٪','٫','٬','٭','ٮ','ٯ','ٰ','ٱ','ٲ','ٳ','ٴ','ٵ','ٶ','ٷ','ٸ','ٹ','ٺ','ٻ','ټ','ٽ','پ','ٿ','ڀ','ځ','ڂ','ڃ','ڄ','څ','چ','ڇ','ڈ','ډ','ڊ','ڋ','ڌ','ڍ','ڎ','ڏ','ڐ','ڑ','ڒ','ړ','ڔ','ڕ','ږ','ڗ','ژ','ڙ','ښ','ڛ','ڜ','ڝ','ڞ','ڟ','ڠ','ڡ','ڢ','ڣ','ڤ','ڥ','ڦ','ڧ','ڨ','ک','ڪ','ګ','ڬ','ڭ','ڮ','گ','ڰ','ڱ','ڲ','ڳ','ڴ','ڵ','ڶ','ڷ','ڸ','ڹ','ں','ڻ','ڼ','ڽ','ھ','ڿ','ۀ','ہ','ۂ','ۃ','ۄ','ۅ','ۆ','ۇ','ۈ','ۉ','ۊ','ۋ','ی','ۍ','ێ','ۏ','ې','ۑ','ے','ۓ','۔','ە','ۖ','ۗ','ۘ','ۙ','ۚ','ۛ','ۜ','۝','۞','۟','۠','ۡ','ۢ','ۣ','ۤ','ۥ','ۦ','ۧ',' ','!','"','#','$','%','&',"'",'(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[','\\',']','^','_','`','{','|','}','~',' ','\t','\n','\r','\x0b','\x0c',PADDING_TOKEN,END_TOKEN]
## Character-level English vocabulary: START token, ASCII letters and digits,
## ASCII punctuation and whitespace, then the PADDING and END tokens.
english_voc = [START_TOKEN, 'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','0','1','2','3','4','5','6','7','8','9','!','"','#','$','%','&',"'",'(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[','\\',']','^','_','`','{','|','}','~',' ','\t','\n','\r','\x0b','\x0c',PADDING_TOKEN,END_TOKEN]

In [615]:
# Forward (index -> character) and reverse (character -> index) lookup tables
# for each vocabulary. If a character occurs more than once in a vocabulary,
# the reverse map keeps the index of its last occurrence.
index_to_english = dict(enumerate(english_voc))
english_to_index = {symbol: position for position, symbol in enumerate(english_voc)}
index_to_arabic = dict(enumerate(arabic_voc))
arabic_to_index = {symbol: position for position, symbol in enumerate(arabic_voc)}

In [616]:
class SentenceEmbedding(nn.Module):
    """Character-level tokenizer plus embedding with positional encoding.

    Args:
        language_to_index: mapping from character / special token to its id.
        max_seq_len: fixed length every tokenized sentence is padded to.
        d_model: embedding width.
    """

    def __init__(self, language_to_index, max_seq_len, d_model):
        super(SentenceEmbedding,self).__init__()
        self.vocab_size = len(language_to_index)
        self.max_seq_len = max_seq_len
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PostionalEncoding(max_seq_len,d_model)
        # Dropout probability comes from the notebook-level `dropout_rate`.
        self.dropout = nn.Dropout(p=dropout_rate)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token=True, end_token=True):
        """Convert a sequence of sentences to a (len(batch), max_seq_len) id tensor.

        Each sentence is split into characters, optionally wrapped with the
        START / END ids, then right-padded with the PADDING id.

        Raises:
            KeyError: a character is missing from ``language_to_index``.
            ValueError: a tokenized sentence is longer than ``max_seq_len``.
        """

        def tokenize(sentence, start_token=True, end_token=True):
            ids = [self.language_to_index[token] for token in list(sentence)]
            if start_token:
                ids.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                ids.append(self.language_to_index[self.END_TOKEN])
            if len(ids) > self.max_seq_len:
                # Fail with a clear message instead of producing rows of
                # unequal length that break stacking or the later addition.
                raise ValueError(
                    f"Tokenized sentence has {len(ids)} ids, exceeding max_seq_len={self.max_seq_len}"
                )
            padding_id = self.language_to_index[self.PADDING_TOKEN]
            ids.extend([padding_id] * (self.max_seq_len - len(ids)))
            return torch.tensor(ids)

        return torch.stack([tokenize(sentence, start_token, end_token) for sentence in batch])

    def forward(self, x, start_token= True, end_token=True): 
        """Tokenize, embed, add positional encoding and apply dropout.

        Returns a (len(x), max_seq_len, d_model) tensor.
        """
        x = self.batch_tokenize(x ,start_token=start_token, end_token=end_token)
        # Token ids are created on the CPU; move them to the embedding's device.
        x = x.to(self.embedding.weight.device)
        x = self.embedding(x)
        # The positional encoder's forward() takes the tensor as its argument;
        # calling it with no argument raised a TypeError.
        pos = self.position_encoder(x).to(x.device)
        x = self.dropout(x + pos)
        return x

In [617]:
# Large negative value added to attention scores to suppress a position.
NEG_INFTY = -1e9

def create_masks(eng_batch, ar_batch, seq_len=None):
    """Build additive attention masks for a batch of sentence pairs.

    A position is treated as padding when its index is greater than the raw
    sentence length (index ``len(sentence)`` itself is left unmasked).

    Args:
        eng_batch: sequence of English sentences (encoder side).
        ar_batch: sequence of Arabic sentences (decoder side), same length.
        seq_len: padded sequence length; defaults to the notebook-level
            ``max_seq_len``.

    Returns:
        Tuple of three float tensors shaped (num_sentences, seq_len, seq_len)
        holding 0 where attention is allowed and ``NEG_INFTY`` where blocked:
        encoder self-attention, decoder self-attention (padding combined with
        the look-ahead mask) and decoder cross-attention.
    """
    if seq_len is None:
        seq_len = max_seq_len
    num_sentences = len(eng_batch)

    look_ahead_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
    encoder_padding_mask = torch.zeros(num_sentences, seq_len, seq_len, dtype=torch.bool)
    decoder_padding_mask_self_attention = torch.zeros_like(encoder_padding_mask)
    decoder_padding_mask_cross_attention = torch.zeros_like(encoder_padding_mask)

    for idx in range(num_sentences):
        eng_pad_start = len(eng_batch[idx]) + 1
        # Was np.arange(max_seq_len + 1, max_seq_len): always empty, so Arabic
        # padding was never masked. It must start from the sentence's length.
        ar_pad_start = len(ar_batch[idx]) + 1

        encoder_padding_mask[idx, :, eng_pad_start:] = True
        encoder_padding_mask[idx, eng_pad_start:, :] = True
        decoder_padding_mask_self_attention[idx, :, ar_pad_start:] = True
        decoder_padding_mask_self_attention[idx, ar_pad_start:, :] = True
        # Cross-attention: key columns follow the English padding, query rows
        # follow the Arabic padding.
        decoder_padding_mask_cross_attention[idx, :, eng_pad_start:] = True
        decoder_padding_mask_cross_attention[idx, ar_pad_start:, :] = True

    def _to_additive(blocked):
        # 0 where attention is allowed, NEG_INFTY where it is blocked.
        return torch.zeros(blocked.shape).masked_fill(blocked, NEG_INFTY)

    encoder_self_attention_mask = _to_additive(encoder_padding_mask)
    decoder_self_attention_mask = _to_additive(look_ahead_mask | decoder_padding_mask_self_attention)
    decoder_cross_attention_mask = _to_additive(decoder_padding_mask_cross_attention)

    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

## Loading The Dataset

In [618]:
# Load the tab-separated parallel corpus; the first row of the file is the header.
df = pd.read_csv('dataset/en_ar_final.tsv', sep='\t', header=0)

In [619]:
# Quick look at the raw frame: (rows, columns) followed by the first five rows.
print(df.shape)
df.head()

(1325899, 2)


Unnamed: 0,en,ar
0,and this,و هذه؟
1,it was um,...لقد كان
2,what is she doing here,ما الذي تفعله هناك؟
3,i dont like it,لا أحب ذلك
4,did you get the part,هل حصلت على جزء ?


In [620]:
class TextDataset(Dataset):
    """Dataset of (English, Arabic) sentence pairs matched by position."""

    def __init__(self, english_sentences, arabic_sentences):
        self.english_sentences, self.arabic_sentences = english_sentences, arabic_sentences

    def __len__(self):
        """Number of pairs, taken from the English side."""
        n_pairs = len(self.english_sentences)
        return n_pairs

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as an (english, arabic) tuple."""
        source = self.english_sentences[idx]
        target = self.arabic_sentences[idx]
        return source, target

## Data Processing

In [621]:
# Keep only complete, unique sentence pairs in which neither side is the empty
# string: drop rows with missing values, then duplicates, then empty strings.
df = (
    df
    .dropna()
    .drop_duplicates()
    .loc[lambda frame: (frame['en'] != '') & (frame['ar'] != '')]
)

In [622]:
# Pull each language column out as an array; row i of one array pairs with
# row i of the other.
english_sentences = df['en'].values
arabic_sentences = df['ar'].values

In [623]:
# Display the English sentence array.
english_sentences

array(['and this', 'it was um', 'what is she doing here', ...,
       'chinas quest for value', 'zimbabweu0027s last chance',
       'zuma rising'], dtype=object)

In [624]:
# Display the Arabic sentence array.
arabic_sentences

array(['و هذه؟', '...لقد كان', 'ما الذي تفعله هناك؟', ...,
       'الصين تفتش عن القيمة', 'زيمبابوي والفرصة الأخيرة',
       'صعود نجم زوما'], dtype=object)

In [625]:
# Spot-check that the first English and Arabic sentences line up.
print(english_sentences[0],arabic_sentences[0])

and this و هذه؟


In [626]:
# Longest sentence, in characters, per language as an (Arabic, English) tuple.
max(len(x) for x in arabic_sentences), max(len(x) for x in english_sentences)

(32, 242)

In [627]:
# Print the sentence length (in characters) at the chosen percentile for each
# language.
PERCENTILE = 91
print( f"{PERCENTILE}th percentile length Arabic: {np.percentile([len(x) for x in arabic_sentences], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in english_sentences], PERCENTILE)}" )

91th percentile length Arabic: 28.0
91th percentile length English: 44.0


In [628]:
# top = 1000000
# english_sentences = english_sentences[:top]
# arabic_sentences = arabic_sentences[:top]

In [629]:
def is_valid_tokens(sentence, vocab):
    """Return True when every distinct character of ``sentence`` is in ``vocab``.

    An empty sentence is valid.
    """
    return all(symbol in vocab for symbol in set(sentence))

def is_valid_length(sentence, max_seq_len):
    """Return True when ``sentence`` has fewer than ``max_seq_len - 1`` characters."""
    char_count = len(list(sentence))
    limit = max_seq_len - 1
    return char_count < limit

# Collect the positions of sentence pairs in which both sides are short enough
# and use only characters from their vocabulary.
valid_sentence_indicies = []
for index in range(len(arabic_sentences)):
    arabic_sentence = arabic_sentences[index]
    english_sentence = english_sentences[index]

    # Length checks come first, before the per-character vocabulary scan.
    if not (is_valid_length(arabic_sentence, max_seq_len)
            and is_valid_length(english_sentence, max_seq_len)):
        continue
    if not (is_valid_tokens(arabic_sentence, arabic_voc)
            and is_valid_tokens(english_sentence, english_voc)):
        continue
    valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(arabic_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 831789
Number of valid sentences: 547389


In [630]:
# Keep only the valid pairs. This overwrites both sentence collections, so
# re-running this cell alone would apply the old indices to already-filtered
# lists; re-run the extraction and filtering cells first.
arabic_sentences = [arabic_sentences[i] for i in valid_sentence_indicies]
english_sentences = [english_sentences[i] for i in valid_sentence_indicies]

In [631]:
# Rebuild the frame from the filtered pairs (replacing the earlier `df`) and
# write it to CSV without the index column.
df = pd.DataFrame({'en': english_sentences, 'ar': arabic_sentences})
df.to_csv('dataset/updated_en_ar_final.csv', index=False)

In [632]:
# Wrap the filtered sentence pairs in the Dataset defined earlier.
dataset = TextDataset(english_sentences, arabic_sentences)

In [635]:
# Sanity check of the dataset. Only the last expression of a cell is rendered,
# so the len(dataset) value is computed but not shown; dataset[0] is displayed.
len(dataset)
dataset[0]

('and this', 'و هذه؟')