In [1]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding, Dot, Concatenate, SimpleRNN, Dropout
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
import numpy as np

In [2]:
file = 'ara_eng.txt'

data = open('ara_eng.txt', "r", encoding = "UTF-8")
data = data.read().split('\n')

# Splitting the text into (English, Arabic) pairs
for i in range(0, len(data)):
  data[i] = data[i].split('\t')
  data[i] = list(map(str.lower, data[i]))

data[-1]

data.pop()

['']

In [3]:
# Storing the English and Arabic sentences in different lists
eng = [row[0] for row in data]
ara = [row[1] for row in data]

In [4]:
english_sentences = eng[:10000]
arabic_sentences = ara[:10000]
# Tokenizing the English sentences
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_sentences)
sequences_english = english_tokenizer.texts_to_sequences(english_sentences)

# Tokenizing the Arabic sentences
arabic_tokenizer = Tokenizer()
arabic_tokenizer.fit_on_texts(arabic_sentences)
sequences_arabic = arabic_tokenizer.texts_to_sequences(arabic_sentences)


# Finding the vocabulary size of English and Arabic
vocab_size_english = len(english_tokenizer.word_index) + 1
vocab_size_arabic = len(arabic_tokenizer.word_index) + 1

# Finding the maximum length of English and Arabic sentences
max_length_english = max(len(seq) for seq in sequences_english)
max_length_arabic = max(len(seq) for seq in sequences_arabic)


# Padding the English and Arabic sequences
padded_english = pad_sequences(sequences_english, maxlen = max_length_english, padding = 'post')
padded_arabic = pad_sequences(sequences_arabic, maxlen = max_length_arabic, padding = 'post')

print(f"English Vocabulary Size: {vocab_size_english}")
print(f"Arabic Vocabulary Size: {vocab_size_arabic}")
print(f"English Max Length: {max_length_english}")
print(f"Arabic Max Length: {max_length_arabic}")

English Vocabulary Size: 3631
Arabic Vocabulary Size: 10520
English Max Length: 11
Arabic Max Length: 14


In [5]:
# Reshaping the English and Arabic sequences 
padded_english = padded_english.reshape(*padded_english.shape, 1)
padded_arabic = padded_arabic.reshape(*padded_arabic.shape, 1)

print(padded_english.shape)
print(padded_arabic.shape)

(10000, 11, 1)
(10000, 14, 1)


In [6]:
# Defining the encoder-decoder model with attention

# Encoder
encoder_inputs = Input(shape = (max_length_english,))
encoder_embedding = Embedding(input_dim = vocab_size_english, output_dim = 128, input_length = max_length_english, mask_zero = True)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(64, return_sequences = True, return_state = True)(encoder_embedding)

# Decoder
decoder_inputs = Input(shape = (max_length_arabic,))
decoder_embedding = Embedding(input_dim = vocab_size_arabic, output_dim = 128)(decoder_inputs)
decoder_lstm = LSTM(64, return_sequences = True, return_state = True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state = [state_h, state_c])

# Attention Layer
attention = Dot(axes = [2, 2])([decoder_outputs, encoder_outputs])
attention = Activation('softmax')(attention)
context = Dot(axes = [2, 1])([attention, encoder_outputs])
concat_input = Concatenate(axis = -1)([context, decoder_outputs])

# Output Layer
decoder_dense = Dense(vocab_size_arabic, activation='softmax')
output = decoder_dense(concat_input)

# Model
model = Model([encoder_inputs, decoder_inputs], output)

# Compiling the model
lr = 0.01
model.compile(optimizer = Adam(learning_rate = lr), loss = sparse_categorical_crossentropy, metrics = ['accuracy'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 11)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 14)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 11, 128)      464768      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 14, 128)      1346560     ['input_2[0][0]']                
                                                                                              

In [7]:
# Training the model
batch_size = 32
num_batches = len(padded_english) // batch_size
test_size = 1000

padded_english_test = padded_english[:-test_size]
padded_arabic_test = padded_arabic[:-test_size]

epochs = 10

for epoch in range(epochs):
    total_loss = 0
    total_accuracy = 0
    
    for batch in range(num_batches):
        start = batch * batch_size
        end = start + batch_size
        english_batch = padded_english[start:end]
        arabic_batch = padded_arabic[start:end]
        
        metrics = model.train_on_batch([english_batch, arabic_batch], arabic_batch)
        loss = metrics[0]
        accuracy = metrics[1]
        
        total_loss += loss
        total_accuracy += accuracy
    
    avg_loss = total_loss / num_batches
    avg_accuracy = total_accuracy / num_batches
    
    test_loss, test_accuracy = model.evaluate([padded_english_test, padded_arabic_test], padded_arabic_test, verbose=0)
    
    print(f"Epoch: {epoch + 1}/{epochs} - Loss: {avg_loss} - Accuracy: {avg_accuracy} - Test Accuracy: {test_accuracy}")

model.save('RNN_ENCODER_DECODER.h5')

Epoch: 1/10 - Loss: 2.1159711678822837 - Accuracy: 0.7570756054841555 - Test Accuracy: 0.827150821685791
Epoch: 2/10 - Loss: 1.13325069214289 - Accuracy: 0.849845466705469 - Test Accuracy: 0.8830793499946594
Epoch: 3/10 - Loss: 0.6807916859785715 - Accuracy: 0.8993532520074111 - Test Accuracy: 0.9137460589408875
Epoch: 4/10 - Loss: 0.3967553907288955 - Accuracy: 0.9336152116839702 - Test Accuracy: 0.9425714015960693
Epoch: 5/10 - Loss: 0.21043648284215194 - Accuracy: 0.9629478582586998 - Test Accuracy: 0.9777539968490601
Epoch: 6/10 - Loss: 0.07248771717795768 - Accuracy: 0.9884744152808801 - Test Accuracy: 0.9970317482948303
Epoch: 7/10 - Loss: 0.019225011117016085 - Accuracy: 0.9970524318707294 - Test Accuracy: 0.9973571300506592
Epoch: 8/10 - Loss: 0.01665861874472541 - Accuracy: 0.9971883640839503 - Test Accuracy: 0.9982619285583496
Epoch: 9/10 - Loss: 0.00961310723654699 - Accuracy: 0.9983258980971116 - Test Accuracy: 0.9994285702705383
Epoch: 10/10 - Loss: 0.003393353548185237 - 

In [8]:
model = load_model('RNN_ENCODER_DECODER.h5')

In [9]:
def logits_to_sentence(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = ''
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [10]:
import random

index = random.randint(0, len(english_sentences) - 1)
print("Random Index:", index)

the_english_sentence = english_sentences[index]
the_arabic_sentence = arabic_sentences[index]
the_predicted_translation = logits_to_sentence(model.predict([padded_english[index:index + 1], padded_arabic[index:index + 1]])[0], arabic_tokenizer)

print("\n")
print(f"English Sentence: {the_english_sentence}")
print(f"Actual Arabic Sentence: {the_arabic_sentence}")
print("\n")
print(f"Predicted Arabic Sentence: {the_predicted_translation}")
print("\n")


Random Index: 2593


English Sentence: he looks suspicious.
Actual Arabic Sentence: يبدو كأنه شخص مثير للشك.


Predicted Arabic Sentence: يبدو كأنه شخص مثير للشك         


