In [5]:
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
import numpy as np
!pip install transformers
from transformers import TFAutoModel


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
file = 'ara_eng.txt'

data = open('ara_eng.txt', "r", encoding = "UTF-8")
data = data.read().split('\n')

# Splitting the text into (English, Arabic) pairs
for i in range(0, len(data)):
  data[i] = data[i].split('\t')
  data[i] = list(map(str.lower, data[i]))

data[-1]

data.pop()


['']

In [7]:
# Storing the English and Arabic sentences in different lists
eng = [row[0] for row in data]
ara = [row[1] for row in data]

In [8]:
english_sentences = eng[:10000]
arabic_sentences = ara[:10000]
# Tokenizing the English sentences
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_sentences)
sequences_english = english_tokenizer.texts_to_sequences(english_sentences)

# Tokenizing the Arabic sentences
arabic_tokenizer = Tokenizer()
arabic_tokenizer.fit_on_texts(arabic_sentences)
sequences_arabic = arabic_tokenizer.texts_to_sequences(arabic_sentences)


# Finding the vocabulary size of English and Arabic
vocab_size_english = len(english_tokenizer.word_index) + 1
vocab_size_arabic = len(arabic_tokenizer.word_index) + 1

# Finding the maximum length of English and Arabic sentences
max_length_english = max(len(seq) for seq in sequences_english)
max_length_arabic = max(len(seq) for seq in sequences_arabic)


# Padding the English and Arabic sequences
padded_english = pad_sequences(sequences_english, maxlen = max_length_english, padding = 'post')
padded_arabic = pad_sequences(sequences_arabic, maxlen = max_length_arabic, padding = 'post')

print(f"English Vocabulary Size: {vocab_size_english}")
print(f"Arabic Vocabulary Size: {vocab_size_arabic}")
print(f"English Max Length: {max_length_english}")
print(f"Arabic Max Length: {max_length_arabic}")


English Vocabulary Size: 3631
Arabic Vocabulary Size: 10520
English Max Length: 11
Arabic Max Length: 14


In [9]:
# Reshaping the English and Arabic sequences 
padded_english = padded_english.reshape(*padded_english.shape, 1)
padded_arabic = padded_arabic.reshape(*padded_arabic.shape, 1)

print(padded_english.shape)
print(padded_arabic.shape)

(10000, 11, 1)
(10000, 14, 1)


In [10]:
input_sequence = Input(shape=(max_length_english,), dtype='int32')
embedding = Embedding(input_dim=vocab_size_english, output_dim=128)(input_sequence)

transformer_model = TFAutoModel.from_pretrained("bert-base-uncased")

encoder_outputs = transformer_model(input_sequence)[0]

decoder_inputs = Input(shape=(max_length_arabic,), dtype='int32')
decoder_embedding = Embedding(input_dim=vocab_size_arabic, output_dim=128)(decoder_inputs)

encoder_lstm = LSTM(64, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(embedding)

decoder_lstm = LSTM(64, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

decoder_dense = Dense(vocab_size_arabic, activation='softmax')
output = decoder_dense(decoder_outputs)

enc_dec_model = Model([input_sequence, decoder_inputs], output)
optimizer = Adam(learning_rate=0.001)
enc_dec_model.compile(optimizer=optimizer, loss=sparse_categorical_crossentropy, metrics=['accuracy'])
enc_dec_model.summary()

batch_size = 32
num_batches = len(padded_english) // batch_size

val_size = 1000
eng_pad_val = padded_english[-val_size:]
ara_pad_val = padded_arabic[-val_size:]


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 11)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 14)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 11, 128)      464768      ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 14, 128)      1346560     ['input_4[0][0]']                
                                                                                            

In [11]:

num_epochs = 15
for epoch in range(num_epochs):
    total_loss = 0
    total_accuracy = 0
    
    for batch in range(num_batches):
        indices = np.random.choice(len(padded_english), size=batch_size, replace=False)
        eng_batch = padded_english[indices]
        ara_batch = padded_arabic[indices]

        loss, accuracy = enc_dec_model.train_on_batch([eng_batch, ara_batch], ara_batch)

        total_loss += loss
        total_accuracy += accuracy
    
    # Calculate validation loss and accuracy
    val_loss, val_accuracy = enc_dec_model.evaluate([eng_pad_val, ara_pad_val], ara_pad_val, verbose=0)
    avg_loss = total_loss / num_batches
    avg_accuracy = total_accuracy / num_batches

    print("Epoch: {}/{} - Avg. Loss: {:.4f} - Avg. Accuracy: {:.4f} - Val Loss: {:.4f} - Val Accuracy: {:.4f}".format(
        epoch + 1, num_epochs, avg_loss, avg_accuracy, val_loss, val_accuracy))

# Function to convert logits to a sentence
def logits_to_sentence(logits, tokenizer):
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = ''
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

# Example sentence translation
index = 14
print("The English sentence is: {}".format(english_sentences[index]))
print("The Arabic sentence is: {}".format(arabic_sentences[index]))
print('The predicted Arabic sentence is:')
predicted_sentence = logits_to_sentence(
    enc_dec_model.predict([padded_english[index:index + 1], padded_arabic[index:index + 1]])[0],
    arabic_tokenizer)
print(predicted_sentence)


Epoch: 1/15 - Avg. Loss: 3.3423 - Avg. Accuracy: 0.7079 - Val Loss: 3.3326 - Val Accuracy: 0.5663
Epoch: 2/15 - Avg. Loss: 2.0997 - Avg. Accuracy: 0.7144 - Val Loss: 3.1153 - Val Accuracy: 0.5700
Epoch: 3/15 - Avg. Loss: 1.9902 - Avg. Accuracy: 0.7244 - Val Loss: 2.9838 - Val Accuracy: 0.5866
Epoch: 4/15 - Avg. Loss: 1.8931 - Avg. Accuracy: 0.7301 - Val Loss: 2.8498 - Val Accuracy: 0.5970
Epoch: 5/15 - Avg. Loss: 1.7896 - Avg. Accuracy: 0.7411 - Val Loss: 2.6920 - Val Accuracy: 0.6157
Epoch: 6/15 - Avg. Loss: 1.6744 - Avg. Accuracy: 0.7634 - Val Loss: 2.5085 - Val Accuracy: 0.6579
Epoch: 7/15 - Avg. Loss: 1.5565 - Avg. Accuracy: 0.7831 - Val Loss: 2.3301 - Val Accuracy: 0.6822
Epoch: 8/15 - Avg. Loss: 1.4327 - Avg. Accuracy: 0.8036 - Val Loss: 2.1572 - Val Accuracy: 0.7072
Epoch: 9/15 - Avg. Loss: 1.3137 - Avg. Accuracy: 0.8222 - Val Loss: 1.9996 - Val Accuracy: 0.7341
Epoch: 10/15 - Avg. Loss: 1.2253 - Avg. Accuracy: 0.8364 - Val Loss: 1.8356 - Val Accuracy: 0.7599
Epoch: 11/15 - Avg.