In [1]:
import tensorflow as tf
import numpy as np
import pathlib
import io
import re
import unicodedata
import time
from tqdm import tqdm

In [2]:
!wget https://raw.githubusercontent.com/SamirMoustafa/nmt-with-attention-for-ar-to-en/master/ara_.txt -O ara_.txt
text_file = pathlib.Path('ara_.txt')

--2025-05-05 14:42:53--  https://raw.githubusercontent.com/SamirMoustafa/nmt-with-attention-for-ar-to-en/master/ara_.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 763396 (746K) [text/plain]
Saving to: ‘ara_.txt’


2025-05-05 14:42:53 (102 MB/s) - ‘ara_.txt’ saved [763396/763396]



In [3]:
def unicode_normalize(s):
    return ''.join(c for c in unicodedata.normalize('NFKC', s) if unicodedata.category(c) != 'Mn')

def preprocess_sentence(sentence, lang, is_target=False):
    sentence = unicode_normalize(sentence.strip())
    if lang == 'en':
        sentence = sentence.lower()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    if lang == 'en':
        sentence = re.sub(r"[^a-zA-Z?.!,¿\d]+", " ", sentence)
    elif lang == 'ar':
        sentence = re.sub(r"[^\u0600-\u06FF\d?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    if is_target:
        sentence = f'[START] {sentence} [END]'
    return sentence

In [4]:
text = io.open(text_file, encoding='UTF-8').read().strip().split('\n')

sentence_pairs = [line.split('\t') for line in text]
valid_pairs = [pair for pair in sentence_pairs if len(pair) == 2]
if len(valid_pairs) != len(sentence_pairs):
    print(f"Warning: {len(sentence_pairs) - len(valid_pairs)} lines ignored due to incorrect format.")
en_raw = [pair[0].strip() for pair in valid_pairs]
ar_raw = [pair[1].strip() for pair in valid_pairs]

en_processed = [preprocess_sentence(s, lang='en', is_target=True) for s in en_raw]
ar_processed = [preprocess_sentence(s, lang='ar', is_target=False) for s in ar_raw]

ar_lengths = [len(s.split(' ')) for s in ar_processed]
en_lengths = [len(s.split(' ')) for s in en_processed]
max_len_ar = max(ar_lengths)
max_len_en = max(en_lengths)
MAX_SEQUENCE_LENGTH = max(max_len_ar, max_len_en)

In [5]:
input_text_processor = tf.keras.layers.TextVectorization(standardize=None, ragged=True)
output_text_processor = tf.keras.layers.TextVectorization(standardize=None, ragged=True)
input_text_processor.adapt(ar_processed)
output_text_processor.adapt(en_processed)
INPUT_VOCAB_SIZE = input_text_processor.vocabulary_size()
TARGET_VOCAB_SIZE = output_text_processor.vocabulary_size()

BUFFER_SIZE = len(ar_processed)
BATCH_SIZE = 64

In [6]:
def prepare_batch(ar_batch, en_batch):
    ar_seq = input_text_processor(ar_batch)
    en_seq = output_text_processor(en_batch)
    en_input = en_seq[:, :-1]
    en_label = en_seq[:, 1:]
    ar_seq_padded = ar_seq.to_tensor(default_value=0, shape=[None, MAX_SEQUENCE_LENGTH])
    en_input_padded = en_input.to_tensor(default_value=0, shape=[None, MAX_SEQUENCE_LENGTH])
    en_label_padded = en_label.to_tensor(default_value=0, shape=[None, MAX_SEQUENCE_LENGTH])
    return (ar_seq_padded, en_input_padded), en_label_padded

dataset = tf.data.Dataset.from_tensor_slices((ar_processed, en_processed))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).map(prepare_batch, tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

train_size = int(0.9 * len(ar_processed))
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [7]:
def positional_encoding(length, depth):
    depth = depth / 2
    positions = np.arange(length)[:, np.newaxis]
    depths = np.arange(depth)[np.newaxis, :]/depth
    angle_rates = 1 / (10000**depths)
    angle_rads = positions * angle_rates
    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)
    return tf.cast(pos_encoding, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        super(PositionalEmbedding, self).__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

In [8]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        mask = tf.cast(mask, dtype=scaled_attention_logits.dtype)
        scaled_attention_logits += (mask * -1e9)
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)
    return output, attention_weights

In [9]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self,*, d_model, num_heads, dropout_rate=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.dropout_rate = dropout_rate
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)
        output = self.dropout(output)
        return output, attention_weights

In [10]:
class PointWiseFeedForwardNetwork(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate)
        ])

    def call(self, x, training=False):
        return self.seq(x, training=training)

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads, dropout_rate=dropout_rate)
        self.ffn = PointWiseFeedForwardNetwork(d_model=d_model, dff=dff, dropout_rate=dropout_rate)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training=False, mask=None):
        attn_output, _ = self.mha(x, x, x, mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model=d_model, num_heads=num_heads, dropout_rate=dropout_rate)
        self.mha2 = MultiHeadAttention(d_model=d_model, num_heads=num_heads, dropout_rate=dropout_rate)
        self.ffn = PointWiseFeedForwardNetwork(d_model=d_model, dff=dff, dropout_rate=dropout_rate)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, enc_output, training=False, look_ahead_mask=None, padding_mask=None):
        attn1, attn_weights_block1 = self.mha1(x, x, x, mask=look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)
        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, mask=padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)
        ffn_output = self.ffn(out2)
        out3 = self.layernorm3(ffn_output + out2)
        return out3, attn_weights_block1, attn_weights_block2

In [11]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.enc_layers = [EncoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate)
                          for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training=False, mask=None):
        seq_len = tf.shape(x)[1]
        x = self.pos_embedding(x)
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training=training, mask=mask)
        return x

class Decoder(tf.keras.layers.Layer):
    def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.dec_layers = [DecoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate)
                          for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, enc_output, training=False, look_ahead_mask=None, padding_mask=None):
        seq_len = tf.shape(x)[1]
        attention_weights = {}
        x = self.pos_embedding(x)
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training=training,
                                                  look_ahead_mask=look_ahead_mask,
                                                  padding_mask=padding_mask)
            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2
        return x, attention_weights

In [12]:
class Transformer(tf.keras.Model):
    def __init__(self, *, num_layers, d_model, num_heads, dff,
                 input_vocab_size, target_vocab_size, dropout_rate=0.1):
        super().__init__()
        self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                              num_heads=num_heads, dff=dff,
                              vocab_size=input_vocab_size, dropout_rate=dropout_rate)
        self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                              num_heads=num_heads, dff=dff,
                              vocab_size=target_vocab_size, dropout_rate=dropout_rate)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, training=False):
        inp, tar = inputs
        enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(inp, tar)
        enc_output = self.encoder(inp, training=training, mask=enc_padding_mask)
        dec_output, attention_weights = self.decoder(tar, enc_output, training=training,
                                                    look_ahead_mask=look_ahead_mask,
                                                    padding_mask=dec_padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output, attention_weights

transformer = Transformer(num_layers=4, d_model=128, num_heads=8,
                         dff=512, input_vocab_size=INPUT_VOCAB_SIZE, target_vocab_size=TARGET_VOCAB_SIZE,
                         dropout_rate=0.1)

In [13]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask

In [14]:
learning_rate = CustomSchedule(d_model=128)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

train_loss = tf.keras.metrics.Mean(name='train_loss')
val_loss = tf.keras.metrics.Mean(name='val_loss')

In [15]:
@tf.function
def train_step(inputs):
    (inp, tar_inp), tar_real = inputs
    with tf.GradientTape() as tape:
        predictions, _ = transformer([inp, tar_inp], training=True)
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss)

@tf.function
def val_step(inputs):
    (inp, tar_inp), tar_real = inputs
    predictions, _ = transformer([inp, tar_inp], training=False)
    loss = loss_function(tar_real, predictions)
    val_loss(loss)

In [22]:
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    'transformer_model.keras',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=False,
    mode='min',
    verbose=1
)

EPOCHS = 30
train_batches = len(list(train_dataset))
val_batches = len(list(val_dataset))

if transformer is None:
    raise ValueError("Transformer model was not properly initialized")

for epoch in range(EPOCHS):
    start_time = time.time()
    train_loss.reset_state()
    val_loss.reset_state()

    with tqdm(total=train_batches, desc=f'Epoch {epoch + 1}/{EPOCHS}', unit='batch') as pbar:
        for (batch, inputs) in enumerate(train_dataset):
            train_step(inputs)
            pbar.set_postfix({'Batch Loss': float(train_loss.result())})
            pbar.update(1)

    for inputs in val_dataset:
        val_step(inputs)

    epoch_time = time.time() - start_time
    print(f"Epoch {epoch + 1} | Train Loss: {train_loss.result():.4f} | Time: {epoch_time:.2f} sec")


    current_learning_rate = learning_rate(optimizer.iterations)
    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Learning Rate: {current_learning_rate:.6f}')

Epoch 1/30: 100%|██████████| 168/168 [00:11<00:00, 14.81batch/s, Batch Loss=1.37]


Epoch 1 | Train Loss: 1.3657 | Time: 11.46 sec
Epoch 1 Loss 1.3657 Learning Rate: 0.001233


Epoch 2/30: 100%|██████████| 168/168 [00:20<00:00,  8.20batch/s, Batch Loss=1.25]


Epoch 2 | Train Loss: 1.2487 | Time: 20.61 sec
Epoch 2 Loss 1.2487 Learning Rate: 0.001291


Epoch 3/30: 100%|██████████| 168/168 [00:20<00:00,  8.20batch/s, Batch Loss=1.15]


Epoch 3 | Train Loss: 1.1474 | Time: 20.60 sec
Epoch 3 Loss 1.1474 Learning Rate: 0.001350


Epoch 4/30: 100%|██████████| 168/168 [00:20<00:00,  8.20batch/s, Batch Loss=1.06]


Epoch 4 | Train Loss: 1.0577 | Time: 20.60 sec
Epoch 4 Loss 1.0577 Learning Rate: 0.001392


Epoch 5/30: 100%|██████████| 168/168 [00:11<00:00, 14.90batch/s, Batch Loss=0.954]


Epoch 5 | Train Loss: 0.9539 | Time: 11.38 sec
Epoch 5 Loss 0.9539 Learning Rate: 0.001364


Epoch 6/30: 100%|██████████| 168/168 [00:20<00:00,  8.20batch/s, Batch Loss=0.851]


Epoch 6 | Train Loss: 0.8514 | Time: 20.58 sec
Epoch 6 Loss 0.8514 Learning Rate: 0.001337


Epoch 7/30: 100%|██████████| 168/168 [00:11<00:00, 14.93batch/s, Batch Loss=0.752]


Epoch 7 | Train Loss: 0.7517 | Time: 11.42 sec
Epoch 7 Loss 0.7517 Learning Rate: 0.001312


Epoch 8/30: 100%|██████████| 168/168 [00:20<00:00,  8.19batch/s, Batch Loss=0.676]


Epoch 8 | Train Loss: 0.6760 | Time: 20.63 sec
Epoch 8 Loss 0.6760 Learning Rate: 0.001289


Epoch 9/30: 100%|██████████| 168/168 [00:20<00:00,  8.20batch/s, Batch Loss=0.594]


Epoch 9 | Train Loss: 0.5939 | Time: 20.59 sec
Epoch 9 Loss 0.5939 Learning Rate: 0.001266


Epoch 10/30: 100%|██████████| 168/168 [00:11<00:00, 14.83batch/s, Batch Loss=0.536]


Epoch 10 | Train Loss: 0.5359 | Time: 11.44 sec
Epoch 10 Loss 0.5359 Learning Rate: 0.001245


Epoch 11/30: 100%|██████████| 168/168 [00:11<00:00, 14.77batch/s, Batch Loss=0.491]


Epoch 11 | Train Loss: 0.4908 | Time: 11.47 sec
Epoch 11 Loss 0.4908 Learning Rate: 0.001225


Epoch 12/30: 100%|██████████| 168/168 [00:11<00:00, 14.87batch/s, Batch Loss=0.441]


Epoch 12 | Train Loss: 0.4413 | Time: 11.40 sec
Epoch 12 Loss 0.4413 Learning Rate: 0.001205


Epoch 13/30: 100%|██████████| 168/168 [00:11<00:00, 14.97batch/s, Batch Loss=0.404]


Epoch 13 | Train Loss: 0.4035 | Time: 11.33 sec
Epoch 13 Loss 0.4035 Learning Rate: 0.001187


Epoch 14/30: 100%|██████████| 168/168 [00:20<00:00,  8.20batch/s, Batch Loss=0.369]


Epoch 14 | Train Loss: 0.3692 | Time: 20.68 sec
Epoch 14 Loss 0.3692 Learning Rate: 0.001170


Epoch 15/30: 100%|██████████| 168/168 [00:20<00:00,  8.19batch/s, Batch Loss=0.343]


Epoch 15 | Train Loss: 0.3435 | Time: 20.60 sec
Epoch 15 Loss 0.3435 Learning Rate: 0.001153


Epoch 16/30: 100%|██████████| 168/168 [00:11<00:00, 14.95batch/s, Batch Loss=0.317]


Epoch 16 | Train Loss: 0.3170 | Time: 11.34 sec
Epoch 16 Loss 0.3170 Learning Rate: 0.001137


Epoch 17/30: 100%|██████████| 168/168 [00:20<00:00,  8.20batch/s, Batch Loss=0.297]


Epoch 17 | Train Loss: 0.2965 | Time: 20.59 sec
Epoch 17 Loss 0.2965 Learning Rate: 0.001121


Epoch 18/30: 100%|██████████| 168/168 [00:20<00:00,  8.20batch/s, Batch Loss=0.282]


Epoch 18 | Train Loss: 0.2817 | Time: 20.60 sec
Epoch 18 Loss 0.2817 Learning Rate: 0.001106


Epoch 19/30: 100%|██████████| 168/168 [00:11<00:00, 14.96batch/s, Batch Loss=0.26]


Epoch 19 | Train Loss: 0.2596 | Time: 11.33 sec
Epoch 19 Loss 0.2596 Learning Rate: 0.001092


Epoch 20/30: 100%|██████████| 168/168 [00:20<00:00,  8.19batch/s, Batch Loss=0.243]


Epoch 20 | Train Loss: 0.2431 | Time: 20.69 sec
Epoch 20 Loss 0.2431 Learning Rate: 0.001078


Epoch 21/30: 100%|██████████| 168/168 [00:11<00:00, 14.94batch/s, Batch Loss=0.23]


Epoch 21 | Train Loss: 0.2298 | Time: 11.36 sec
Epoch 21 Loss 0.2298 Learning Rate: 0.001065


Epoch 22/30: 100%|██████████| 168/168 [00:11<00:00, 14.93batch/s, Batch Loss=0.219]


Epoch 22 | Train Loss: 0.2194 | Time: 11.35 sec
Epoch 22 Loss 0.2194 Learning Rate: 0.001052


Epoch 23/30: 100%|██████████| 168/168 [00:20<00:00,  8.19batch/s, Batch Loss=0.208]


Epoch 23 | Train Loss: 0.2077 | Time: 20.60 sec
Epoch 23 Loss 0.2077 Learning Rate: 0.001040


Epoch 24/30: 100%|██████████| 168/168 [00:20<00:00,  8.19batch/s, Batch Loss=0.195]


Epoch 24 | Train Loss: 0.1952 | Time: 20.59 sec
Epoch 24 Loss 0.1952 Learning Rate: 0.001028


Epoch 25/30: 100%|██████████| 168/168 [00:20<00:00,  8.20batch/s, Batch Loss=0.188]


Epoch 25 | Train Loss: 0.1876 | Time: 20.60 sec
Epoch 25 Loss 0.1876 Learning Rate: 0.001017


Epoch 26/30: 100%|██████████| 168/168 [00:11<00:00, 14.89batch/s, Batch Loss=0.176]


Epoch 26 | Train Loss: 0.1763 | Time: 11.47 sec
Epoch 26 Loss 0.1763 Learning Rate: 0.001005


Epoch 27/30: 100%|██████████| 168/168 [00:11<00:00, 14.76batch/s, Batch Loss=0.173]


Epoch 27 | Train Loss: 0.1725 | Time: 11.54 sec
Epoch 27 Loss 0.1725 Learning Rate: 0.000995


Epoch 28/30: 100%|██████████| 168/168 [00:11<00:00, 15.01batch/s, Batch Loss=0.159]


Epoch 28 | Train Loss: 0.1595 | Time: 11.37 sec
Epoch 28 Loss 0.1595 Learning Rate: 0.000984


Epoch 29/30: 100%|██████████| 168/168 [00:20<00:00,  8.19batch/s, Batch Loss=0.154]


Epoch 29 | Train Loss: 0.1535 | Time: 20.60 sec
Epoch 29 Loss 0.1535 Learning Rate: 0.000974


Epoch 30/30: 100%|██████████| 168/168 [00:20<00:00,  8.20batch/s, Batch Loss=0.153]

Epoch 30 | Train Loss: 0.1533 | Time: 20.59 sec
Epoch 30 Loss 0.1533 Learning Rate: 0.000964





In [23]:
def evaluate(sentence, transformer, input_text_processor, output_text_processor, max_length=MAX_SEQUENCE_LENGTH):
    if transformer is None:
        raise ValueError("Transformer model is None in evaluate function")
    sentence = preprocess_sentence(sentence, lang='ar')
    input_tensor = input_text_processor([sentence])
    input_tensor = input_tensor.to_tensor(shape=[1, max_length])
    start_token = output_text_processor(['[START]']).numpy()[0][0]
    end_token = output_text_processor(['[END]']).numpy()[0][0]
    output_tokens = [start_token]
    for i in range(max_length):
        decoder_input = tf.expand_dims(output_tokens, 0)
        predictions, _ = transformer([input_tensor, decoder_input], training=False)
        next_token = tf.argmax(predictions[0, -1, :], axis=-1).numpy()
        if next_token == end_token:
            break
        output_tokens.append(next_token)
    predicted_tokens = tf.constant(output_tokens)
    predicted_words = output_text_processor.get_vocabulary()
    translated = ' '.join([predicted_words[token] for token in output_tokens[1:]])
    return translated

def translate(sentence, transformer):
    translation = evaluate(sentence, transformer, input_text_processor, output_text_processor)
    print(f'\n🟢 Arabic Input     : {sentence}')
    print(f'🔵 English Translation : {translation}')

In [27]:
translate("كيف حالك؟", transformer)
translate("أنا أحبك", transformer)
translate("هل تحب والدك؟", transformer)


🟢 Arabic Input     : كيف حالك؟
🔵 English Translation : how are you doing ?

🟢 Arabic Input     : أنا أحبك
🔵 English Translation : i love you .

🟢 Arabic Input     : هل تحب والدك؟
🔵 English Translation : do you like your father ?
