In [563]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [12]:
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)
path_to_file = os.path.join(path_to_zip)+"/spa-eng/spa.txt"

In [6]:
path_to_file

'/root/.keras/datasets/spa-eng_extracted/spa-eng/spa.txt'

In [145]:
# with open(path_to_file,encoding='utf-8') as f:
#     df=f.read().strip().split('\n')

In [146]:
df[:10]

['Hi.\tمرحبًا.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #629296 (Samer)',
 'Run!\tاركض!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #1245450 (saeb)',
 'Duck!\tاخفض رأسك!\tCC-BY 2.0 (France) Attribution: tatoeba.org #280158 (CM) & #9036391 (KeEichi)',
 'Duck!\tاخفضي رأسك!\tCC-BY 2.0 (France) Attribution: tatoeba.org #280158 (CM) & #9036392 (KeEichi)',
 'Duck!\tاخفضوا رؤوسكم!\tCC-BY 2.0 (France) Attribution: tatoeba.org #280158 (CM) & #9036393 (KeEichi)',
 'Help!\tالنجدة!\tCC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #371293 (saeb)',
 'Jump!\tاقفز!\tCC-BY 2.0 (France) Attribution: tatoeba.org #1102981 (jamessilver) & #6009426 (damascene)',
 'Stop!\tقف!\tCC-BY 2.0 (France) Attribution: tatoeba.org #448320 (CM) & #1245447 (saeb)',
 'Stop!\tتوقف !\tCC-BY 2.0 (France) Attribution: tatoeba.org #448320 (CM) & #5496702 (Wildflower81)',
 'Wait!\tإنتظر\tCC-BY 2.0 (France) Attribution: tatoeba.org #1744314 (belgavox) & #5496709 (Wildflow

In [439]:
df[0].split('\t')

['Hi.',
 'مرحبًا.',
 'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #629296 (Samer)']

In [148]:
len(df)

12569

In [149]:
import re

In [150]:
re.sub(r"([?.!,¿])", r" \1 ", "Go.")

'Go . '

In [384]:
def preprocess_sentence(w):
    w = re.sub(r"([?.!،,¿])", r" \1 ", w)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    # w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = re.sub(r'\s+', ' ', w)
    w = w.lower().strip()
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [385]:
preprocess_sentence("hello from the other sidE.")

'<start> hello from the other side . <end>'

In [386]:
def create_dataset(path_to_file,num_examples=None):
    with open(path_to_file,encoding='utf-8') as f:
        df=f.read().strip().split('\n')
    pairs=[[preprocess_sentence(sent) for sent in line.split('\t')]for line in df[:num_examples]]
    return zip(*pairs)

In [387]:
eng,ara,_=create_dataset(path_to_file)

In [393]:
len(eng)

12569

In [394]:
eng[-1]

"<start> there are mothers and fathers who will lie awake after the children fall asleep and wonder how they'll make the mortgage , or pay their doctor's bills , or save enough for their child's college education . <end>"

In [395]:
ara[-1]

'<start> وهناك أمهات وآباء سيظلون مستيقظين بعد أن ينام أطفالهم ، يتساءلون عن كيف سيسددون أقساط الرهن العقاري الذي اشترَوْ به بيتهم ، وكيف سيدفعون فواتير أطبائهم ، أو توفير ما يحتاجونه من مال لتسديد رسوم تسجيل أبنائهم في الجامعات . <end>'

# Try another way

In [468]:
def custom_standardize(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, r"[^\w\s<>]", " ")
    text = tf.strings.regex_replace(text, r"\s+", " ")
    text = tf.strings.strip(text)
    return text
    
src_vectorizer = tf.keras.layers.TextVectorization(
    standardize=custom_standardize,

    output_mode="int",
    # output_sequence_length=30
)
src_vectorizer.adapt(eng)
input_tensor = src_vectorizer(eng)

In [469]:
print(f"\nEnglish vocab size: {len(src_vectorizer.get_vocabulary())}")
print(f"First 10 English tokens: {src_vectorizer.get_vocabulary()[:10]}")


English vocab size: 4342
First 10 English tokens: ['', '[UNK]', '<start>', '<end>', 'i', 'you', 'the', 'to', 'a', 'is']


In [470]:
def arabic_standardize(text):
    text = tf.strings.strip(text)                  # remove leading/trailing spaces
    text = tf.strings.lower(text)            
    text = tf.strings.regex_replace(text, r"\s+", " ")  
    text = tf.strings.strip(text)
    return text

In [471]:
tgt_vectorizer = tf.keras.layers.TextVectorization(
    standardize=arabic_standardize,
    output_mode="int",
        # output_sequence_length=30

)
tgt_vectorizer.adapt(ara)
target_tensor = tgt_vectorizer(ara)

print(f"\nArabic vocab size: {len(tgt_vectorizer.get_vocabulary())}")
print(f"First 10 Arabic tokens: {tgt_vectorizer.get_vocabulary()[:10]}")


Arabic vocab size: 13512
First 10 Arabic tokens: ['', '[UNK]', '<start>', '<end>', '.', 'توم', 'من', 'أن', 'لا', 'في']


In [472]:
decoder_inp = target_tensor[:, :-1]
decoder_tar = target_tensor[:, 1:]

eng_vocab = len(src_vectorizer.get_vocabulary())
ara_vocab = len(tgt_vectorizer.get_vocabulary())

# Tensorflow pipline

In [473]:
buffer_size=len(input_tensor) 
batch_size=64

In [596]:
dataset = (
    tf.data.Dataset.from_tensor_slices(((input_tensor, decoder_inp), decoder_tar))
    .shuffle(buffer_size)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [597]:
decoder_tar

<tf.Tensor: shape=(12569, 41), dtype=int64, numpy=
array([[ 6567,     4,     3, ...,     0,     0,     0],
       [ 4828,    26,     3, ...,     0,     0,     0],
       [12393,  1033,    26, ...,     0,     0,     0],
       ...,
       [ 3259,  2133,    15, ...,     0,     0,     0],
       [ 8524,    30,  8766, ...,     0,     0,     0],
       [ 5593, 12792,  5828, ..., 11885,     4,     3]])>

In [601]:
decoder_tar[0]

<tf.Tensor: shape=(41,), dtype=int64, numpy=
array([6567,    4,    3,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0])>

# Model Building & Training

In [658]:
from tensorflow.keras import Model,Input
from tensorflow.keras import layers

In [659]:
# Encoder
units=1024
embed_dim=256
# Encoder
encoder_inputs=Input(shape=(None,))
x=layers.Embedding(input_dim=eng_vocab,output_dim=embed_dim,name='Encoder_Embed')(encoder_inputs)
encoder_outputs ,state_h,state_c= layers.LSTM(units, return_sequences=True,return_state=True,name='Encoder_LSTM')(x)
encoder_states=[state_h,state_c]

In [660]:
# Decoder
decoder_inputs=Input(shape=(None,))
dec_embed=layers.Embedding(input_dim=ara_vocab,output_dim=embed_dim,name='Decoder_Embed')(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(units, return_sequences=True, return_state=True,name='Decoder_LSTM')
# decoder_dense = tf.keras.layers.Dense(ara_vocab, activation='softmax')

In [None]:
# Attention layer
attention = layers.AdditiveAttention(name="Attention_Layer")

# Compute decoder LSTM outputs
decoder_outputs, _, _ = decoder_lstm(dec_embed, initial_state=encoder_states)

# Apply attention between encoder outputs and decoder outputs
attention_out = attention([decoder_outputs, encoder_outputs])

# Concatenate context (attention) and decoder outputs
decoder_concat_input = layers.Concatenate(axis=-1, name="Concat_Layer")([decoder_outputs, attention_out])


# Final dense softmax layer
decoder_dense = layers.Dense(spa_vocab, activation="softmax", name='Output_Layer')
final_outputs = decoder_dense(decoder_concat_input)

# Seq2Seq Model with attention
model = Model([encoder_inputs, decoder_inputs], final_outputs)

In [667]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [668]:
model.summary()

In [669]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint(
    filepath='seq2seq_attention.weights.h5',
    # filepath='seq2seq_epoch_{epoch:02d}.h5',
    save_weights_only=True,
    save_freq='epoch'
)

In [670]:
# # from tensorflow.keras.models import load_model
# model.load_weights('/kaggle/working/seq2seq_attention.weights.h5')

In [672]:
model.fit(dataset, epochs=20, verbose=1,callbacks=[checkpoint])

Epoch 1/20
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 439ms/step - accuracy: 0.8920 - loss: 0.8374
Epoch 2/20
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 434ms/step - accuracy: 0.8962 - loss: 0.7636
Epoch 3/20
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 431ms/step - accuracy: 0.9006 - loss: 0.6844
Epoch 4/20
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 431ms/step - accuracy: 0.9050 - loss: 0.5974
Epoch 5/20
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 431ms/step - accuracy: 0.9126 - loss: 0.4929
Epoch 6/20
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 431ms/step - accuracy: 0.9237 - loss: 0.3890
Epoch 7/20
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 431ms/step - accuracy: 0.9369 - loss: 0.2976
Epoch 8/20
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 431ms/step - accuracy: 0.9503 - loss: 0.2200
Epoch 9/20
[1m1

<keras.src.callbacks.history.History at 0x7d9fd8179d10>

In [673]:
# model.fit(
#     x=[input_tensor,decoder_inp],y= tf.expand_dims(decoder_tar, -1),
#     batch_size=128,
#     epochs=30,
#     verbose=1
# )

# Inference model


In [674]:
# Encoder model
encoder_model=Model(encoder_inputs,[encoder_outputs, state_h, state_c])

In [675]:
# decoder model
decoder_state_input_h = tf.keras.Input(shape=(units,))
decoder_state_input_c = tf.keras.Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_hidden_state_input = tf.keras.Input(shape=(None, units), name="encoder_outputs")

In [676]:
# Get embedding for the current input word
dec_embed2 = model.get_layer("Decoder_Embed")(decoder_inputs)  #<start>
# Run LSTM one step
decoder_lstm = model.get_layer("Decoder_LSTM")
decoder_outputs2, state_h2, state_c2=decoder_lstm(dec_embed2,initial_state=decoder_states_inputs) #<states of sentence>

# Apply attention
attention = model.get_layer("Attention_Layer")
attention_out_inf = attention([decoder_outputs2, decoder_hidden_state_input])

# Concatenate and generate softmax output
concat_inf = model.get_layer("Concat_Layer")([decoder_outputs2, attention_out_inf])
dense = model.get_layer("Output_Layer")
decoder_outputs2 = dense(concat_inf)

# Final inference decoder model
decoder_model = Model(
    [decoder_inputs, decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2, state_h2, state_c2]
)


# dec_output=decoder_dense(dec_output)
# dec_states=[dec_h,dec_c]
# decoder_model=Model(inputs=[decoder_inputs] + decoder_states_inputs,outputs= [dec_output]+ dec_states)

In [677]:
decoder_model.summary()

In [678]:
# reverse_tgt_vocab = tgt_vectorizer.get_vocabulary()

In [679]:
# idx_to_word = dict(enumerate(reverse_tgt_vocab))
# word_to_idx = {w:i for i,w in idx_to_word.items()}

In [706]:
def decode_sequence(input_sentence,max_len=40):
    encoder_outs, h, c =encoder_model.predict(src_vectorizer([input_sentence]))
    # 
    vocab = tgt_vectorizer.get_vocabulary()
    start_idx = vocab.index("<start>")
    end_idx = vocab.index("<end>")

    target_seq = np.array([[start_idx]], dtype="int32")
    decoded_sentence = []
    for _ in range(max_len):
        output_tokens, h, c = decoder_model.predict([target_seq, encoder_outs, h, c],verbose=0)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = vocab[sampled_token_index]
        if sampled_word == "<end>":
            break

        decoded_sentence.append(sampled_word)
        target_seq = np.array([[sampled_token_index]], dtype="int32")

    return " ".join(decoded_sentence)

In [707]:
print(decode_sequence("please come to me   !"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
تعال من فضلك تعال إلى أي .


In [711]:
print(decode_sequence(" i am hungry"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
بالجوع


In [712]:
print(decode_sequence("please help me !"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
من فضلك إفعل لي .


In [718]:
print(decode_sequence("How are you"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
لك هذا؟


In [723]:
print(decode_sequence("please"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
يعد على أحداً تربيت؟


In [722]:
print(decode_sequence("My name is "))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
لنا


In [685]:
print(decode_sequence("i like you?"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
لك
