<a href="https://colab.research.google.com/github/Mahmoud-Rady2000/Deep-Learning./blob/main/Seq2seq_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install kaggle

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!mkdir ~/.kaggle

In [3]:
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle

In [None]:
import os

os.system("chmod 600 ~/.kaggle/kaggle.json")

!kaggle datasets download -d samirmoustafa/arabic-to-english-translation-sentences

In [None]:
!unzip arabic-to-english-translation-sentences

---------------------------------------------------------------------------------------------------------------------

**Importing libraries and packages**


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense,LSTM,TimeDistributed,RepeatVector,GRU,Embedding
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

-----------------------------------------------------------------------------------------------------------------

**Loading dataset**

In [2]:
df=pd.read_csv("ara_eng.txt",delimiter="\t",names=["english","arabic"])

In [3]:
df.head()

Unnamed: 0,english,arabic
0,Hi.,مرحبًا.
1,Run!,اركض!
2,Help!,النجدة!
3,Jump!,اقفز!
4,Stop!,قف!


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24638 entries, 0 to 24637
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   english  24638 non-null  object
 1   arabic   24638 non-null  object
dtypes: object(2)
memory usage: 385.1+ KB


In [5]:
df.shape

(24638, 2)

In [6]:
# we get number of words in sentences to show some visuals
df["english_length"]=df["english"].apply(lambda x:len(x.split(" ")))

df["arabic_length"]=df["arabic"].apply(lambda x:len(x.split(" ")))
df.sample(5)


Unnamed: 0,english,arabic,english_length,arabic_length
5234,I walked from the station.,مشيت من المحطة.,5,3
15628,uganda soldiers for peace or oil global voices.,اوغندا جنود السلام ام البترول؟ الاصوات العالمية,8,7
7916,School begins at half past eight.,تبدأ المدرسة الساعة الثامنة و النصف.,6,6
1187,I'm left-handed.,أنا أيسر.,2,2
3246,The house is burning.,المنزل يحترق.,4,2


**Data preprocessing**

In [7]:
# english tokenizer | input tokenizer
english_tokenizer=Tokenizer()
english_tokenizer.fit_on_texts(df["english"])

In [8]:
vocab_size_english=len(english_tokenizer.word_index)
vocab_size_english

26062

In [9]:
english_word_2_idx=english_tokenizer.word_index
english_idx_2_word={idx:word for word,idx in english_word_2_idx.items()}
#print(english_idx_2_word)

In [10]:
# arabic tokenizer | output tokenizer
arabic_tokenizer=Tokenizer()
arabic_tokenizer.fit_on_texts(df["arabic"])

In [11]:
vocab_size_arabic=len(arabic_tokenizer.word_index)+1
vocab_size_arabic

57847

In [12]:
arabic_word_2_idx=arabic_tokenizer.word_index
arabic_idx_2_word={idx:word for word,idx in arabic_word_2_idx.items()}
#print(arabic_word_2_idx)

In [13]:
token_eng=english_tokenizer.texts_to_sequences(df["english"])
token_ara=arabic_tokenizer.texts_to_sequences(df["arabic"])
token_eng[0]

[3795]

In [14]:
token_eng[598]

[41, 12, 654]

In [15]:
padded_eng=pad_sequences(token_eng,maxlen=50,padding="post")
padded_ara=pad_sequences(token_ara,maxlen=50,padding="post")

In [16]:
padded_eng.shape

(24638, 50)

In [17]:
padded_ara.shape

(24638, 50)

In [18]:
padded_eng[0]

array([3795,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

**Model Building**

In [19]:
model=Sequential()
model.add(Embedding(vocab_size_english,100,input_length=50))    #Maps each word to a 100-dimensional vector, input sequences 50 words.
model.add(tf.keras.layers.Bidirectional(LSTM(units=256)))
model.add(tf.keras.layers.RepeatVector(50))   #Replicates the final output of the Bidirectional LSTM 50 times.
model.add(LSTM(256,return_sequences=True))   #256 units
model.add(TimeDistributed(Dense(vocab_size_arabic,activation="softmax")))

"""Applies a fully connected layer independently to each time step in the sequence.
Produces probabilities for each word in the 'Arabic' vocabulary for each input word.
Uses softmax activation to ensure probabilities add up to 1."""

"Applies a fully connected layer independently to each time step in the sequence.\nProduces probabilities for each word in the 'Arabic' vocabulary for each input word.\nUses softmax activation to ensure probabilities add up to 1."

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 100)           2606200   
                                                                 
 bidirectional (Bidirection  (None, 512)               731136    
 al)                                                             
                                                                 
 repeat_vector (RepeatVecto  (None, 50, 512)           0         
 r)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 50, 256)           787456    
                                                                 
 time_distributed (TimeDist  (None, 50, 57847)         14866679  
 ributed)                                                        
                                                        

In [21]:
opt=tf.keras.optimizers.RMSprop()
model.compile(loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
#if you will make a one hot encoding for the data we will use binary cross entropy insted of this loss function

**Splitting the dataset**

In [22]:
x_train,x_test,y_train,y_test=train_test_split(padded_eng,padded_ara,test_size=0.2,random_state=42)

In [24]:
x_train.shape,x_test.shape

((18478, 50), (6160, 50))

In [25]:
y_train.shape,y_test.shape

((18478, 50), (6160, 50))

In [34]:
x_train[0]

array([  75,  914,   39, 1533,    9, 1083,   29, 1911,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [35]:
x_test[0]

array([ 565,    1,  162,   50,  404,    6, 3839,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

**Model Training**

In [26]:
model.fit(x_train,y_train,batch_size=256,validation_split=0.2,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79f6547e51e0>

**Model Evaluation**

In [27]:
model.evaluate(x_test,y_test)



[2.6277832984924316, 0.7217045426368713]

In [28]:
from joblib import dump

# save the model
model_path = 'model.joblib'
dump(model, model_path)
print("Model saved successfully.")

Model saved successfully.


In [29]:
from joblib import load

# Load the saved model
loaded_model = load(model_path)
print("Model loaded successfully.")

Model loaded successfully.


In [78]:
# Get the first English sentence (already padded)
test_sentence = x_test[100]
# Reshape to add batch dimension (None, sequence_length, embedding_dim)
test_sentence = np.expand_dims(test_sentence, axis=0)

# Make prediction
preds = model.predict(test_sentence)




In [79]:
x_test[100]

array([10484,   353,  3565,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0], dtype=int32)

In [80]:
preds.shape

(1, 50, 57847)

In [81]:
preds[0].shape

(50, 57847)

In [82]:
predicts=[]
for i in preds:
    predicts.append(np.argmax(i[0]))

In [83]:

arabic_idx_2_word[0]="<pad>"

In [84]:
print([english_idx_2_word[w] for w in x_test[100] if w != 0])

['ostriches', "can't", 'fly']


In [85]:
print([arabic_idx_2_word[w] for w in y_test[100] if w != 0])

['النعام', 'لا', 'يمكنه', 'الطيران']
