# English-to-Arabic Translation Model

This project implements a sequence-to-sequence neural network with attention for translating English sentences into Arabic. The model is built with TensorFlow and Keras: a bidirectional LSTM encoder, an LSTM decoder, and Keras's built-in `Attention` layer applied over the encoder outputs to improve translation quality.


In [None]:
import pandas as pd

import numpy as np

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer



from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, TimeDistributed, Bidirectional, Concatenate

from tensorflow.keras.models import Model

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the tab-separated parallel corpus. Column names are supplied here, so
# every line of the file is read as data.
df = pd.read_csv(
    '/kaggle/input/arabic-to-english-translation-sentences/ara_eng.txt',
    sep='\t',
    names=['English', 'Arabic'],
    encoding='utf-8',
)

# Notebook display of the loaded frame.
df

In [None]:
# Shuffle every row (frac=1 samples the whole frame) and renumber the index
# from zero, discarding the old one.
df = (
    df.sample(frac=1)
    .reset_index(drop=True)
)

# Notebook display of the shuffled frame.
df

In [None]:
# English sentences are used as the encoder input without modification.
input_texts = df['English'].values

# Wrap every Arabic sentence in explicit boundary markers so the decoder has
# a token to start from and a token that signals completion.
target_texts = [
    "".join(("<start> ", sentence, " <end>"))
    for sentence in df['Arabic'].values
]

In [None]:
# English side: default Tokenizer settings; sequences are padded at the end.
tokenizer_in = Tokenizer()
tokenizer_in.fit_on_texts(input_texts)
input_sequences = pad_sequences(
    tokenizer_in.texts_to_sequences(input_texts),
    padding='post',
)

# Arabic side: character filtering is switched off so the '<start>' and
# '<end>' markers survive tokenization intact.
tokenizer_out = Tokenizer(filters='')
tokenizer_out.fit_on_texts(target_texts)
target_sequences = pad_sequences(
    tokenizer_out.texts_to_sequences(target_texts),
    padding='post',
)

# Vocabulary sizes: one more than the number of known words, because word
# indices start at 1 and index 0 is left for padding.
num_encoder_tokens = 1 + len(tokenizer_in.word_index)
num_decoder_tokens = 1 + len(tokenizer_out.word_index)


In [None]:
# Maximum sequence lengths. Both arrays have already been padded to a common
# width, so the width of each array is its maximum sequence length.
max_encoder_seq_length = int(input_sequences.shape[1])
max_decoder_seq_length = int(target_sequences.shape[1])

# Teacher forcing: at step t the decoder is fed token t and has to predict
# token t + 1.
#
# Every target sequence already begins with '<start>' (the marker is added to
# the text before tokenization), so it serves as the decoder input as-is.
# Shifting it right and inserting a second '<start>' would leave the input two
# steps behind the target, and the decoder would never be shown the word that
# immediately precedes the one it has to predict.
decoder_input_sequences = target_sequences.copy()

# Decoder target: the same sequence shifted left by one step. The final column
# has no successor and therefore stays 0 (padding).
decoder_target_sequences = np.zeros_like(target_sequences)
decoder_target_sequences[:, :-1] = target_sequences[:, 1:]

# Add a trailing axis of size 1 to the integer labels for use with
# sparse categorical crossentropy.
decoder_target_sequences = np.expand_dims(decoder_target_sequences, -1)

In [None]:


# Hyperparameters

# Size of the word vectors produced by both Embedding layers below.
EMBEDDING_DIM = 256

# Units per direction of the encoder LSTM; the decoder LSTM uses twice this.
HIDDEN_UNITS = 512



# Define the encoder

# Variable-length sequence of token ids (time dimension left as None).
encoder_inputs = Input(shape=(None,))

# NOTE(review): mask_zero is not set, so padding ids appear to be embedded
# and processed like ordinary tokens - confirm this is intended.
encoder_embedding = Embedding(num_encoder_tokens, EMBEDDING_DIM)(encoder_inputs)

# return_sequences=True keeps the per-step outputs that attention reads later;
# return_state=True additionally returns the final states of both directions.
encoder_lstm = Bidirectional(LSTM(HIDDEN_UNITS, return_state=True, return_sequences=True))

encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)

# Join the forward and backward final states so they match the size of the
# decoder LSTM (HIDDEN_UNITS * 2) and can be used as its initial state.
state_h = Concatenate()([forward_h, backward_h])

state_c = Concatenate()([forward_c, backward_c])

encoder_states = [state_h, state_c]



# Define the decoder

# Variable-length sequence of target token ids fed in during training.
decoder_inputs = Input(shape=(None,))

decoder_embedding = Embedding(num_decoder_tokens, EMBEDDING_DIM)(decoder_inputs)

# The layer object is kept in a variable because the inference decoder built
# later in the notebook calls the same layer again.
decoder_lstm = LSTM(HIDDEN_UNITS * 2, return_sequences=True, return_state=True)

# The decoder's own final states are not needed for training and are dropped.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)



# Attention mechanism

# Keras's built-in attention layer, called with [query, value]: the decoder
# outputs are the query and the encoder outputs are the value.
attention = tf.keras.layers.Attention()

attention_output = attention([decoder_outputs, encoder_outputs])



# Concatenate attention output and decoder LSTM output

# Joined along the last (feature) axis, one combined vector per decoder step.
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_output])



# Dense layer to generate predicted words

# Softmax over the target vocabulary, applied at every time step.
decoder_dense = TimeDistributed(Dense(num_decoder_tokens, activation='softmax'))

# decoder_outputs is rebound here: from now on it is the softmax output,
# not the raw LSTM output.
decoder_outputs = decoder_dense(decoder_concat_input)

In [None]:
# Training graph: (encoder token ids, decoder token ids) -> per-step
# probability distribution over the target vocabulary.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Targets are integer word ids, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Training schedule
BATCH_SIZE = 64
EPOCHS = 100

# A fifth of the samples is held out for validation.
model.fit(
    x=[input_sequences, decoder_input_sequences],
    y=decoder_target_sequences,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
)




In [None]:
# Inference encoder: maps a source sequence to the per-step encoder outputs
# plus the concatenated final states.
# NOTE: state_h / state_c must still be the encoder states at this point; they
# are rebound to the decoder's states further down in this cell, so this line
# has to run first.
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])



# Decoder model

# At inference time the decoder is run step by step, so its recurrent state is
# passed in explicitly instead of coming straight from the encoder graph.
decoder_state_input_h = Input(shape=(HIDDEN_UNITS * 2,))

decoder_state_input_c = Input(shape=(HIDDEN_UNITS * 2,))

# Placeholder for the encoder outputs that attention reads. Its time dimension
# is fixed to max_encoder_seq_length.
decoder_hidden_state_input = Input(shape=(max_encoder_seq_length, HIDDEN_UNITS * 2))



# The same decoder_lstm layer object as in the training graph is called again,
# this time seeded with the externally supplied states.
# This rebinds decoder_outputs, state_h and state_c to the decoder's tensors.
decoder_outputs, state_h, state_c = decoder_lstm(

    decoder_embedding, initial_state=[decoder_state_input_h, decoder_state_input_c]

)



# Same attention and output layers as in the training graph, with the encoder
# outputs now supplied through the placeholder.
attention_output = attention([decoder_outputs, decoder_hidden_state_input])

decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_output])

decoder_outputs = decoder_dense(decoder_concat_input)



# Inputs:  [target token ids, encoder outputs, state h, state c]
# Outputs: [word probabilities, updated state h, updated state c]
decoder_model = Model(

    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],

    [decoder_outputs] + [state_h, state_c]

)


In [None]:
# Translation function

def decode_sequence(input_seq):
    """Greedily decode one encoded source sentence into target-language text.

    Relies on the module-level ``encoder_model``, ``decoder_model``,
    ``tokenizer_out`` and ``max_decoder_seq_length``.

    Args:
        input_seq: A single padded source sequence of token ids (a batch of
            one), as produced by ``translate``.

    Returns:
        The predicted words joined by single spaces, without the '<start>'
        and '<end>' markers. Empty string if the first prediction already
        ends the sentence.
    """
    # Encode the source once; attention reads enc_out at every decoding step.
    # verbose=0 avoids printing a progress bar for every single-step predict.
    enc_out, h, c = encoder_model.predict(input_seq, verbose=0)

    # The decoder is fed one token at a time, starting with '<start>'.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_out.word_index['<start>']

    decoded_words = []

    # A bounded loop guarantees termination even if '<end>' is never predicted.
    for _ in range(max_decoder_seq_length):
        output_tokens, h, c = decoder_model.predict(
            [target_seq, enc_out, h, c], verbose=0
        )

        # Greedy choice: the most probable word at the last (only) time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Index 0 is padding and has no entry in index_word; look it up with
        # .get() so a padding prediction ends decoding instead of raising.
        sampled_word = tokenizer_out.index_word.get(sampled_token_index)
        if sampled_word is None or sampled_word == '<end>':
            break

        # Markers are excluded token by token. str.strip('<start> ') would
        # remove any of those *characters* from the ends of the sentence.
        if sampled_word != '<start>':
            decoded_words.append(sampled_word)

        # The prediction becomes the decoder input for the next step.
        target_seq[0, 0] = sampled_token_index

    return " ".join(decoded_words)




In [None]:
def translate(input_text):
    """Translate a single English sentence with the trained model.

    Args:
        input_text: The English sentence as a plain string.

    Returns:
        The decoded sentence returned by ``decode_sequence``.
    """
    # texts_to_sequences works on a batch, so the sentence is wrapped in a
    # one-element list; the result is then padded to the encoder's length.
    token_ids = tokenizer_in.texts_to_sequences([input_text])
    padded_ids = pad_sequences(
        token_ids,
        maxlen=max_encoder_seq_length,
        padding='post',
    )

    return decode_sequence(padded_ids)



# Example usage: translate one short English phrase and print the result.

input_text = "I'm sorry"

# Runs tokenization, padding and step-by-step decoding via translate().
translated_sentence = translate(input_text)

print("Translated sentence:", translated_sentence)
