# **URDU-ENGLISH TRANSLATION USING CUSTOM SEQ2SEQ MODELS [WITH AND WITHOUT ATTENTION]**

## **IMPORT REQUIRED PACKAGES AND LIBRARIES**

In [1]:
import numpy as np 
import pandas as pd 
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


## **IMPORT REQUIRED DATASET**

In [2]:
ds=pd.read_csv("/kaggle/input/bible-dataset-with-english-to-urdu-translation/bible.csv",names=['English','Urdu'])

In [3]:
ds.head()

Unnamed: 0,English,Urdu
0,"The book of the generation of Jesus Christ , t...",یسُوع مسیح ابن داود ابن ابرہام کا نسب نامہ
1,Abraham begat Isaac ; and Isaac begat Jacob ; ...,ابراہام سے اِضحاق پیدا ہُوا اور اِضحاق سے یعقو...
2,And Judas begat Phares and Zara of Thamar ; an...,اور یہوداہ سے فارص اور زارح تمر سے پیدا ہوئے ا...
3,And Aram begat Aminadab ; and Aminadab begat N...,اور رام سے عمینداب پیدا ہُوا اور عمینداب سے نح...
4,And Salmon begat Booz of Rachab ; and Booz beg...,اور سلمون سے بوعز راحب سے پیدا ہُوا اور بوعز س...


## **DATA PREPROCESSING**

In [4]:
ds=ds.dropna()

In [5]:
import re
def preprocess_sentence(s):
  s=s.lower()
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = s.strip()
  return s

In [6]:
ds['English_clean']=ds['English'].apply(preprocess_sentence)
ds['Urdu_clean']=ds['Urdu'].apply(preprocess_sentence)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds['English_clean']=ds['English'].apply(preprocess_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds['Urdu_clean']=ds['Urdu'].apply(preprocess_sentence)


In [7]:
def tag_target_sentences(sentences):
  tagged_sentences = map(lambda s: (' ').join(['<sos>', s, '<eos>']), sentences)
  return list(tagged_sentences)

In [8]:
input_=list(ds['English_clean'])
target=tag_target_sentences(list(ds['Urdu_clean']))

## **DATA TOKENIZATION**

In [9]:
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
source_tokenizer.fit_on_texts(input_)
encoder_vocab=len(source_tokenizer.word_index)+1
encoder_inputs = source_tokenizer.texts_to_sequences(input_)

In [10]:
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
target_tokenizer.fit_on_texts(target)
decoder_vocab=len(target_tokenizer.word_index)+1
sequences = target_tokenizer.texts_to_sequences(target)
decoder_inputs = [s[:-1] for s in sequences] # Drop the last token in the sentence.
decoder_targets = [s[1:] for s in sequences] # Drop the first token in the sentence.

In [11]:
target_tokenizer.sequences_to_texts(decoder_targets[0:5])

['یسُوع مسیح ابن داود ابن ابرہام کا نسب نامہ <eos>',
 'ابراہام سے اِضحاق پیدا ہُوا اور اِضحاق سے یعقوب پیدا ہُوا اور یعقوب سے یہوداہ اور اس کے بھائی پیدا ہوئے ۔ <eos>',
 'اور یہوداہ سے فارص اور زارح تمر سے پیدا ہوئے اور فارص سے حصرون پیدا ہُوا اور حصرون سے رام پیدا ہُوا ۔ <eos>',
 'اور رام سے عمینداب پیدا ہُوا اور عمینداب سے نحسون پیدا ہُوا اور نحسون سے سلمون پیدا ہُوا ۔ <eos>',
 'اور سلمون سے بوعز راحب سے پیدا ہُوا اور بوعز سے عوبید رُوت سے پیدا ہُوا اور عوبید سے یسّی پیدا ہُوا ۔ <eos>']

In [12]:
max_encoding_len = len(max(encoder_inputs, key=len))
max_decoding_len = len(max(decoder_inputs, key=len))

In [13]:
padded_encoder_inputs = pad_sequences(encoder_inputs, max_encoding_len, padding='post', truncating='post')
padded_decoder_inputs = pad_sequences(decoder_inputs, max_decoding_len, padding='post', truncating='post')
padded_decoder_targets = pad_sequences(decoder_targets, max_decoding_len, padding='post', truncating='post')

## **CREATING MODELS**

### **SEQ2SEQ MODEL WITHOUT ATTENTION**

In [14]:
embedding_dim = 128
hidden_dim = 256
default_dropout=0.2

In [15]:
from tensorflow import keras
from tensorflow.keras import layers

class Encoder(layers.Layer):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate=0.1):
        super(Encoder, self).__init__(name='Encoder')

        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.lstm = layers.LSTM(hidden_dim, return_state=True, dropout=dropout_rate)

    def call(self, inputs):
        embedding_output = self.embedding(inputs)
        encoder_outputs, state_h, state_c = self.lstm(embedding_output)
        encoder_states = [state_h, state_c]
        return encoder_outputs, encoder_states

class Decoder(layers.Layer):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate=0.1):
        super(Decoder, self).__init__(name='Decoder')

        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.lstm = layers.LSTM(hidden_dim, return_sequences=True, return_state=True, dropout=dropout_rate)
        self.dense = layers.Dense(vocab_size, activation='softmax')

    def call(self, inputs, initial_states):
        embedding_output = self.embedding(inputs)
        decoder_outputs, state_hh, state_cc = self.lstm(embedding_output, initial_state=initial_states)
        decoder_states = [state_hh, state_cc]
        logits = self.dense(decoder_outputs)
        return logits,decoder_states

class TransformerSeq2Seq(keras.Model):
    def __init__(self, encoder_vocab_size, decoder_vocab_size, embedding_dim, hidden_dim,encoder_tokenizer,decoder_tokenizer, dropout_rate=0.1):
        super(TransformerSeq2Seq, self).__init__(name='Main_model_layer')

        self.encoder = Encoder(encoder_vocab_size, embedding_dim, hidden_dim, dropout_rate)
        self.decoder = Decoder(decoder_vocab_size, embedding_dim, hidden_dim, dropout_rate)
        self.encoder_tokenizer=encoder_tokenizer
        self.decoder_tokenizer=decoder_tokenizer
    def call(self, inputs):
        encoder_input, decoder_inputs = inputs
        encoder_output, encoder_states = self.encoder(encoder_input)
        decoder_logits,decoder_states = self.decoder(decoder_inputs, encoder_states)
        return decoder_logits
    
    def predict(self,inputs):
        encoder_input,max_decoder_len = inputs
        encoder_input = self.encoder_tokenizer.texts_to_sequences([encoder_input])
        max_encoding_len = len(max(encoder_input, key=len))
        encoder_input = pad_sequences(encoder_input, maxlen=max_encoding_len, padding='post')
        encoder_outputs, encoder_states = self.encoder(encoder_input)
        batch_size = int(tf.shape(encoder_input)[0])
        decoder_states = encoder_states
        current_word = '<sos>'
        start_token_id=self.decoder_tokenizer.word_index[current_word]
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = self.decoder_tokenizer.word_index[current_word]
        predictions = []
        count=10
        for _ in range(100):
            target_seq = np.zeros((1,1))
            target_seq[0, 0] = self.decoder_tokenizer.word_index[current_word]
            decoder_logits,decoder_state = self.decoder(target_seq, decoder_states)
            decoder_states=decoder_state
            decoder_predictions = tf.argmax(decoder_logits, axis=-1)
            if int(decoder_predictions[0])!=0:
                current_word = self.decoder_tokenizer.index_word[int(decoder_predictions[0])]
                predictions.append(current_word)
                if (current_word == '<eos>'):
                  break
            else:
                current_word = self.decoder_tokenizer.index_word[1]
                predictions.append(current_word)
        return predictions


model = TransformerSeq2Seq(encoder_vocab, decoder_vocab, embedding_dim, hidden_dim,source_tokenizer,target_tokenizer, 0.1)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',  metrics='sparse_categorical_accuracy')

#### **TRAINING MODEL**

In [16]:
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

history = model.fit([padded_encoder_inputs, padded_decoder_inputs], padded_decoder_targets,
                     batch_size=16,
                     epochs=30,
                     callbacks=[es_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


#### **MODEL INFERENCE**

In [17]:
model.predict([ds['English_clean'][10], max_decoding_len])

['اور',
 'سردار',
 'کاہِن',
 'اور',
 'فقِیہ',
 'اور',
 'فقِیہ',
 'اور',
 'اُن',
 'کے',
 'تختے',
 'اور',
 'کبُوتر',
 'فروشوں',
 'کی',
 'چوکیاں',
 'اُلٹ',
 'دِیں',
 '۔',
 '<eos>']

### **SEQ2SEQ MODEL WITH ATTENTION**

In [19]:
from tensorflow import keras
from tensorflow.keras import layers

class Encoder(layers.Layer):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate=0.1):
        super(Encoder, self).__init__(name="Encoder")
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.lstm = layers.LSTM(hidden_dim, return_state=True, dropout=dropout_rate)
    def call(self, inputs):
        embedding_output = self.embedding(inputs)
        encoder_outputs, state_h, state_c = self.lstm(embedding_output)
        encoder_states = [state_h, state_c]
        return encoder_outputs, encoder_states


class LuongAttention(layers.Layer):
  def __init__(self, hidden_dim):
    super(LuongAttention, self).__init__()
    self.w = layers.Dense(hidden_dim, name='encoder_outputs_dense')

  def call(self, inputs):
    encoder_output_seq, decoder_output = inputs
    z = self.w(encoder_output_seq)  #make encoder and decoder seq_vectors of same length
    attention_scores = tf.matmul(decoder_output, z, transpose_b=True)
     
    #[ [d11,d12,d13]   *   [ [e11 e21  e31]    =    [ [a11  a12 a13]
    #  [d21,d22,d23]  *      [e12 e22 e23]     =      [a21 a22 a23]
    # [d31,d32,d33] ] *      [e13 e32 e33] ]    =     [a31 a32 a33] ]
    
    attention_weights = tf.keras.activations.softmax(attention_scores, axis=-1)
    
    #[ [0.2,0.3,0.5]   
    #  [0.4,0.3,0.3] 
    # [0.8,0.1,0.1] ]
    
    # [a1,a2,a3] *
    context = tf.matmul(attention_weights, encoder_output_seq)
    #[ [a11*d11 + a12*d21 + a13*d31    a21*d12 + a12*d22 + a23*d32    ]

    return context


class Decoder(layers.Layer):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate=0.1):
        super(Decoder, self).__init__(name="")

        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.lstm = layers.LSTM(hidden_dim, return_sequences=True, return_state=True, dropout=dropout_rate)
        self.attention = LuongAttention(hidden_dim)
        self.w = layers.Dense(hidden_dim, activation='tanh', name='attended_outputs_dense')
        self.dense = layers.Dense(vocab_size, activation='softmax')

    def call(self,inputs):
        input_, initial_states, encoder_outputs=inputs
        embedding_output = self.embedding(input_)
        decoder_outputs, state_h, state_c = self.lstm(embedding_output, initial_state=initial_states)
        contexts = self.attention([encoder_outputs, decoder_outputs])
        decoder_outputs= self.w(tf.concat([contexts, decoder_outputs], axis= -1))
        logits = self.dense(decoder_outputs)
        return logits,[state_h,state_c]


class TransformerSeq2Seq(keras.Model):
    def __init__(self, encoder_vocab_size, decoder_vocab_size, embedding_dim, hidden_dim,encoder_tokenizer,decoder_tokenizer, dropout_rate=0.1):
        super(TransformerSeq2Seq, self).__init__(name='Main_model_layer')

        self.encoder = Encoder(encoder_vocab_size, embedding_dim, hidden_dim, dropout_rate)
        self.decoder = Decoder(decoder_vocab_size, embedding_dim, hidden_dim, dropout_rate)
        self.encoder_tokenizer=encoder_tokenizer
        self.decoder_tokenizer=decoder_tokenizer

    def call(self, inputs):
        encoder_in, decoder_inputs = inputs
        encoder_outputs, encoder_states = self.encoder(encoder_in)
        decoder_logits,decoder_states = self.decoder([decoder_inputs, encoder_states, encoder_outputs])
        return decoder_logits
    
    def predict(self,inputs):
        encoder_input,max_decoder_len = inputs
        encoder_input = self.encoder_tokenizer.texts_to_sequences([encoder_input])
        max_encoding_len = len(max(encoder_input, key=len))
        encoder_input = pad_sequences(encoder_input, maxlen=max_encoding_len, padding='post')
        encoder_outputs, encoder_states = self.encoder(encoder_input)
        decoder_states = encoder_states
        current_word = '<sos>'
        predictions = []
        count=10
        for _ in range(100):
            target_seq = np.zeros((1,1))
            target_seq[0, 0] = self.decoder_tokenizer.word_index[current_word]
            decoder_logits,decoder_state = self.decoder([target_seq, decoder_states,encoder_outputs])
            decoder_states=decoder_state
            decoder_predictions = tf.argmax(decoder_logits, axis=-1)
            if int(decoder_predictions[0])!=0:
                current_word = self.decoder_tokenizer.index_word[int(decoder_predictions[0])]
                predictions.append(current_word)
                if (current_word == '<eos>'):
                  break
            else:
                current_word = self.decoder_tokenizer.index_word[1]
                predictions.append(current_word)
        return predictions


model1 = TransformerSeq2Seq(encoder_vocab, decoder_vocab, embedding_dim, hidden_dim,source_tokenizer,target_tokenizer, 0.1)
model1.compile(optimizer='adam', loss='sparse_categorical_crossentropy',  metrics='sparse_categorical_accuracy')

#### **MODEL TRAINING**

In [20]:
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

history = model1.fit([padded_encoder_inputs, padded_decoder_inputs], padded_decoder_targets,
                     batch_size=4,
                     epochs=30,
                     callbacks=[es_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


#### **MODEL INFERENCE**

In [21]:
model1.predict([ds['English_clean'][10], max_decoding_len])

['خُداوند',
 'کے',
 'نام',
 'یعقُوب',
 'کے',
 'لِئے',
 'سر',
 'ٹھہرا',
 'اور',
 'اُن',
 'کی',
 'رُوحوں',
 'اور',
 'جلال',
 'میں',
 'بھی',
 'رُسولوں',
 'کے',
 'نزدِیک',
 'پہُنچ',
 'گئے',
 '۔',
 '<eos>']

#### **SAVE MODEL**

In [23]:
model.save_weights("translation_model.h5")

## **CREATING INFERENCE MODEL FOR THE TRAINED MODELS (OPTIONAL)**

In [24]:
class InferenceTransformerSeq2Seq(keras.Model):
    def __init__(self, encoder_vocab_size, decoder_vocab_size, embedding_dim, hidden_dim, dropout_rate,encoder_tokenizer,decoder_tokenizer):
        super(InferenceTransformerSeq2Seq, self).__init__()
        self.encoder = Encoder(encoder_vocab_size, embedding_dim, hidden_dim, dropout_rate)
        self.decoder = Decoder(decoder_vocab_size, embedding_dim, hidden_dim, dropout_rate)
        self.encoder_tokenizer=encoder_tokenizer
        self.decoder_tokenizer=decoder_tokenizer

    def call(self, inputs):
        encoder_input,max_decoder_len = inputs
        encoder_input = self.encoder_tokenizer.texts_to_sequences(encoder_input)
        max_encoding_len = len(max(encoder_input, key=len))
        encoder_input = pad_sequences(encoder_input, maxlen=max_encoding_len, padding='post')
        encoder_outputs, encoder_states = self.encoder(encoder_input)
        decoder_states = encoder_states
        current_word = '<sos>'
        predictions = []
        count=10
        for _ in range(100):
            target_seq = np.zeros((1,1))
            target_seq[0, 0] = self.decoder_tokenizer.word_index[current_word]
            decoder_logits,decoder_state = self.decoder([target_seq, decoder_states,encoder_outputs])
            decoder_states=decoder_state
            decoder_predictions = tf.argmax(decoder_logits, axis=-1)
            if int(decoder_predictions[0])!=0:
                current_word = self.decoder_tokenizer.index_word[int(decoder_predictions[0])]
                predictions.append(current_word)
                if (current_word == '<eos>'):
                  break
            else:
                current_word = self.decoder_tokenizer.index_word[1]
                predictions.append(current_word)
        return predictions


## **INFERENCE MODEL FOR SEQ2SEQ WITHOUT ATTENTION**

In [25]:
inference_model_without_attention = InferenceTransformerSeq2Seq(encoder_vocab, decoder_vocab, embedding_dim, hidden_dim, 0.1,source_tokenizer,target_tokenizer)

In [26]:
encoder_input = [ds['English_clean'][2]]  # Prepare your encoder inputs
_ = inference_model_without_attention([encoder_input, 20])

encoder_weights = model.get_layer('Encoder').get_weights()
inference_model_without_attention.get_layer('Encoder').set_weights(encoder_weights)
decoder_weights = model.get_layer('Encoder').get_weights()
inference_model_without_attention.get_layer('Encoder').set_weights(decoder_weights)

predictions = inference_model_without_attention([encoder_input, max_decoding_len])

In [27]:
" ".join(predictions)

'پکار پکار پکار یِسُور یِسُور مُتعلّق مُتعلّق مُتعلّق مُتعلّق مُتعلّق بُلا بُلا بُلا بُلا بُلا بُلا بُلا بُلا بُلا بُلا ٹل ٹل بُلا ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل ٹل'

In [28]:
inference_model_without_attention.summary()

Model: "inference_transformer_seq2_seq"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Encoder (Encoder)           multiple                  1157760   
                                                                 
  (Decoder)                  multiple                  4098325   
                                                                 
Total params: 5,256,085
Trainable params: 5,256,085
Non-trainable params: 0
_________________________________________________________________


## **INFERENCE MODEL FOR SEQ2SEQ WITH ATTENTION**

In [29]:
inference_model_with_attention = InferenceTransformerSeq2Seq(encoder_vocab, decoder_vocab, embedding_dim, hidden_dim, 0.1,source_tokenizer,target_tokenizer)

In [30]:
encoder_input = [ds['English_clean'][2]]  # Prepare your encoder inputs
_ = inference_model_with_attention([encoder_input, 20])

encoder_weights = model1.get_layer('Encoder').get_weights()
inference_model_with_attention.get_layer('Encoder').set_weights(encoder_weights)
decoder_weights = model1.get_layer('Encoder').get_weights()
inference_model_with_attention.get_layer('Encoder').set_weights(decoder_weights)

predictions = inference_model_with_attention([encoder_input, max_decoding_len])

In [31]:
" ".join(predictions)

'حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں حرفوں'

In [32]:
inference_model_with_attention.summary()

Model: "inference_transformer_seq2_seq_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Encoder (Encoder)           multiple                  1157760   
                                                                 
  (Decoder)                  multiple                  4098325   
                                                                 
Total params: 5,256,085
Trainable params: 5,256,085
Non-trainable params: 0
_________________________________________________________________
