# **Implementing Many-to-Many RNN for English-to-Urdu Language Translation and Exploring Its Limitations**

# **Part 1:** Many-to-Many Recurrent Neural Network (RNN) Implementation

## Data Preparation:

In [8]:
import pandas as pd
from pathlib import Path
path = Path("")

# Load the workbook through `path` so the data directory is set in one place
# (Path("") / name resolves to the same file as './parallel-corpus.xlsx').
corpus_raw = pd.read_excel(path / 'parallel-corpus.xlsx')

# Keep only the first two columns (the SENTENCES / MEANING pair) and drop any
# row where either side is missing: an incomplete pair cannot be used for
# translation training, and NaN cells are not strings the tokenizer can handle.
df = corpus_raw.iloc[:, :2].dropna().reset_index(drop=True)

df.head()

Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


Step 1: Preprocess the Data

In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Sentence-boundary markers for the target language. translate_sentence()
# seeds decoding with 'starttoken' and stops on 'endtoken', so both words must
# be present in the Urdu vocabulary and in every Urdu training sequence.
START_TOKEN = 'starttoken'
END_TOKEN = 'endtoken'

# Urdu punctuation that Keras' default (ASCII-only) filter list does not strip:
# question mark, comma, full stop and semicolon. Without this, the same word
# with and without trailing punctuation becomes two separate vocabulary entries.
URDU_PUNCTUATION = '؟،۔؛'

# Coerce to str so a stray numeric cell from the spreadsheet cannot crash the
# tokenizer (it calls .lower() on every text).
eng_texts = df['SENTENCES'].astype(str)
urdu_texts = df['MEANING'].astype(str).map(
    lambda text: f'{START_TOKEN} {text} {END_TOKEN}'
)

# Tokenize the sentences
tokenizer_eng = Tokenizer()
tokenizer_urdu = Tokenizer(filters=tokenizer_eng.filters + URDU_PUNCTUATION)

tokenizer_eng.fit_on_texts(eng_texts)
tokenizer_urdu.fit_on_texts(urdu_texts)

eng_sequences = tokenizer_eng.texts_to_sequences(eng_texts)
urdu_sequences = tokenizer_urdu.texts_to_sequences(urdu_texts)

# Pad sequences (post-padding with 0) up to the longest sentence on each side.
# max_len_urdu counts the start/end markers as well.
max_len_eng = max(len(seq) for seq in eng_sequences)
max_len_urdu = max(len(seq) for seq in urdu_sequences)

eng_sequences = pad_sequences(eng_sequences, maxlen=max_len_eng, padding='post')
urdu_sequences = pad_sequences(urdu_sequences, maxlen=max_len_urdu, padding='post')

# Vocabulary sizes (+1 because index 0 is reserved for padding)
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_urdu = len(tokenizer_urdu.word_index) + 1

ModuleNotFoundError: No module named 'tensorflow'

Step 2: Prepare the Data for Training

In [None]:
# Hold out the tail of the corpus for validation: the first 80% of the rows
# (in their original order) are used for training, the remaining rows for
# validation.
train_size = int(len(eng_sequences) * 0.8)

# np.split at a single index yields exactly the [:train_size] / [train_size:] pair.
eng_train, eng_val = np.split(eng_sequences, [train_size])
urdu_train, urdu_val = np.split(urdu_sequences, [train_size])

Step 3: Build the RNN Model

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, SimpleRNN, Embedding, Dense

# Width of the embedding vectors and of the recurrent state (shared by the
# encoder and decoder so the encoder state can initialise the decoder).
EMBEDDING_DIM = 256
HIDDEN_UNITS = 256

# Encoder
# mask_zero=True makes the RNN skip the post-padding zeros, so the returned
# state is the one after the last real word rather than after a run of padding.
encoder_inputs = Input(shape=(max_len_eng,))
encoder_embedding = Embedding(vocab_size_eng, EMBEDDING_DIM, mask_zero=True)(encoder_inputs)
encoder_rnn = SimpleRNN(HIDDEN_UNITS, return_sequences=True, return_state=True)
encoder_outputs, state_h = encoder_rnn(encoder_embedding)

# Decoder
# The embedding layer is kept as an object (not only its output tensor) so the
# same weights can be reused by the single-step inference decoder below.
decoder_inputs = Input(shape=(max_len_urdu,))
decoder_embedding_layer = Embedding(vocab_size_urdu, EMBEDDING_DIM, mask_zero=True)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_rnn = SimpleRNN(HIDDEN_UNITS, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_rnn(decoder_embedding, initial_state=state_h)
decoder_dense = Dense(vocab_size_urdu, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the training model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Inference models used by translate_sentence(). They share every layer (and
# therefore every trained weight) with `model`.
# encoder_model: padded English sequence -> final encoder state.
encoder_model = Model(encoder_inputs, state_h)

# decoder_model: (one target token, previous state) -> (next-token
# probabilities, new state), so decoding can proceed one token at a time.
decoder_step_input = Input(shape=(1,))
decoder_state_input = Input(shape=(HIDDEN_UNITS,))
decoder_step_embedding = decoder_embedding_layer(decoder_step_input)
decoder_step_outputs, decoder_step_state = decoder_rnn(
    decoder_step_embedding, initial_state=decoder_state_input
)
decoder_step_probs = decoder_dense(decoder_step_outputs)
decoder_model = Model(
    [decoder_step_input, decoder_state_input],
    [decoder_step_probs, decoder_step_state],
)

model.summary()

Step 4: Train the Model

In [None]:
EPOCHS = 50
BATCH_SIZE = 64


def _shift_left(sequences):
    """Return a copy of `sequences` moved one step left along the time axis.

    Position t of the result holds token t+1 of the input; the last position is
    filled with 0 (the padding index).
    """
    shifted = np.zeros_like(sequences)
    shifted[:, :-1] = sequences[:, 1:]
    return shifted


# Prepare the target data.
# Teacher forcing: at step t the decoder is fed token t and must predict token
# t+1. Using the unshifted sequence as both input and target would only teach
# the network to copy its input.
# NOTE(review): this assumes each target sequence begins with the start marker
# that translate_sentence() feeds first ('starttoken'); without one, the first
# word of each sentence is never a prediction target - confirm in preprocessing.
urdu_train_out = np.expand_dims(_shift_left(urdu_train), -1)
urdu_val_out = np.expand_dims(_shift_left(urdu_val), -1)

# Train the model (the History object is kept so loss curves can be inspected).
history = model.fit(
    [eng_train, urdu_train], urdu_train_out,
    validation_data=([eng_val, urdu_val], urdu_val_out),
    epochs=EPOCHS, batch_size=BATCH_SIZE,
)

Step 5: Evaluate and Translate

In [None]:
# Function to translate a new sentence
def translate_sentence(sentence):
    """Greedily decode an Urdu translation for one English sentence.

    Relies on the notebook-level `tokenizer_eng`, `tokenizer_urdu`,
    `max_len_eng`, `max_len_urdu`, `encoder_model` and `decoder_model`.
    `encoder_model` must map a padded English sequence to the decoder's initial
    state; `decoder_model` must map `[token, state]` to
    `(token probabilities, new state)`.

    Args:
        sentence: English text to translate.

    Returns:
        The decoded Urdu words joined by single spaces. This is an empty string
        if the first predicted token is the end marker or padding.

    Raises:
        KeyError: if 'starttoken' is not in the Urdu tokenizer vocabulary.
    """
    sequence = tokenizer_eng.texts_to_sequences([sentence])
    sequence = pad_sequences(sequence, maxlen=max_len_eng, padding='post')

    # predict() may return a bare array or a one-element list depending on how
    # the encoder model's outputs were declared; normalise to (1, units).
    state = np.asarray(encoder_model.predict(sequence, verbose=0))
    state = state.reshape(1, state.shape[-1])

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_urdu.word_index['starttoken']

    decoded_words = []
    # Cap the output at max_len_urdu *tokens* so decoding always terminates.
    while len(decoded_words) < max_len_urdu:
        # Inputs are passed as an explicit two-element list; adding a Python
        # list to an ndarray would broadcast-add instead of concatenating.
        output_tokens, state = decoder_model.predict([target_seq, state], verbose=0)

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is padding and has no entry in index_word; treat it as end of output.
        sampled_word = tokenizer_urdu.index_word.get(sampled_token_index)

        if sampled_word is None or sampled_word == 'endtoken':
            break
        decoded_words.append(sampled_word)

        # Feed the token just produced back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ' '.join(decoded_words)

# Smoke-test the decoder on a single short English input.
sample_sentence = "hello"
sample_translation = translate_sentence(sample_sentence)
print(sample_translation)