# **Implementing Many-to-Many RNN for English-to-Urdu Language Translation and Exploring Its Limitations**

# **Part 1:** Many-to-Many Recurrent Neural Network (RNN) Implementation

## Data Preparation:

In [None]:
import pandas as pd


# Read the parallel corpus, keep only its first two columns, and rename the
# 'SENTENCES ' header (note the trailing space) to 'SENTENCES' so later cells
# can address the column by its clean name.
df = (
    pd.read_excel('./parallel-corpus.xlsx')
    .iloc[:, :2]
    .rename(columns={'SENTENCES ': 'SENTENCES'})
)

# Preview the first rows.
df.head()

Step 1: Preprocess the Data

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# One independent word-level vocabulary per language.
tokenizer_eng = Tokenizer()
tokenizer_urdu = Tokenizer()

# Drop incomplete sentence pairs BEFORE casting to str: `astype(str)` would
# otherwise turn a missing cell into the literal text "nan", which the
# tokenizers would then learn as a real word and the model would train on.
# The cast itself is still needed so that non-string cells (e.g. numbers)
# can be tokenized.
df = (
    df.dropna(subset=['SENTENCES', 'MEANING'])
    .astype({'SENTENCES': str, 'MEANING': str})
    .reset_index(drop=True)
)

tokenizer_eng.fit_on_texts(df['SENTENCES'])
tokenizer_urdu.fit_on_texts(df['MEANING'])

eng_sequences = tokenizer_eng.texts_to_sequences(df['SENTENCES'])
urdu_sequences = tokenizer_urdu.texts_to_sequences(df['MEANING'])

# Longest sentence (in tokens) on each side of the corpus.
max_len_eng = max(len(seq) for seq in eng_sequences)
max_len_urdu = max(len(seq) for seq in urdu_sequences)

# Both languages are padded to the SAME length: the model emits one output
# token per input time step, so inputs and targets must be aligned.
max_len = max(max_len_eng, max_len_urdu)

# Post-padding appends index 0 after the real tokens.
eng_sequences = pad_sequences(eng_sequences, maxlen=max_len, padding='post')
urdu_sequences = pad_sequences(urdu_sequences, maxlen=max_len, padding='post')

# Vocabulary sizes (+1 because index 0 is used for padding, not for a word).
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_urdu = len(tokenizer_urdu.word_index) + 1

Step 2: Prepare the Data for Training

In [3]:
# Partition the padded sequences, in file order and without shuffling, into
# training (first 70%), test (next 15%) and validation (whatever remains).
train_size = int(len(eng_sequences) * 0.7)
test_size = int(len(eng_sequences) * 0.15)


def _split_at(sequences, cut):
    """Return the rows before index ``cut`` and the rows from ``cut`` onwards."""
    return sequences[:cut], sequences[cut:]


# English side (model inputs)
x_train, x_temp = _split_at(eng_sequences, train_size)
x_test, x_val = _split_at(x_temp, test_size)

# Urdu side (model targets), cut at the same positions so pairs stay aligned
y_train, y_temp = _split_at(urdu_sequences, train_size)
y_test, y_val = _split_at(y_temp, test_size)


In [None]:
# Show the shapes of the training inputs and the training targets side by side.
print(*(split.shape for split in (x_train, y_train)))

Step 3: Build the RNN Model

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
from keras_nlp.metrics import Bleu


# Build Model
# Token-aligned many-to-many network: every input time step is embedded, fed
# through a SimpleRNN that returns its state at each step, and mapped to a
# softmax distribution over the Urdu vocabulary.
model = Sequential(
    [
        Embedding(vocab_size_eng, 64),
        SimpleRNN(64, return_sequences=True),
        Dense(vocab_size_urdu, activation='softmax'),
    ]
)

# Compile Model
# NOTE(review): keras_nlp's Bleu metric scores translations supplied as
# strings. It cannot be attached to `compile`, where it would receive the
# integer target ids and the (batch, time, vocab) softmax output instead, so
# it is no longer passed in `metrics`. The object is still created in case a
# later cell refers to `bleu_metric` -- confirm and remove if unused. BLEU
# should be computed after training on decoded sentences.
bleu_metric = Bleu(tokenizer=tokenizer_urdu.texts_to_sequences)

# Per-token accuracy is the metric that matches integer targets and softmax
# outputs. No masking is applied, so padded positions (index 0) count too.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

# Train Model
model.fit(x_train, y_train, epochs=100, validation_data=(x_val, y_val))



In [None]:
# Score the trained model on the held-out test split; the returned values are
# displayed as the cell output.
model.evaluate(x=x_test, y=y_test)

In [None]:
# Translate function
def translate(text):
    """Translate one English sentence into Urdu with the trained model.

    Args:
        text: English sentence as a plain string. Words that
            ``tokenizer_eng`` has never seen are skipped, because the
            tokenizer was created without an out-of-vocabulary token.

    Returns:
        The predicted Urdu words joined by single spaces. Time steps where
        the model predicts index 0 (padding) are left out, so the result may
        be an empty string.
    """
    sequence = tokenizer_eng.texts_to_sequences([text])
    # Pad to `max_len`, the length every training input was padded to.
    # Padding to `max_len_eng` instead would cap the output at the longest
    # English sentence even though the Urdu targets can be longer.
    sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    prediction = model.predict(sequence)
    # Greedy decoding: take the most probable Urdu token at every time step.
    predicted_sequence = np.argmax(prediction, axis=-1)
    translated_text = ' '.join(
        tokenizer_urdu.index_word[idx] for idx in predicted_sequence[0] if idx != 0
    )
    return translated_text

# Smoke-test the translator on a single-word input.
sample_translation = translate('hello')
print(sample_translation)