<a href="https://colab.research.google.com/github/Holy-Morphism/Urdu-RNN/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Implementing Many-to-Many RNN for English-to-Urdu Language Translation and Exploring Its Limitations**

# **Part 1:** Many-to-Many Recurrent Neural Network (RNN) Implementation

## Data Preparation:

### Loading the Data:

In [3]:
import pandas as pd

df = pd.read_excel('./parallel-corpus.xlsx')

# Keep only the first two columns
df = df.iloc[:, :2]

df.rename(columns = {'SENTENCES ':'SENTENCES'}, inplace = True)

df = df.dropna()
df = df[df['SENTENCES'].str.len() > 3]

df.head()

Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


### Preprocess data in both English and Urdu

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the sentences
tokenizer_eng = Tokenizer()
tokenizer_urdu = Tokenizer()

# Convert the 'SENTENCES' and 'MEANING' columns to string type before fitting the tokenizer
df['SENTENCES'] = df['SENTENCES'].astype(str)
df['MEANING'] = df['MEANING'].astype(str)

# Fit the tokenizers on English and Urdu texts
tokenizer_eng.fit_on_texts(df['SENTENCES'])
tokenizer_urdu.fit_on_texts(df['MEANING'])

# Convert texts to sequences
eng_sequences = tokenizer_eng.texts_to_sequences(df['SENTENCES'])
urdu_sequences = tokenizer_urdu.texts_to_sequences(df['MEANING'])

# Pad sequences to the same length
max_len_eng = max(len(seq) for seq in eng_sequences)
max_len_urdu = max(len(seq) for seq in urdu_sequences)
max_len = max(max_len_eng, max_len_urdu)

eng_sequences = pad_sequences(eng_sequences, maxlen=max_len, padding='post')
urdu_sequences = pad_sequences(urdu_sequences, maxlen=max_len, padding='post')

# Vocabulary sizes
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_urdu = len(tokenizer_urdu.word_index) + 1


### Split the dataset into training, validation, and test sets

In [5]:
# Split the data into training, validation, and test sets
train_size = int(len(eng_sequences) * 0.7)
test_size = int(len(eng_sequences) * 0.15)

# For English (input) sequences
x_train, x_temp = eng_sequences[:train_size], eng_sequences[train_size:]
x_test, x_val = x_temp[:test_size], x_temp[test_size:]

# For Urdu (target) sequences
y_train, y_temp = urdu_sequences[:train_size], urdu_sequences[train_size:]
y_test, y_val = y_temp[:test_size], y_temp[test_size:]

# Prepare decoder input data (shifted by one position)
decoder_input_data = np.zeros_like(urdu_sequences)
decoder_input_data[:, 1:] = urdu_sequences[:, :-1]
decoder_input_data[:, 0] = tokenizer_urdu.word_index.get('<start>', 0)  # Use a start token if defined


In [6]:
print(x_train.shape, y_train.shape )

(20647, 938) (20647, 938)


## Model Architecture

In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, SimpleRNN, Dense, Embedding

# Encoder model
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=vocab_size_eng, output_dim=64)(encoder_inputs)
encoder_rnn = SimpleRNN(64, return_state=True)
encoder_outputs, state_h = encoder_rnn(encoder_embedding)
encoder_states = [state_h]  # For SimpleRNN, we only need the hidden state

# Decoder model
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size_urdu, output_dim=64)(decoder_inputs)
decoder_rnn = SimpleRNN(64, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_rnn(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_urdu, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)



In [8]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


# Print the model summary
model.summary()

In [11]:
# Train the model
model.fit(
    [x_train, decoder_input_data[:train_size]], y_train,
    epochs=50,
    validation_data=([x_val, decoder_input_data[train_size:train_size + len(x_val)]], y_val)
)

Epoch 1/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 419ms/step - accuracy: 0.9828 - loss: 0.1312 - val_accuracy: 0.9823 - val_loss: 0.1688
Epoch 2/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m297s[0m 380ms/step - accuracy: 0.9839 - loss: 0.1121 - val_accuracy: 0.9817 - val_loss: 0.1768
Epoch 3/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 381ms/step - accuracy: 0.9844 - loss: 0.1047 - val_accuracy: 0.9808 - val_loss: 0.1855
Epoch 4/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 383ms/step - accuracy: 0.9844 - loss: 0.1016 - val_accuracy: 0.9810 - val_loss: 0.1887
Epoch 5/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 382ms/step - accuracy: 0.9850 - loss: 0.0951 - val_accuracy: 0.9811 - val_loss: 0.1923
Epoch 6/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 380ms/step - accuracy: 0.9852 - loss: 0.0926 - val_accuracy: 0.9807 - val_loss: 0.1981
Epoc

<keras.src.callbacks.history.History at 0x7d95b7d135e0>

## Evaluation

In [None]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics import accuracy_score

# Model to generate predictions (inference mode for encoder)
encoder_model = Model(encoder_inputs, encoder_states)

# Re-define the decoder model to use states from encoder during inference
decoder_state_input_h = Input(shape=(64,))
# decoder_state_input_c = Input(shape=(64,)) # Remove this line as SimpleRNN only uses hidden state
# decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c] # Change this line
decoder_states_inputs = [decoder_state_input_h] # SimpleRNN only needs hidden state

decoder_outputs, state_h = decoder_rnn(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs, # Updated to reflect change above
    [decoder_outputs] + decoder_states
)

# Function to convert sequence to words
def sequence_to_text(tokenizer, sequence):
    reverse_word_index = {index: word for word, index in tokenizer.word_index.items()}
    return [reverse_word_index.get(i, '<unk>') for i in sequence]

# Function to generate translation for an input sequence
def translate_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)



    # Generate the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_urdu.word_index.get('<start>', 0)

    stop_condition = False
    translated_sentence = []

    while not stop_condition:
        # states_value_reshaped = states_value[0].reshape(1, states_value[0].shape[0]) # reshape to (1, 64)

        # Pass the reshaped state as both h and c (since SimpleRNN only uses h)
        output_tokens, h = decoder_model.predict([target_seq] + [states_value]) # Change this line

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer_urdu.index_word.get(sampled_token_index, '<unk>')

        if sampled_word == '<end>' or len(translated_sentence) > max_len:
            stop_condition = True
        else:
            translated_sentence.append(sampled_word)

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

         # Update states
        states_value = [h] # reshape h for next iteration

    return ' '.join(translated_sentence)

# Evaluate BLEU score on the test set
def evaluate_bleu_score(x_test, y_test):
    bleu_scores = []
    smooth_fn = SmoothingFunction().method1  # Smoothing for short sentences

    for i in range(len(x_test)):
        input_seq = x_test[i:i+1]
        predicted_translation = translate_sequence(input_seq)
        actual_translation = sequence_to_text(tokenizer_urdu, y_test[i])

        # Remove padding tokens and the start/end tokens
        predicted_translation = [word for word in predicted_translation.split() if word not in ['<start>', '<end>', '<unk>']]
        actual_translation = [word for word in actual_translation if word not in ['<start>', '<end>', '<unk>', '0']]

        # Compute BLEU score
        bleu_score = sentence_bleu([actual_translation], predicted_translation, smoothing_function=smooth_fn)
        bleu_scores.append(bleu_score)

    return np.mean(bleu_scores)

# Calculate the accuracy for predictions on the test set
def evaluate_accuracy(x_test, y_test):
    correct_predictions = 0
    total_predictions = 0

    for i in range(len(x_test)):
        input_seq = x_test[i:i+1]
        predicted_translation = translate_sequence(input_seq)
        actual_translation = sequence_to_text(tokenizer_urdu, y_test[i])

        # Remove padding tokens and the start/end tokens
        predicted_translation = [word for word in predicted_translation.split() if word not in ['<start>', '<end>', '<unk>']]
        actual_translation = [word for word in actual_translation if word not in ['<start>', '<end>', '<unk>', '0']]

        # Update correct and total predictions
        correct_predictions += sum(1 for p, a in zip(predicted_translation, actual_translation) if p == a)
        total_predictions += len(actual_translation)  # Consider the actual length as the base

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return accuracy

# Evaluate BLEU score and accuracy on the test set
bleu_score = evaluate_bleu_score(x_test, y_test)
accuracy = evaluate_accuracy(x_test, y_test)

# Print the results
print(f'BLEU Score on Test Set: {bleu_score:.4f}')
print(f'Accuracy on Test Set: {accuracy:.4f}')


### Translating Sentences

#### Defining the test function

In [None]:
def translate(text):
    # Tokenize and pad the input text
    sequence = tokenizer_eng.texts_to_sequences([text])
    sequence = pad_sequences(sequence, maxlen=max_len, padding='post')

    # Get the encoder states (hidden states) from the encoder
    states_value = encoder_model.predict(sequence)

    # Prepare the target sequence with the <start> token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_urdu.word_index['<start>']

    # Initialize variables
    stop_condition = False
    translated_text = ''

    # Loop to generate the translation iteratively
    while not stop_condition:
        # Predict the next word in the sequence
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get the index of the predicted word
        predicted_idx = np.argmax(output_tokens[0, -1, :])
        predicted_word = tokenizer_urdu.index_word.get(predicted_idx, '')

        # Append the predicted word to the translated text
        if predicted_word == '<end>':
            stop_condition = True
        else:
            translated_text += ' ' + predicted_word

        # Update the target sequence with the predicted word
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = predicted_idx

        # Update states
        states_value = [h, c]

    return translated_text.strip()


#### First Five Sentences

In [None]:
for sentence in df.head()['SENTENCES']:
  print(sentence)
  print(translate(sentence))

## Saving the model

In [16]:
model.save("english_urdu_RNN_f219258.keras")