<a href="https://colab.research.google.com/github/Holy-Morphism/Urdu-RNN/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Implementing Many-to-Many RNN for English-to-Urdu Language Translation and Exploring Its Limitations**

# **Part 3:** Resolving RNN Limitations Using Long Short-Term Memory (LSTM)

## Data Preparation:

### Loading the Data:

In [None]:
pip install -q openpyxl

In [None]:
import pandas as pd

df = pd.read_excel('./parallel-corpus.xlsx')

# Keep only the first two columns
df = df.iloc[:, :2]

df.rename(columns = {'SENTENCES ':'SENTENCES'}, inplace = True)

df = df.dropna()
df = df[df['SENTENCES'].str.len() > 3]

df.head()

Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Tokenize the sentences
tokenizer_eng = Tokenizer()
tokenizer_urdu = Tokenizer()

# Convert the 'SENTENCES' column to string type before fitting the tokenizer
df['SENTENCES'] = df['SENTENCES'].astype(str)
# Convert the 'MEANING' column to string type before fitting the tokenizer
df['MEANING'] = df['MEANING'].astype(str)

tokenizer_eng.fit_on_texts(df['SENTENCES'])
tokenizer_urdu.fit_on_texts(df['MEANING'])

eng_sequences = tokenizer_eng.texts_to_sequences(df['SENTENCES'])
urdu_sequences = tokenizer_urdu.texts_to_sequences(df['MEANING'])

# Pad sequences
max_len_eng = max(len(seq) for seq in eng_sequences)
max_len_urdu = max(len(seq) for seq in urdu_sequences)

max_len = max(max_len_eng,max_len_urdu)

eng_sequences = pad_sequences(eng_sequences, maxlen=max_len, padding='post')
urdu_sequences = pad_sequences(urdu_sequences, maxlen=max_len, padding='post')

# Vocabulary sizes
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_urdu = len(tokenizer_urdu.word_index) + 1

In [None]:
# Split the data into training, validation, and test sets
train_size = int(len(eng_sequences) * 0.7)
test_size = int(len(eng_sequences) * 0.15)

# For English (input) sequences
x_train, x_temp = eng_sequences[:train_size], eng_sequences[train_size:]
x_test, x_val = x_temp[:test_size], x_temp[test_size:]

# For Urdu (target) sequences
y_train, y_temp = urdu_sequences[:train_size], urdu_sequences[train_size:]
y_test, y_val = y_temp[:test_size], y_temp[test_size:]

# Prepare decoder input data (shifted by one position)
decoder_input_data = np.zeros_like(urdu_sequences)
decoder_input_data[:, 1:] = urdu_sequences[:, :-1]
decoder_input_data[:, 0] = tokenizer_urdu.word_index.get('<start>', 0)  # Use a start token if defined


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Encoder model
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=vocab_size_eng, output_dim=64)(encoder_inputs)
encoder_lstm = LSTM(64, return_state=True)
# The LSTM layer returns 3 values when return_state=True: output, hidden state, cell state
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]  # Include both hidden and cell states for the decoder

# Decoder model
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size_urdu, output_dim=64)(decoder_inputs)
decoder_lstm = LSTM(64, return_sequences=True, return_state=True)
# Pass both hidden and cell states to the decoder
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_urdu, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
print(x_train.shape, y_train.shape)


(20647, 938) (20647, 938)


In [None]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Train the model
model.fit(
    [x_train, decoder_input_data[:train_size]], y_train,
    epochs=50,
    validation_data=([x_val, decoder_input_data[train_size:train_size + len(x_val)]], y_val)
)

Epoch 1/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 468ms/step - accuracy: 0.9718 - loss: 2.2199 - val_accuracy: 0.9858 - val_loss: 0.1538
Epoch 2/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m319s[0m 469ms/step - accuracy: 0.9827 - loss: 0.1233 - val_accuracy: 0.9835 - val_loss: 0.1603
Epoch 3/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m305s[0m 472ms/step - accuracy: 0.9834 - loss: 0.1133 - val_accuracy: 0.9819 - val_loss: 0.1679
Epoch 4/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 473ms/step - accuracy: 0.9840 - loss: 0.1068 - val_accuracy: 0.9812 - val_loss: 0.1761
Epoch 5/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 469ms/step - accuracy: 0.9842 - loss: 0.1036 - val_accuracy: 0.9811 - val_loss: 0.1814
Epoch 6/50
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 470ms/step - accuracy: 0.9846 - loss: 0.0999 - val_accuracy: 0.9810 - val_loss: 0.1875
Epoc

<keras.src.callbacks.history.History at 0x7e9e4dfbf370>

In [9]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics import accuracy_score

# Model to generate predictions (inference mode for encoder)
encoder_model = Model(encoder_inputs, encoder_states)

# Re-define the decoder model to use states from encoder during inference
decoder_state_input_h = Input(shape=(64,))
decoder_state_input_c = Input(shape=(64,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

# Function to convert sequence to words
def sequence_to_text(tokenizer, sequence):
    reverse_word_index = {index: word for word, index in tokenizer.word_index.items()}
    return [reverse_word_index.get(i, '<unk>') for i in sequence]

# Function to generate translation for an input sequence
def translate_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_urdu.word_index.get('<start>', 0)

    stop_condition = False
    translated_sentence = []

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer_urdu.index_word.get(sampled_token_index, '<unk>')

        if sampled_word == '<end>' or len(translated_sentence) > max_len:
            stop_condition = True
        else:
            translated_sentence.append(sampled_word)

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return ' '.join(translated_sentence)

# Evaluate BLEU score on the test set
def evaluate_bleu_score(x_test, y_test):
    bleu_scores = []
    smooth_fn = SmoothingFunction().method1  # Smoothing for short sentences

    for i in range(len(x_test)):
        input_seq = x_test[i:i+1]
        predicted_translation = translate_sequence(input_seq)
        actual_translation = sequence_to_text(tokenizer_urdu, y_test[i])

        # Remove padding tokens and the start/end tokens
        predicted_translation = [word for word in predicted_translation.split() if word not in ['<start>', '<end>', '<unk>']]
        actual_translation = [word for word in actual_translation if word not in ['<start>', '<end>', '<unk>', '0']]

        # Compute BLEU score
        bleu_score = sentence_bleu([actual_translation], predicted_translation, smoothing_function=smooth_fn)
        bleu_scores.append(bleu_score)

    return np.mean(bleu_scores)

# Calculate the accuracy for predictions on the test set
def evaluate_accuracy(x_test, y_test):
    correct_predictions = 0
    total_predictions = 0

    for i in range(len(x_test)):
        input_seq = x_test[i:i+1]
        predicted_translation = translate_sequence(input_seq)
        actual_translation = sequence_to_text(tokenizer_urdu, y_test[i])

        # Remove padding tokens and the start/end tokens
        predicted_translation = [word for word in predicted_translation.split() if word not in ['<start>', '<end>', '<unk>']]
        actual_translation = [word for word in actual_translation if word not in ['<start>', '<end>', '<unk>', '0']]

        # Update correct and total predictions
        correct_predictions += sum(1 for p, a in zip(predicted_translation, actual_translation) if p == a)
        total_predictions += len(actual_translation)  # Consider the actual length as the base

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return accuracy

# Evaluate BLEU score and accuracy on the test set
bleu_score = evaluate_bleu_score(x_test, y_test)
accuracy = evaluate_accuracy(x_test, y_test)

# Print the results
print(f'BLEU Score on Test Set: {bleu_score:.4f}')
print(f'Accuracy on Test Set: {accuracy:.4f}')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step

KeyboardInterrupt: 

In [None]:
# Translate function
def translate(text):
    sequence = tokenizer_eng.texts_to_sequences([text])
    sequence = pad_sequences(sequence, maxlen=max_len_eng, padding='post')
    prediction = model.predict(sequence)
    predicted_sequence = np.argmax(prediction, axis=-1)
    translated_text = ' '.join([tokenizer_urdu.index_word[idx] for idx in predicted_sequence[0] if idx != 0])
    return translated_text

In [None]:
for sentence in df.head()['SENTENCES']:
  print(sentence)
  print(translate(sentence))

In [10]:
model.save("english_urdu_LSTM_f219258.keras")