In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**QUESTION 4**

For the RNN-based translation model, Train the model twice:
1. Using randomly initialized embeddings for the English input.
2. Using pre-trained GloVe embeddings for the English input.
https://nlp.stanford.edu/projects/glove/
3. Compare the results between the two versions.
4. Discuss the impact of using pre-trained embeddings on model performance and
training time.

In [2]:
!pip install -q nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
nltk.download('punkt')
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, LSTM, Dense, Bidirectional, GRU, MultiHeadAttention, LayerNormalization, Dropout, Add
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Load the files
with open('/content/drive/MyDrive/Colab Notebooks/english-corpus.txt', 'r', encoding='utf-8') as f:
    english_lines = f.read().splitlines()
with open('/content/drive/MyDrive/Colab Notebooks/urdu-corpus.txt', 'r', encoding='utf-8') as f:
    urdu_lines = f.read().splitlines()

# Create DataFrame
df = pd.DataFrame({'english': english_lines, 'urdu': urdu_lines}).dropna()

# Add <sos> and <eos> tokens to Urdu (target)
df['urdu'] = df['urdu'].apply(lambda x: '<sos> ' + x + ' <eos>')

# Tokenization
eng_tokenizer = Tokenizer(oov_token='<OOV>', filters='')
urdu_tokenizer = Tokenizer(oov_token='<OOV>', filters='')

eng_tokenizer.fit_on_texts(df['english'])
urdu_tokenizer.fit_on_texts(df['urdu'])

# Convert to sequences
input_seq = eng_tokenizer.texts_to_sequences(df['english'])
target_seq = urdu_tokenizer.texts_to_sequences(df['urdu'])


# Padding lengths
max_input_len = max(len(seq) for seq in input_seq)
max_target_len = max(len(seq) for seq in target_seq)

# Pad encoder input
encoder_input = pad_sequences(input_seq, maxlen=max_input_len, padding='post')

# New decoder input/target processing
def preprocess_decoder_data(tokenized_urdu):
    decoder_input = []
    decoder_target = []

    for seq in tokenized_urdu:
        inp = seq[:-1]  # remove <eos>
        tar = seq[1:]   # remove <sos>
        decoder_input.append(inp + [0] * (max_target_len - len(inp)))
        decoder_target.append(tar + [0] * (max_target_len - len(tar)))

    return np.array(decoder_input), np.array(decoder_target)

# Apply preprocessing
decoder_input, decoder_target = preprocess_decoder_data(target_seq)



In [4]:
VOCAB_SIZE_EN = len(eng_tokenizer.word_index) + 1
VOCAB_SIZE_UR = len(urdu_tokenizer.word_index) + 1
EMBEDDING_DIM = 300
UNITS = 512
EPOCHS = 50
BATCH_SIZE = 64

In [5]:
def train_model(model, model_name):
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.fit(
        [encoder_input, decoder_input], decoder_target,
        batch_size=BATCH_SIZE, epochs=EPOCHS,
        validation_split=0.2
    )
    model.save(f"{model_name}_model.h5")

def decode_sequence_greedy(model, input_seq):
    target_seq = np.zeros((1, max_target_len))
    target_seq[0, 0] = urdu_tokenizer.word_index['<sos>']

    decoded_sentence = []
    for i in range(1, max_target_len):
        output_tokens = model.predict([input_seq, target_seq], verbose=0)
        sampled_token_index = np.argmax(output_tokens[0, i-1, :])
        sampled_word = urdu_tokenizer.index_word.get(sampled_token_index, '')
        if sampled_word == '<eos>' or sampled_word == '':
            break
        decoded_sentence.append(sampled_word)
        target_seq[0, i] = sampled_token_index
    return ' '.join(decoded_sentence)


from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smoothie = SmoothingFunction().method4

def evaluate_bleu(model, sample_indices):
    bleu_scores = []

    for idx in sample_indices:
        input_seq = encoder_input[idx:idx+1]
        true_output = df['urdu'].iloc[idx].replace('<sos>', '').replace('<eos>', '').strip()
        pred_text = decode_sequence_greedy(model, input_seq)

        bleu = sentence_bleu([true_output.split()], pred_text.split(), smoothing_function=smoothie)
        bleu_scores.append(bleu)

        print(f"\n🔹Input: {df['english'].iloc[idx]}")
        print(f"🔹Target: {true_output}")
        print(f"🔹Predicted: {pred_text}")
        print(f"🔹BLEU Score: {bleu:.4f}")

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    print(f"\n✨ Average BLEU Score: {avg_bleu:.4f}")

In [6]:
def load_glove_embeddings(glove_path, embedding_dim):
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_path = '/content/drive/MyDrive/Colab Notebooks/glove.6B.300d.txt'
EMBEDDING_DIM = 300  # Update global EMBEDDING_DIM if needed

glove_embeddings = load_glove_embeddings(glove_path, EMBEDDING_DIM)
embedding_matrix = np.zeros((VOCAB_SIZE_EN, EMBEDDING_DIM))


In [7]:
embedding_matrix = np.zeros((VOCAB_SIZE_EN, 300))  # 100 for glove.27B.100d

for word, i in eng_tokenizer.word_index.items():
    embedding_vector = glove_embeddings.get(word.lower())
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [8]:
from tensorflow.keras.layers import Input, Embedding, LSTM, SimpleRNN, Bidirectional, Dense, Concatenate, Attention, AdditiveAttention
from tensorflow.keras.models import Model

def build_seq2seq_with_attention(pretrained_embedding=None):
    # Encoder
    encoder_inputs = Input(shape=(max_input_len,))

    if pretrained_embedding is not None:
        encoder_embedding = Embedding(
            input_dim=VOCAB_SIZE_EN,
            output_dim=pretrained_embedding.shape[1],
            weights=[pretrained_embedding],
            input_length=max_input_len,
            trainable=True
        )(encoder_inputs)
    else:
        encoder_embedding = Embedding(VOCAB_SIZE_EN, EMBEDDING_DIM)(encoder_inputs)

    # Encoder LSTM
    encoder_outputs, state_h, state_c = LSTM(UNITS, return_sequences=True, return_state=True)(encoder_embedding)

    # Decoder
    decoder_inputs = Input(shape=(max_target_len,))
    decoder_embedding = Embedding(VOCAB_SIZE_UR, EMBEDDING_DIM)(decoder_inputs)
    decoder_lstm = LSTM(UNITS, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

    # Attention
    attention = AdditiveAttention()
    attention_result = attention([decoder_outputs, encoder_outputs])  # context vector

    # Concatenate context with decoder outputs
    concat = Concatenate(axis=-1)([decoder_outputs, attention_result])

    # Final Dense layer
    dense = Dense(VOCAB_SIZE_UR, activation='softmax')
    output = dense(concat)

    model = Model([encoder_inputs, decoder_inputs], output)
    return model


In [26]:
# GloVe: Pretrained Embeddings
rnn_model_glove = build_seq2seq_with_attention(pretrained_embedding=embedding_matrix)
train_model(rnn_model_glove, 'rnn_glove')
evaluate_bleu(rnn_model_glove, [10, 20, 30])



Epoch 1/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 81ms/step - accuracy: 0.7485 - loss: 2.0060 - val_accuracy: 0.8169 - val_loss: 1.1126
Epoch 2/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 76ms/step - accuracy: 0.8315 - loss: 0.9836 - val_accuracy: 0.8581 - val_loss: 0.8076
Epoch 3/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 75ms/step - accuracy: 0.8708 - loss: 0.6530 - val_accuracy: 0.8823 - val_loss: 0.6387
Epoch 4/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 76ms/step - accuracy: 0.9039 - loss: 0.4195 - val_accuracy: 0.8958 - val_loss: 0.5585
Epoch 5/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 75ms/step - accuracy: 0.9315 - loss: 0.2694 - val_accuracy: 0.9023 - val_loss: 0.5297
Epoch 6/50
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 76ms/step - accuracy: 0.9520 - loss: 0.1802 - val_accuracy: 0.9065 - val_loss: 0.5154
Epoch 7/50
[1m3




🔹Input: zain was hesitant
🔹Target: زین ہچکچا رہا تھا
🔹Predicted: زین ہچکچا رہا تھا
🔹BLEU Score: 1.0000

🔹Input: did zain give you that
🔹Target: زین نے تمہیں وہ دیا
🔹Predicted: زین نے تمہیں وہ دیا
🔹BLEU Score: 1.0000

🔹Input: i come from china
🔹Target: میں چین سے آیا ہوں۔
🔹Predicted: میں چین سے آیا ہوں۔
🔹BLEU Score: 1.0000

✨ Average BLEU Score: 1.0000
