In [4]:
import pandas as pd

# Absolute location of the parallel-corpus workbook inside the Kaggle input mount.
data_path = '/kaggle/input/urdu-roman/urdu_roman.xlsx'

# Read the Excel file into a DataFrame. Later cells unpack every row into
# exactly two fields (Urdu, Roman Urdu), so the sheet is expected to have two columns.
df = pd.read_excel(data_path)



In [5]:
# Create a list of [Urdu, Roman Urdu] pairs: one inner list per DataFrame row.
data_array = df.values.tolist()

# Print first 5 elements to check
print(data_array[:5])

[['یوں تو نہ تیرے جسم میں ہیں زینہار ہاتھ', 'yuun to na tere jism men hain zinhar haath'], ['دینے کے اے کریم مگر ہیں ہزار ہاتھ', 'dene ke ai karim magar hain hazar haath'], ['انگڑائیوں میں پھیلتے ہیں بار بار ہاتھ', 'angdaiyon men phailte hain baar baar haath'], ['شیشہ کی سمت بڑھتے ہیں بے اختیار ہاتھ', 'shisha ki samt badhte hain be-ikhtiyar haath'], ['ڈوبے ہیں ترک سعی سے افسوس تو یہ ہے', 'duube hain tark-e-sai se afsos to ye hai']]


In [6]:
def _longest_by_word_count(texts):
    """Return ``(word_count, text)`` for the text with the most whitespace-separated words.

    The scan starts from ``(0, "")`` and only replaces the current best on a
    strictly larger count, so ties keep the earliest text and an empty input
    (or one where every text has zero words) yields ``(0, "")``.
    """
    best_count, best_text = 0, ""
    for text in texts:
        count = len(text.split())
        if count > best_count:
            best_count, best_text = count, text
    return best_count, best_text


# Each row is unpacked into exactly two fields: the Urdu line and its Roman Urdu line.
max_urdu_words, max_urdu_text = _longest_by_word_count(
    [urdu_line for urdu_line, _roman_line in data_array]
)
max_roman_urdu_words, max_roman_urdu_text = _longest_by_word_count(
    [roman_line for _urdu_line, roman_line in data_array]
)

print(f"Maximum Urdu word count: {max_urdu_words} (Text: {max_urdu_text})")
print(f"Maximum Roman Urdu word count: {max_roman_urdu_words} (Text: {max_roman_urdu_text})")

Maximum Urdu word count: 25 (Text: اب مجھ سے ہو تو ہو بھی کیا ہے ساتھ وہ تو وہ بھی کیا اک بے ہنر اک بے ثمر میں اور مری آوارگی)
Maximum Roman Urdu word count: 23 (Text: ye dil hi tha jo sah gaya vo baat aisi kah gaya kahne ko phir kya rah gaya ashkon ka dariya bah gaya)


In [7]:
# import re
# import json
# from collections import Counter, defaultdict

# # Flatten sentences
# all_sentences = [sent for pair in data_array for sent in pair]

# # -------------------------------
# # Initialize vocab
# # -------------------------------
# vocab = {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3}
# vocab_size = 8000
# counter = Counter()

# # Step 2a: Split words into characters initially
# def split_to_chars(word):
#     return list(word)

# for sentence in all_sentences:
#     words = sentence.strip().split()
#     for word in words:
#         chars = split_to_chars(word)
#         chars.append('</w>')  # End of word symbol
#         counter.update([''.join(chars)])

# # -------------------------------
# # Build WordPiece merges
# # -------------------------------
# def get_stats(counter):
#     pairs = defaultdict(int)
#     for word, freq in counter.items():
#         symbols = word.split()
#         for i in range(len(symbols)-1):
#             pairs[symbols[i], symbols[i+1]] += freq
#     return pairs

# def merge_vocab(pair, counter):
#     bigram = ' '.join(pair)
#     new_counter = {}
#     for word in counter:
#         w_out = word.replace(bigram, ''.join(pair))
#         new_counter[w_out] = counter[word]
#     return new_counter

# # Start with each word as chars with spaces
# token_counter = {}
# for word in counter:
#     token_counter[' '.join(list(word))] = counter[word]

# while len(vocab) < vocab_size:
#     pairs = get_stats(token_counter)
#     if not pairs:
#         break
#     best = max(pairs, key=pairs.get)
#     token_counter = merge_vocab(best, token_counter)
#     token = ''.join(best)
#     if token not in vocab:
#         vocab[token] = len(vocab)

# # -------------------------------
# # Save tokenizer
# # -------------------------------
# with open("wordpiece_vocab.json", "w", encoding="utf-8") as f:
#     json.dump(vocab, f, ensure_ascii=False, indent=2)

# print("Tokenizer saved. Vocab size:", len(vocab))


In [8]:
import json

# Load the previously saved subword vocabulary (token string -> integer id).
# NOTE(review): "wordpiece_vocab.json" must already exist in the working
# directory. The only visible code that writes it is the commented-out cell
# above, so on a fresh kernel that cell has to be run (or the file supplied) first.
with open("wordpiece_vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)

# Reverse lookup: integer id -> token string.
inv_vocab = {v: k for k, v in vocab.items()}
# Ids of the special tokens; a KeyError here means the vocabulary lacks that token.
unk_id = vocab["<unk>"]
pad_id = vocab["<pad>"]
sos_id = vocab["<sos>"]
eos_id = vocab["<eos>"]


In [9]:
def wordpiece_tokenize(word, vocab):
    """Split one word into subword ids with greedy longest-match-first lookup.

    The word is expanded into its characters followed by a single ``'</w>'``
    end-of-word symbol. From each position, the longest run of symbols whose
    concatenation is a key of ``vocab`` is emitted as one id. If no run starting
    at a position is in ``vocab``, the unknown-token id is emitted and that one
    symbol is skipped.

    Args:
        word: The word to tokenize (a string without whitespace).
        vocab: Mapping from subword string to integer id. It must contain
            ``"<unk>"`` whenever some symbol of ``word`` cannot be matched.

    Returns:
        list[int]: The subword ids, in order.

    Raises:
        KeyError: If an unmatched symbol is met and ``vocab`` has no ``"<unk>"``.
    """
    symbols = list(word)
    symbols.append('</w>')  # kept as ONE symbol so it can only match as a whole
    total = len(symbols)
    ids = []
    start = 0
    while start < total:
        # Try the longest candidate first and shrink it one symbol at a time.
        for end in range(total, start, -1):
            piece = ''.join(symbols[start:end])
            if piece in vocab:
                ids.append(vocab[piece])
                start = end
                break
        else:
            # Resolve the unknown id from the vocabulary that was passed in, so
            # the function does not depend on a notebook-level `unk_id` global.
            ids.append(vocab["<unk>"])
            start += 1
    return ids

def sentence_to_ids(sentence, vocab):
    """Encode a sentence as subword ids followed by the end-of-sentence id.

    The sentence is split on whitespace, every word is tokenized with
    ``wordpiece_tokenize`` and the resulting ids are concatenated. No
    start-of-sentence id is added.

    Args:
        sentence: The text to encode.
        vocab: Mapping from subword string to integer id; must contain ``"<eos>"``.

    Returns:
        list[int]: Subword ids of all words, terminated by ``vocab["<eos>"]``.

    Raises:
        KeyError: If ``vocab`` has no ``"<eos>"`` entry.
    """
    ids = []
    for word in sentence.strip().split():
        ids.extend(wordpiece_tokenize(word, vocab))
    # Take the terminator from the vocabulary that was passed in instead of the
    # notebook-level `eos_id` global, so the function is self-contained.
    ids.append(vocab["<eos>"])
    return ids

# Encode the whole corpus with the single shared vocabulary: column 0 (Urdu)
# gives the source id lists, column 1 (Roman Urdu) the target id lists.
# Every list ends with the <eos> id appended by sentence_to_ids.
urdu_ids = [sentence_to_ids(pair[0], vocab) for pair in data_array]
roman_ids = [sentence_to_ids(pair[1], vocab) for pair in data_array]


In [10]:
def pad_sequences(sequences, pad_id, max_len=None):
    """Right-pad (and, when needed, truncate) id sequences to one common length.

    Args:
        sequences: Iterable of token-id sequences.
        pad_id: Id appended to sequences shorter than ``max_len``.
        max_len: Target length. When ``None``, the length of the longest
            sequence is used (0 if there are no sequences).

    Returns:
        list[list[int]]: New lists, each exactly ``max_len`` long. The inputs
        are not modified.

    Note:
        A later cell imports ``pad_sequences`` from
        ``tensorflow.keras.preprocessing.sequence``, which rebinds this name.
    """
    sequences = list(sequences)  # allow a generator; it is traversed twice below
    if max_len is None:
        # `default=0` keeps an empty corpus from raising inside max().
        max_len = max((len(seq) for seq in sequences), default=0)
    # Slicing caps over-long sequences so the result is always rectangular;
    # a negative repeat count yields [] and therefore adds no padding.
    return [
        list(seq[:max_len]) + [pad_id] * (max_len - len(seq))
        for seq in sequences
    ]

# The longest encoded sequence on each side fixes the padded width of that side.
max_len_input = max(len(seq) for seq in urdu_ids)
max_len_output = max(len(seq) for seq in roman_ids)

# Right-pad the source and target id lists with pad_id so each side is rectangular.
encoder_input = pad_sequences(urdu_ids, pad_id, max_len_input)
decoder_output = pad_sequences(roman_ids, pad_id, max_len_output)
# print(encoder_input)
# print(decoder_output)

In [11]:
import torch

# Convert the padded id lists to int64 tensors. The same names are rebound, so
# after this cell encoder_input / decoder_output are tensors, not lists.
encoder_input = torch.tensor(encoder_input, dtype=torch.long)
decoder_output = torch.tensor(decoder_output, dtype=torch.long)
# Print both tensors as a quick sanity check.
print(encoder_input)
print(decoder_output)

tensor([[ 388,   77,   74,  ...,    0,    0,    0],
        [2457,   62,  264,  ...,    0,    0,    0],
        [3588, 4297,  388,  ...,    0,    0,    0],
        ...,
        [ 125,  403, 1017,  ...,    0,    0,    0],
        [2445, 1693, 4181,  ...,    0,    0,    0],
        [ 186, 1466,    7,  ...,    0,    0,    0]])
tensor([[ 253,   97,   41,  ...,    0,    0,    0],
        [2458,   63,  265,  ...,    0,    0,    0],
        [ 257,    1,   72,  ...,    0,    0,    0],
        ...,
        [ 117,  404,  613,  ...,    0,    0,    0],
        [ 650,  238,  225,  ...,    0,    0,    0],
        [ 187,  559,  140,  ...,    0,    0,    0]])


In [12]:
import torch.nn as nn

class Seq2Seq(nn.Module):
    """LSTM encoder-decoder that shares one embedding table for source and target.

    The encoder is a bidirectional LSTM. The final forward and backward states
    of its last layer are concatenated and copied into every decoder layer as
    the initial state, which is why the decoder hidden size is ``hidden_dim * 2``.
    The decoder receives the whole embedded ``tgt`` sequence in a single call.

    Args:
        vocab_size: Number of rows in the embedding table and width of the logits.
        embed_dim: Embedding dimension.
        hidden_dim: Hidden size of each encoder direction.
        pad_idx: Id passed to the embedding as ``padding_idx``.
        enc_layers: Number of stacked encoder layers (default 2).
        dec_layers: Number of stacked decoder layers (default 4).
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, pad_idx,
                 enc_layers=2, dec_layers=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)

        # Encoder: bidirectional LSTM
        self.encoder = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=enc_layers,
            batch_first=True,
            bidirectional=True
        )

        # Decoder: unidirectional LSTM
        self.decoder = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim*2,  # because encoder is bidirectional
            num_layers=dec_layers,
            batch_first=True
        )

        # Fully connected layer to vocab
        self.fc = nn.Linear(hidden_dim*2, vocab_size)

    def forward(self, src, tgt):
        """Return logits of shape ``[batch, tgt_len, vocab_size]``.

        Args:
            src: Source token ids, ``[batch, src_len]`` (batch-first).
            tgt: Target token ids, ``[batch, tgt_len]``, fed to the decoder as-is.
        """
        # Embed input sequences
        src_embed = self.embedding(src)
        tgt_embed = self.embedding(tgt)

        # Encoder: only the final states are used, not the per-step outputs.
        _, (hidden, cell) = self.encoder(src_embed)

        # hidden[-2] / hidden[-1] are the last layer's forward / backward states.
        # Tile the concatenated state once per decoder layer; the count is read
        # from the decoder so it cannot drift from the layer count set above.
        n_dec_layers = self.decoder.num_layers
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1).unsqueeze(0).repeat(n_dec_layers, 1, 1)
        cell = torch.cat([cell[-2], cell[-1]], dim=1).unsqueeze(0).repeat(n_dec_layers, 1, 1)

        # Decoder
        dec_out, _ = self.decoder(tgt_embed, (hidden, cell))

        # Output logits
        logits = self.fc(dec_out)
        return logits

# The embedding table and the output layer are both sized to the loaded vocabulary.
vocab_size = len(vocab)
embed_dim = 128
hidden_dim = 256  # per encoder direction; the decoder runs at hidden_dim * 2

# Build the model with pad_id as the embedding's padding index and show its layout.
model = Seq2Seq(vocab_size, embed_dim, hidden_dim, pad_id)
print(model)


Seq2Seq(
  (embedding): Embedding(8000, 128, padding_idx=0)
  (encoder): LSTM(128, 256, num_layers=2, batch_first=True, bidirectional=True)
  (decoder): LSTM(128, 512, num_layers=4, batch_first=True)
  (fc): Linear(in_features=512, out_features=8000, bias=True)
)


In [None]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare data with special tokens

# Wrap every sentence in explicit start/end markers.
urdu_sentences = ["<sos> " + row[0] + " <eos>" for row in data_array]
roman_sentences = ["<sos> " + row[1] + " <eos>" for row in data_array]

# Split dataset: 50% train, 25% val, 25% test.
# The first cut keeps half of the pairs for training; the held-out half is then
# halved again into validation and test. (Previously the first half was bound to
# `test`, which produced 25% train / 25% val / 50% test.)
train, holdout = train_test_split(data_array, test_size=0.5, random_state=12)
val, test = train_test_split(holdout, test_size=0.5, random_state=12)

train = np.array(train)
val   = np.array(val)
test  = np.array(test)

# Step 3: Roman Urdu tokenizer (WordPiece from scratch)

def build_wordpiece_vocab(sentences, vocab_size=8000, min_freq=2):
    """Build a subword vocabulary from the most frequent substrings of the corpus.

    Every whitespace-separated word gets an ``"</w>"`` suffix, and every
    contiguous substring of the suffixed word is counted. Substrings are then
    added in order of decreasing count (ties keep first-seen order) until the
    vocabulary holds ``vocab_size`` entries or the counts drop below ``min_freq``.

    Args:
        sentences: Iterable of sentence strings.
        vocab_size: Maximum number of entries, including the four special tokens.
        min_freq: Minimum count a substring needs to be included.

    Returns:
        dict[str, int]: Token -> id, with ``<pad>``=0, ``<unk>``=1, ``<sos>``=2,
        ``<eos>``=3 and subwords numbered consecutively from 4.
    """
    # Initialize vocab with special tokens
    vocab = {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3}

    # Count every contiguous substring of every end-marked word.
    freq = {}
    for sent in sentences:
        for word in sent.split():
            word = word + "</w>"  # End-of-word marker
            length = len(word)
            for i in range(length):
                for j in range(i + 1, length + 1):
                    subword = word[i:j]
                    freq[subword] = freq.get(subword, 0) + 1

    # Keep frequent subwords. sorted() is stable, so equal counts stay in
    # first-seen order.
    for subword, count in sorted(freq.items(), key=lambda item: -item[1]):
        # Check the size limit BEFORE inserting so the result never exceeds
        # vocab_size. Counts are descending, so nothing after the first
        # too-rare substring can qualify either.
        if len(vocab) >= vocab_size or count < min_freq:
            break
        if subword not in vocab:
            vocab[subword] = len(vocab)

    return vocab

# Build the Roman Urdu subword vocabulary from the "<sos> ... <eos>" sentences.
# roman_sentences is derived from all of data_array, i.e. it also contains the
# validation and test pairs.
roman_word2idx = build_wordpiece_vocab(roman_sentences, vocab_size=8000)

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

def _as_long_tensor(values):
    """Copy an array-like of token ids into a ``torch.long`` tensor."""
    return torch.tensor(values, dtype=torch.long)


# Convert the encoded splits to PyTorch tensors.
# NOTE(review): trainX/trainY/valX/valY/testX/testY are not defined in any cell
# visible here; presumably they are NumPy arrays of padded ids. Confirm.
trainX_t, trainY_t = _as_long_tensor(trainX), _as_long_tensor(trainY)
valX_t, valY_t = _as_long_tensor(valX), _as_long_tensor(valY)
testX_t, testY_t = _as_long_tensor(testX), _as_long_tensor(testY)

# Pair each source tensor with its target tensor.
train_dataset = TensorDataset(trainX_t, trainY_t)
val_dataset = TensorDataset(valX_t, valY_t)
test_dataset = TensorDataset(testX_t, testY_t)

# Only the training loader shuffles; validation and test keep dataset order.
batch_size = 32  # you can adjust
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Peek at the first batch of every loader and report its tensor shapes.
for split_label, split_loader in (
    ("Train", train_loader),
    ("Validation", val_loader),
    ("Test", test_loader),
):
    # for/break (rather than next(iter(...))) prints nothing for an empty loader.
    for batchX, batchY in split_loader:
        print(f"{split_label} batch:")
        print("  X shape:", batchX.shape)  # expected [batch_size, max_src_len]
        print("  Y shape:", batchY.shape)  # expected [batch_size, max_tgt_len]
        break


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import math
from nltk.translate.bleu_score import corpus_bleu
import numpy as np

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
# NOTE(review): urdu_vocab_size and roman_vocab_size are not defined in any cell
# visible here. Confirm where they are set before running on a fresh kernel.
SRC_VOCAB = urdu_vocab_size
TRG_VOCAB = roman_vocab_size
EMB_SIZE = 256
ENC_HID = 256
DEC_HID = 256
ENC_LAYERS = 2  
DEC_LAYERS = 4  
DROPOUT = 0.2
# NOTE(review): BATCH_SIZE is not read by the visible code; the DataLoaders above
# are built from the separate lowercase `batch_size`.
BATCH_SIZE = 32
LR = 1e-4
EPOCHS = 5
TEACHER_FORCING_RATIO = 0.9
# Padding id: used as the embeddings' padding_idx, as the loss ignore_index and
# as the decoder's first input token.
PAD_IDX = 0

# Reverse mapping for tokenizer to convert predicted ids to tokens
# NOTE(review): roman_tokenizer is not defined in any visible cell; the
# `.word_index` attribute suggests a Keras Tokenizer. Confirm.
index2word_trg = {i: w for w, i in roman_tokenizer.word_index.items()}
index2word_trg[0] = ""  # pad

class Encoder(nn.Module):
    """Bidirectional multi-layer LSTM encoder that also builds the decoder's initial state.

    The final forward and backward states of the last LSTM layer are
    concatenated, projected from ``hid_dim * 2`` to ``hid_dim`` through a linear
    layer with tanh, and copied once per decoder layer. The module reads the
    module-level constants ``PAD_IDX`` (embedding padding index) and
    ``DEC_LAYERS`` (number of copies of the initial state).

    Args:
        input_dim: Source vocabulary size.
        emb_dim: Embedding dimension.
        hid_dim: Hidden size per direction.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the embeddings and, when
            ``n_layers > 1``, between LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=PAD_IDX)
        self.rnn = nn.LSTM(
            emb_dim,
            hid_dim,
            num_layers=n_layers,
            bidirectional=True,
            batch_first=True,
            # nn.LSTM only applies dropout between layers, so it is disabled
            # for a single-layer stack.
            dropout=dropout if n_layers > 1 else 0.0
        )
        self.dropout = nn.Dropout(dropout)
        # Project the concatenated (forward, backward) state down to hid_dim.
        self.fc_hidden = nn.Linear(hid_dim * 2, hid_dim)
        self.fc_cell = nn.Linear(hid_dim * 2, hid_dim)

    def forward(self, src, src_lengths=None):
        """Encode a batch of source ids.

        Args:
            src: ``[batch, src_len]`` token ids.
            src_lengths: Optional unpadded length (>= 1) of every row. When
                given, the batch is packed so the LSTM stops at each row's true
                end and the final states are not computed over padding. When
                ``None``, the padded batch is fed through unchanged.

        Returns:
            ``(outputs, (hid_init, cell_init))`` where ``outputs`` is
            ``[batch, src_len, hid_dim * 2]`` and both initial states are
            ``[DEC_LAYERS, batch, hid_dim]``.
        """
        embedded = self.dropout(self.embedding(src))
        if src_lengths is None:
            outputs, (hidden, cell) = self.rnn(embedded)
        else:
            # pack_padded_sequence needs the lengths on the CPU.
            lengths = torch.as_tensor(src_lengths, dtype=torch.long).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
            packed_out, (hidden, cell) = self.rnn(packed)
            # total_length keeps the output as wide as the padded input.
            outputs, _ = nn.utils.rnn.pad_packed_sequence(
                packed_out, batch_first=True, total_length=src.size(1)
            )
        # hidden/cell: [n_layers*2, batch, hid]; index -2 is the last layer's
        # forward direction and -1 its backward direction.
        forward_h = hidden[-2, :, :]
        backward_h = hidden[-1, :, :]
        forward_c = cell[-2, :, :]
        backward_c = cell[-1, :, :]
        hid_init = torch.tanh(self.fc_hidden(torch.cat((forward_h, backward_h), dim=1)))
        cell_init = torch.tanh(self.fc_cell(torch.cat((forward_c, backward_c), dim=1)))
        # The decoder expects one state per layer, so tile the single vector
        # across DEC_LAYERS.
        hid_init = hid_init.unsqueeze(0).repeat(DEC_LAYERS, 1, 1)  # [dec_layers, batch, hid]
        cell_init = cell_init.unsqueeze(0).repeat(DEC_LAYERS, 1, 1)
        return outputs, (hid_init, cell_init)

class Decoder(nn.Module):
    """Multi-layer LSTM decoder that is advanced one target token per call.

    Reads the module-level ``PAD_IDX`` as the embedding's padding index.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(
            num_embeddings=output_dim,
            embedding_dim=emb_dim,
            padding_idx=PAD_IDX,
        )
        # Inter-layer dropout only applies when the LSTM is stacked.
        inter_layer_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_step, hidden, cell):
        """Run one decoding step.

        Args:
            input_step: ``[batch]`` token ids for the current time step.
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` is
            ``[batch, output_dim]`` logits and the states are the updated ones.
        """
        step_ids = input_step.unsqueeze(1)                      # [batch, 1]
        step_emb = self.dropout(self.embedding(step_ids))       # [batch, 1, emb]
        rnn_out, next_state = self.rnn(step_emb, (hidden, cell))
        hidden, cell = next_state
        # Drop the length-1 time axis before projecting onto the vocabulary.
        prediction = self.fc_out(rnn_out.squeeze(1))            # [batch, output_dim]
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder together with scheduled teacher forcing.

    Args:
        encoder: Module returning ``(outputs, (hidden, cell))`` for a source batch.
        decoder: Module with an ``output_dim`` attribute, called as
            ``decoder(input_tok, hidden, cell)`` once per step.
        device: Device stored on the instance as ``self.device``.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg=None, teacher_forcing_ratio=0.5, max_len=None):
        """Decode ``max_len`` steps and return the logits of every step.

        Args:
            src: ``[batch, src_len]`` source ids.
            trg: ``[batch, trg_len]`` target ids, or ``None`` for inference.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the gold token instead of the argmax.
            max_len: Number of steps. Defaults to ``trg.size(1)``, or 25 when
                ``trg`` is ``None``.

        Returns:
            Tensor ``[batch, max_len, decoder.output_dim]`` of logits.
        """
        batch_size = src.size(0)
        if max_len is None:
            max_len = trg.size(1) if trg is not None else 25
        # Allocate next to the input so the buffers always share its device.
        outputs = torch.zeros(batch_size, max_len, self.decoder.output_dim, device=src.device)
        _, (hidden, cell) = self.encoder(src)
        # No dedicated start token is available here, so PAD_IDX is used as the
        # first decoder input. If a start-token id exists, use it instead.
        input_tok = torch.full((batch_size,), PAD_IDX, dtype=torch.long, device=src.device)
        trg_len = trg.size(1) if trg is not None else 0
        for t in range(max_len):
            preds, hidden, cell = self.decoder(input_tok, hidden, cell)
            outputs[:, t, :] = preds
            # Gold tokens only exist for t < trg_len. Beyond that (max_len larger
            # than the target) fall back to the model's own prediction instead
            # of indexing trg out of range.
            use_teacher = t < trg_len and torch.rand(1).item() < teacher_forcing_ratio
            input_tok = trg[:, t] if use_teacher else preds.argmax(1)
        return outputs

# Build model: the encoder and decoder are sized from the hyperparameters above
# and the combined model is moved to the selected device.
enc = Encoder(SRC_VOCAB, EMB_SIZE, ENC_HID, ENC_LAYERS, DROPOUT)
dec = Decoder(TRG_VOCAB, EMB_SIZE, DEC_HID, DEC_LAYERS, DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

# Adam over all parameters; target positions equal to PAD_IDX do not
# contribute to the cross-entropy loss.
optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Utility: translate predicted ids to token list
def ids_to_tokens(id_list):
    """Map a sequence of target ids to a flat list of word strings.

    Reading stops at the first ``PAD_IDX``. Ids that are missing from
    ``index2word_trg`` or map to the empty string are skipped. Every other
    entry is split on whitespace and its pieces are appended in order.
    """
    flat = []
    for raw_id in id_list:
        if raw_id == PAD_IDX:
            break
        entry = index2word_trg.get(int(raw_id), "")
        if entry != "":
            # An entry may hold several whitespace-separated words.
            flat.extend(entry.split())
    return flat

# Greedy decode for evaluation
def greedy_decode(model, src, max_len=25):
    """Decode ``max_len`` steps, always feeding back the highest-scoring token.

    Puts ``model`` into eval mode (it is not switched back) and runs without
    gradient tracking. Decoding starts from ``PAD_IDX`` and always runs the
    full ``max_len`` steps.

    Returns:
        Tensor ``[batch, max_len]`` of predicted ids.
    """
    model.eval()
    picked = []
    with torch.no_grad():
        n_rows = src.size(0)
        _, (hidden, cell) = model.encoder(src)
        input_tok = torch.full((n_rows,), PAD_IDX, dtype=torch.long, device=device)
        for _step in range(max_len):
            logits, hidden, cell = model.decoder(input_tok, hidden, cell)
            input_tok = logits.argmax(1)
            picked.append(input_tok.unsqueeze(1))
        decoded = torch.cat(picked, dim=1)  # [batch, max_len]
    return decoded

# BLEU preparation: convert batch of token ids to lists of tokens
def batch_ids_to_texts(batch_ids):
    """Turn a batch of id sequences into one word list per sequence (for BLEU).

    Each row is read up to, but not including, its first ``PAD_IDX``. Ids that
    are missing from ``index2word_trg`` or map to the empty string are dropped.
    """
    def _row_to_words(row):
        kept = []
        for raw_id in row.cpu().numpy():
            if raw_id == PAD_IDX:
                break
            word = index2word_trg.get(int(raw_id), "")
            if word != "":
                kept.append(word)
        return kept

    return [_row_to_words(seq) for seq in batch_ids]

# Training and evaluation loops
def run_epoch(loader, training=True):
    """Run one pass over ``loader`` and return ``(avg_loss, perplexity, bleu)``.

    Uses the module-level ``model``, ``optimizer``, ``criterion``, ``device``,
    ``TEACHER_FORCING_RATIO`` and ``PAD_IDX``.

    Args:
        loader: DataLoader yielding ``(src_batch, trg_batch)`` id tensors.
        training: When True the model is put in train mode and updated;
            otherwise it is put in eval mode and run without gradient tracking.

    Returns:
        tuple: ``avg_loss`` is the cross-entropy averaged over all non-padding
        target tokens of the epoch, ``perplexity`` is ``exp(avg_loss)`` and
        ``bleu`` is the corpus BLEU of the argmax predictions against the
        targets. Loss and perplexity are NaN when the loader has no
        non-padding target token.

    Note:
        Predictions come from the same forward pass as the loss, which applies
        ``TEACHER_FORCING_RATIO`` in evaluation as well, so the BLEU reported
        here is not free-running decoding quality.
    """
    import warnings

    loss_sum = 0.0     # sum of per-token losses over the epoch
    total_tokens = 0   # number of non-padding target tokens seen
    all_refs = []
    all_hyps = []
    if training:
        model.train()
    else:
        model.eval()
    for src_batch, trg_batch in loader:
        src_batch = src_batch.to(device)
        trg_batch = trg_batch.to(device)
        if training:
            optimizer.zero_grad()
        # Build the autograd graph only when it is needed for backward().
        with torch.set_grad_enabled(training):
            outputs = model(src_batch, trg_batch, teacher_forcing_ratio=TEACHER_FORCING_RATIO)
            # outputs: [batch, trg_len, vocab]
            output_dim = outputs.size(-1)
            loss = criterion(outputs.reshape(-1, output_dim), trg_batch.reshape(-1))

        # criterion averages over non-padding targets, so loss * n_tokens
        # recovers the batch's summed token loss.
        n_tokens = int((trg_batch != PAD_IDX).sum().item())
        if n_tokens > 0:
            if training:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            loss_sum += loss.item() * n_tokens
            total_tokens += n_tokens
        # An all-padding batch has an undefined (NaN) mean loss; it is skipped
        # so it can neither poison the weights nor the running average.

        # collect for BLEU
        with torch.no_grad():
            preds_ids = outputs.argmax(-1)  # [batch, trg_len]
            refs = batch_ids_to_texts(trg_batch)
            hyps = batch_ids_to_texts(preds_ids)
            # corpus_bleu expects a list of reference lists per hypothesis
            for r, h in zip(refs, hyps):
                if len(r) == 0:
                    r = [""]  # avoid empty
                all_refs.append([r])
                all_hyps.append(h)

    if total_tokens > 0:
        avg_loss = loss_sum / total_tokens
        ppl = math.exp(avg_loss) if avg_loss < 700 else float("inf")
    else:
        avg_loss = float("nan")
        ppl = float("nan")
    # compute corpus BLEU (best effort: a failure is reported, not raised)
    try:
        bleu = corpus_bleu(all_refs, all_hyps)
    except Exception as exc:
        warnings.warn(f"corpus_bleu failed ({exc!r}); reporting BLEU as 0.0")
        bleu = 0.0
    return avg_loss, ppl, bleu

# Training loop
# One training pass per epoch, followed by evaluation passes over the
# validation and test loaders; all three sets of metrics are printed.
# NOTE(review): the test split is scored after every epoch. If these numbers
# are used to pick an epoch or tune hyperparameters, the test score is no
# longer an unbiased estimate.
for epoch in range(1, EPOCHS + 1):
    train_loss, train_ppl, train_bleu = run_epoch(train_loader, training=True)
    val_loss, val_ppl, val_bleu = run_epoch(val_loader, training=False)
    test_loss, test_ppl, test_bleu = run_epoch(test_loader, training=False)

    print(f"Epoch {epoch}")
    print(f"Train  Loss {train_loss:.4f}  PPL {train_ppl:.4f}  BLEU {train_bleu:.4f}")
    print(f"Val    Loss {val_loss:.4f}  PPL {val_ppl:.4f}  BLEU {val_bleu:.4f}")
    print(f"Test   Loss {test_loss:.4f}  PPL {test_ppl:.4f}  BLEU {test_bleu:.4f}")
