## Question 1 (15 Marks)

Build an RNN-based seq2seq model which contains the following layers: (i) an input layer for character embeddings, (ii) one encoder RNN which sequentially encodes the input character sequence (Latin), and (iii) one decoder RNN which takes the last state of the encoder as input and produces one output character at a time (Devanagari).

The code should be flexible such that the dimension of the input character embeddings, the hidden states of the encoders and decoders, the cell (RNN, LSTM, GRU) and the number of layers in the encoder and decoder can be changed.


In [5]:
import torch
import torch.nn as nn

# Prefer the second GPU (index 1) when the machine has one. Otherwise fall
# back to the first GPU, and finally to the CPU, so the notebook also runs on
# single-GPU and CPU-only hosts instead of failing on an invalid "cuda:1".
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

class Encoder(nn.Module):
    """Character-level encoder: embedding layer followed by a recurrent stack.

    Args:
        input_vocab_size: number of distinct source symbols (embedding rows).
        embed_size: dimension of each character embedding.
        hidden_size: dimension of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        cell_type: one of "RNN", "LSTM" or "GRU"; any other value raises
            KeyError.
    """

    # Supported recurrent cells, keyed by the name accepted in ``cell_type``.
    _CELLS = {"RNN": nn.RNN, "LSTM": nn.LSTM, "GRU": nn.GRU}

    def __init__(self, input_vocab_size, embed_size, hidden_size, num_layers=1, cell_type="LSTM"):
        super().__init__()
        self.embedding = nn.Embedding(input_vocab_size, embed_size)
        recurrent_cls = self._CELLS[cell_type]
        # batch_first=True: inputs are laid out as (batch, seq_len, embed_size).
        self.rnn = recurrent_cls(embed_size, hidden_size, num_layers, batch_first=True)
        self.cell_type = cell_type

    def forward(self, x):
        """Encode a batch of index sequences and return only the final state.

        The per-step outputs of the recurrent layer are discarded. For an
        LSTM the returned state is the ``(h_n, c_n)`` tuple, otherwise it is
        the single ``h_n`` tensor.
        """
        _, final_state = self.rnn(self.embedding(x))
        return final_state


class Decoder(nn.Module):
    """Character-level decoder: embedding, recurrent stack and vocab projection.

    Args:
        output_vocab_size: number of distinct target symbols; also the width
            of the prediction vector produced per step.
        embed_size: dimension of each character embedding.
        hidden_size: dimension of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        cell_type: one of "RNN", "LSTM" or "GRU"; any other value raises
            KeyError.
    """

    # Supported recurrent cells, keyed by the name accepted in ``cell_type``.
    _CELLS = {"RNN": nn.RNN, "LSTM": nn.LSTM, "GRU": nn.GRU}

    def __init__(self, output_vocab_size, embed_size, hidden_size, num_layers=1, cell_type="LSTM"):
        super().__init__()
        self.embedding = nn.Embedding(output_vocab_size, embed_size)
        recurrent_cls = self._CELLS[cell_type]
        self.rnn = recurrent_cls(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_vocab_size)
        self.cell_type = cell_type

    def forward(self, x, hidden):
        """Run the decoder on ``x`` starting from ``hidden``.

        Returns:
            A ``(predictions, hidden)`` pair: the vocabulary scores and the
            updated recurrent state. When ``x`` holds a single time step,
            i.e. shape (batch, 1), the time axis is squeezed away and
            ``predictions`` has shape (batch, vocab_size).
        """
        step_output, next_hidden = self.rnn(self.embedding(x), hidden)
        # squeeze(1) only removes the time axis when it has length 1.
        scores = self.fc(step_output.squeeze(1))
        return scores, next_hidden


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    Args:
        encoder: module whose ``forward(source)`` returns the initial decoder
            state.
        decoder: module whose ``forward(step_input, hidden)`` returns
            ``(scores, hidden)`` and which exposes ``fc.out_features`` as the
            target vocabulary size.
        cell_type: stored on the instance; not otherwise used by this class.
    """

    def __init__(self, encoder, decoder, cell_type="LSTM"):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Decode ``target.size(1) - 1`` steps and return the per-step scores.

        Args:
            source: (batch, src_len) tensor of source indices.
            target: (batch, target_len) tensor of target indices; column 0 is
                fed to the decoder as the first input.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the gold token instead of the model's
                own argmax prediction as the next input.

        Returns:
            (batch, target_len, vocab_size) tensor of scores. Position 0 is
            never written and stays all zeros.
        """
        batch_size, target_len = target.size()
        vocab_size = self.decoder.fc.out_features

        # Allocate on the same device as the inputs rather than on a
        # module-level global, so the model works wherever it is moved.
        outputs = torch.zeros(batch_size, target_len, vocab_size, device=target.device)
        hidden = self.encoder(source)

        decoder_input = target[:, 0].unsqueeze(1)

        for t in range(1, target_len):
            output, hidden = self.decoder(decoder_input, hidden)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            decoder_input = target[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)

        return outputs


Using device: cuda:1


(a) What is the total number of computations done by your network? (assume that the input embedding size is m, encoder and decoder have 1 layer each, the hidden cell state is k for both the encoder and decoder, the length of the input and output sequence is the same, i.e., T, the size of the vocabulary is the same for the source and target language, i.e., V)

(b) What is the total number of parameters in your network? (assume that the input embedding size is m, encoder and decoder have 1 layer each, the hidden cell state is k for both the encoder and decoder and the length of the input and output sequence is the same, i.e., T, the size of the vocabulary is the same for the source and target language, i.e., V)



## Question 2 (10 Marks)
You will now train your model using any one language from the Dakshina dataset (I would suggest picking a language that you can read so that it is easy to analyse the errors). Use the standard train, dev, test set from the folder dakshina_dataset_v1.0/hi/lexicons/ (replace hi with the language of your choice).

Using the sweep feature in wandb find the best hyperparameter configuration. Here are some suggestions but you are free to decide which hyperparameters you want to explore


In [6]:
def build_vocab(filepaths):
    """Collect every character that occurs in the first two TSV columns.

    Each non-blank line is expected to be tab-separated with the native-script
    word first and the romanized word second; any further columns are ignored.

    Args:
        filepaths: iterable of paths to UTF-8 encoded TSV files.

    Returns:
        A set containing the characters of all native and romanized words.

    Raises:
        ValueError: if a non-blank line has fewer than two tab-separated
            fields.
    """
    chars = set()
    for filepath in filepaths:
        with open(filepath, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Blank lines (e.g. a trailing newline at EOF) carry no data.
                if not line:
                    continue
                native, roman, *_ = line.split("\t")
                chars.update(native)
                chars.update(roman)
    return chars

def make_char2idx(char_set):
    """Build a character-to-index table with four reserved special tokens.

    Indices 0-3 are assigned to "<pad>", "<sos>", "<eos>" and "<unk>" in that
    order; the characters of ``char_set`` follow in sorted order.

    Returns:
        A ``(char2idx, idx2char)`` pair: a dict from symbol to index and the
        list of symbols ordered by index.
    """
    specials = ["<pad>", "<sos>", "<eos>", "<unk>"]
    vocabulary = specials + sorted(char_set)
    lookup = dict(zip(vocabulary, range(len(vocabulary))))
    return lookup, vocabulary

# Absolute locations of the Hindi ("hi") lexicon train and dev TSV files.
# NOTE(review): these paths are machine-specific and must be adjusted when the
# notebook is run elsewhere.
train_path = "/mnt/e_disk/ch24s016/da6401_assignment3/dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
dev_path = "/mnt/e_disk/ch24s016/da6401_assignment3/dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"

# One combined character set over both files (train AND dev), then split into
# two vocabularies: ASCII characters form the source (romanized) vocabulary,
# all non-ASCII characters form the target vocabulary.
char_set = build_vocab([train_path, dev_path])
roman2idx, idx2roman = make_char2idx(set(c for c in char_set if c.isascii()))
devanagari2idx, idx2devanagari = make_char2idx(set(c for c in char_set if not c.isascii()))

In [7]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """Pad a list of (source, target) tensor pairs into two batch tensors.

    Source sequences are right-padded with the "<pad>" index of ``roman2idx``
    and target sequences with the "<pad>" index of ``devanagari2idx``; both
    results are batch-first.
    """
    sources, targets = zip(*batch)
    padded_sources = pad_sequence(
        sources, batch_first=True, padding_value=roman2idx["<pad>"]
    )
    padded_targets = pad_sequence(
        targets, batch_first=True, padding_value=devanagari2idx["<pad>"]
    )
    return padded_sources, padded_targets


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

class TransliterationDataset(Dataset):
    """Dataset of (romanized, native) word pairs read from a TSV lexicon.

    Each non-blank line of the file is tab-separated with the native-script
    word first and the romanized word second; further columns are ignored.

    Args:
        tsv_path: path to a UTF-8 encoded TSV file.
        src_char2idx: char-to-index table for the romanized (source) side;
            must contain "<unk>".
        tgt_char2idx: char-to-index table for the native (target) side; must
            contain "<sos>", "<eos>" and "<unk>".
        max_len: truncation limit. Sources keep at most ``max_len``
            characters; targets keep at most ``max_len - 1`` characters
            before "<sos>"/"<eos>" are added.

    Raises:
        ValueError: if a non-blank line has fewer than two tab-separated
            fields.
    """

    def __init__(self, tsv_path, src_char2idx, tgt_char2idx, max_len=32):
        self.pairs = []
        with open(tsv_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Blank lines (e.g. a trailing newline at EOF) carry no data.
                if not line:
                    continue
                native, roman, *_ = line.split("\t")
                self.pairs.append((roman, native))

        self.src_c2i = src_char2idx
        self.tgt_c2i = tgt_char2idx
        self.max_len = max_len

    def __len__(self):
        """Return the number of word pairs."""
        return len(self.pairs)

    def __getitem__(self, i):
        """Return pair ``i`` as ``(source_indices, target_indices)`` tensors.

        Characters missing from a table map to its "<unk>" index. The target
        is wrapped as ``<sos> ... <eos>``; the source gets no special tokens.
        """
        roman, native = self.pairs[i]

        src_unk = self.src_c2i["<unk>"]
        tgt_unk = self.tgt_c2i["<unk>"]

        src_idxs = [self.src_c2i.get(c, src_unk) for c in roman[: self.max_len]]
        tgt_idxs = (
            [self.tgt_c2i["<sos>"]]
            + [self.tgt_c2i.get(c, tgt_unk) for c in native[: self.max_len - 1]]
            + [self.tgt_c2i["<eos>"]]
        )

        # Explicit dtype: torch.tensor([]) would otherwise default to float32
        # for an empty word, which cannot be fed to an embedding layer.
        return (
            torch.tensor(src_idxs, dtype=torch.long),
            torch.tensor(tgt_idxs, dtype=torch.long),
        )


def collate_fn(batch):
    """Right-pad (source, target) tensor pairs to the longest item per side.

    Sources are padded with the "<pad>" index of ``roman2idx`` and targets
    with the "<pad>" index of ``devanagari2idx``. Returns the two stacked
    batch-first tensors ``(src_padded, tgt_padded)``.
    """
    src_seqs, tgt_seqs = zip(*batch)

    def _right_pad(seqs, pad_id):
        # Extend every sequence to the longest length in this batch.
        width = max(seq.size(0) for seq in seqs)
        rows = []
        for seq in seqs:
            filler = torch.full((width - len(seq),), pad_id, dtype=torch.long)
            rows.append(torch.cat([seq, filler]))
        return torch.stack(rows)

    src_padded = _right_pad(src_seqs, roman2idx["<pad>"])
    tgt_padded = _right_pad(tgt_seqs, devanagari2idx["<pad>"])
    return src_padded, tgt_padded

# Parse each lexicon file exactly once. Previously the training file was read
# twice (once via a duplicated literal path) and a first `train_loader` with
# batch_size=64 was created only to be overwritten before any use.
train_dataset = TransliterationDataset(train_path, roman2idx, devanagari2idx, max_len=32)
dev_dataset = TransliterationDataset(dev_path, roman2idx, devanagari2idx, max_len=32)

# Kept as an alias so any code that still refers to `train_ds` keeps working.
train_ds = train_dataset

# Only the training data is shuffled; dev order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim

# Hyperparameters

embed_size = 64      # character embedding dimension (encoder and decoder)
hidden_size = 128    # recurrent hidden-state dimension (encoder and decoder)
num_layers = 1       # stacked recurrent layers (encoder and decoder)
cell_type = "LSTM"   # one of "RNN", "LSTM", "GRU"
# NOTE(review): the visible DataLoader construction passes a literal batch
# size rather than this variable — confirm whether it is used elsewhere.
batch_size = 32
epochs = 10
lr = 0.001

# Initialize model
# The encoder and decoder share embed_size/hidden_size/num_layers/cell_type so
# the encoder's final state can be passed directly to the decoder.
encoder = Encoder(len(roman2idx), embed_size, hidden_size, num_layers, cell_type).to(device)
decoder = Decoder(len(devanagari2idx), embed_size, hidden_size, num_layers, cell_type).to(device)
model = Seq2Seq(encoder, decoder, cell_type).to(device)

# Optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=lr)
# Padding positions in the target are excluded from the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=devanagari2idx["<pad>"])

def evaluate_accuracy(model, dataloader):
    """Return character-level accuracy of ``model`` over ``dataloader``.

    The model is put in eval mode and run without teacher forcing
    (``teacher_forcing_ratio=0.0``). For every sequence, positions 1 onward
    are compared against the target up to, but not including, the first
    "<pad>" token; position 0 is skipped.

    Args:
        model: callable as ``model(src, tgt, teacher_forcing_ratio=...)``
            returning (batch, target_len, vocab_size) scores.
        dataloader: iterable yielding ``(src, tgt)`` index tensors.

    Returns:
        Fraction of counted positions predicted correctly, or 0.0 when no
        position was counted.
    """
    pad_idx = devanagari2idx["<pad>"]
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt, teacher_forcing_ratio=0.0)
            pred = output.argmax(dim=2)[:, 1:]
            gold = tgt[:, 1:]
            # True until the first pad in each row. This matches the original
            # per-row "break on pad" loop, but is computed in bulk instead of
            # pulling every element to the host with .item().
            valid = (gold == pad_idx).cumsum(dim=1) == 0
            correct += ((pred == gold) & valid).sum().item()
            total += valid.sum().item()
    return correct / total if total > 0 else 0.0


# Training loop: one pass over train_loader per epoch, followed by an accuracy
# evaluation on both the training and dev loaders.
for epoch in range(epochs):
    # evaluate_accuracy() leaves the model in eval mode, so training mode is
    # re-enabled at the start of every epoch.
    model.train()
    # Running SUM of the per-batch losses (not an average over batches).
    total_loss = 0
    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()
        # Uses the model's default teacher_forcing_ratio.
        output = model(src, tgt)
        # Drop position 0 (never filled by Seq2Seq.forward; the target holds
        # <sos> there) and flatten to (batch * steps, vocab) / (batch * steps,)
        # as expected by the loss function.
        output = output[:, 1:].reshape(-1, output.shape[-1])
        tgt_flat = tgt[:, 1:].reshape(-1)
        loss = loss_function(output, tgt_flat)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Accuracy is measured without teacher forcing; this runs a full extra
    # pass over the training set every epoch.
    train_acc = evaluate_accuracy(model, train_loader)
    dev_acc = evaluate_accuracy(model, dev_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Train Acc: {train_acc:.4f}, Dev Acc: {dev_acc:.4f}")

Epoch 1/10, Loss: 3127.1562, Train Acc: 0.4816, Dev Acc: 0.4805
Epoch 2/10, Loss: 1711.6280, Train Acc: 0.5946, Dev Acc: 0.5804
Epoch 3/10, Loss: 1352.0774, Train Acc: 0.6470, Dev Acc: 0.6213
Epoch 4/10, Loss: 1176.7243, Train Acc: 0.6764, Dev Acc: 0.6408
Epoch 5/10, Loss: 1063.9765, Train Acc: 0.7036, Dev Acc: 0.6505
Epoch 6/10, Loss: 979.7534, Train Acc: 0.7207, Dev Acc: 0.6639
Epoch 7/10, Loss: 919.6203, Train Acc: 0.7417, Dev Acc: 0.6652
Epoch 8/10, Loss: 864.9013, Train Acc: 0.7526, Dev Acc: 0.6724
Epoch 9/10, Loss: 820.9855, Train Acc: 0.7637, Dev Acc: 0.6734
Epoch 10/10, Loss: 781.2259, Train Acc: 0.7779, Dev Acc: 0.6752
