## Question 1 (15 Marks)

Build a RNN based seq2seq model which contains the following layers: (i) input layer for character embeddings (ii) one encoder RNN which sequentially encodes the input character sequence (Latin) (iii) one decoder RNN which takes the last state of the encoder as input and produces one output character at a time (Devanagari).

The code should be flexible such that the dimension of the input character embeddings, the hidden states of the encoders and decoders, the cell (RNN, LSTM, GRU) and the number of layers in the encoder and decoder can be changed.


In [1]:
import torch
import torch.nn as nn

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class Encoder(nn.Module):
    def __init__(self, input_vocab_size, embed_size, hidden_size, num_layers=1, cell_type="LSTM"):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_vocab_size, embed_size)
        rnn_class = {"RNN": nn.RNN, "LSTM": nn.LSTM, "GRU": nn.GRU}[cell_type]
        self.rnn = rnn_class(embed_size, hidden_size, num_layers, batch_first=True)
        self.cell_type = cell_type

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, hidden = self.rnn(embedded)
        return hidden


class Decoder(nn.Module):
    def __init__(self, output_vocab_size, embed_size, hidden_size, num_layers=1, cell_type="LSTM"):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_vocab_size, embed_size)
        rnn_class = {"RNN": nn.RNN, "LSTM": nn.LSTM, "GRU": nn.GRU}[cell_type]
        self.rnn = rnn_class(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_vocab_size)
        self.cell_type = cell_type

    def forward(self, x, hidden):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded, hidden)
        predictions = self.fc(output.squeeze(1))  # (batch_size, vocab_size)
        return predictions, hidden


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, cell_type="LSTM"):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size, target_len = target.size()
        vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, target_len, vocab_size).to(device)
        hidden = self.encoder(source)

        input = target[:, 0].unsqueeze(1)

        for t in range(1, target_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = target[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)

        return outputs


  cpu = _conversion_method_template(device=torch.device("cpu"))


Using device: cuda:1


(a) What is the total number of computations done by your network? (assume that the input embedding size is m, encoder and decoder have 1 layer each, the hidden cell state is kkk for both the encoder and decoder, the length of the input and output sequence is the same, i.e., T, the size of the vocabulary is the same for the source and target language, i.e., V)

(b) What is the total number of parameters in your network? (assume that the input embedding size is M, encoder and decoder have 1 layer each, the hidden cell state is k for both the encoder and decoder and the length of the input and output sequence is the same, i.e., T, the size of the vocabulary is the same for the source and target language, i.e., V)



## Question 2 (10 Marks)
You will now train your model using any one language from the Dakshina dataset (I would suggest pick a language that you can read so that it is easy to analyse the errors). Use the standard train, dev, test set from the folder dakshina_dataset_v1.0/hi/lexicons/ (replace hi by the language of your choice)

Using the sweep feature in wandb find the best hyperparameter configuration. Here are some suggestions but you are free to decide which hyperparameters you want to explore


In [2]:
def build_vocab(filepaths):
    chars = set()
    for filepath in filepaths:
        with open(filepath, encoding="utf-8") as f:
            for line in f:
                native, roman, _ = line.strip().split("\t")
                chars.update(native)
                chars.update(roman)
    return chars

def make_char2idx(char_set):
    char_list = ["<pad>", "<sos>", "<eos>", "<unk>"] + sorted(list(char_set))
    return {char: idx for idx, char in enumerate(char_list)}, char_list

train_path = "/mnt/e_disk/ch24s016/da6401_assignment3/dataset/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv"
dev_path = "/mnt/e_disk/ch24s016/da6401_assignment3/dataset/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.dev.tsv"

char_set = build_vocab([train_path, dev_path])
roman2idx, idx2roman = make_char2idx(set(c for c in char_set if c.isascii()))
devanagari2idx, idx2devanagari = make_char2idx(set(c for c in char_set if not c.isascii()))

In [4]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=roman2idx["<pad>"], batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=devanagari2idx["<pad>"], batch_first=True)
    return src_batch, tgt_batch


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

class TransliterationDataset(Dataset):
    def __init__(self, tsv_path, src_char2idx, tgt_char2idx, max_len=32):
        self.pairs = []
        with open(tsv_path, encoding="utf-8") as f:
            for line in f:
                native, roman, _ = line.strip().split('\t')
                self.pairs.append((roman, native))

        self.src_c2i = src_char2idx
        self.tgt_c2i = tgt_char2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, i):
        roman, native = self.pairs[i]

        # map chars → indices, add <sos> / <eos> tokens as needed
        src_idxs = [self.src_c2i.get(c, self.src_c2i["<unk>"]) 
                    for c in roman][: self.max_len]
        tgt_idxs = [self.tgt_c2i["<sos>"]] + \
                   [self.tgt_c2i.get(c, self.tgt_c2i["<unk>"]) 
                    for c in native][: (self.max_len-1)] + \
                   [self.tgt_c2i["<eos>"]]

        return torch.tensor(src_idxs), torch.tensor(tgt_idxs)


def collate_fn(batch):
    src_seqs, tgt_seqs = zip(*batch)

    src_max_len = max(seq.size(0) for seq in src_seqs)
    tgt_max_len = max(seq.size(0) for seq in tgt_seqs)


    src_padded = torch.stack([
        torch.cat([seq, torch.full((src_max_len - len(seq),), roman2idx["<pad>"], dtype=torch.long)])
        for seq in src_seqs
    ])

    tgt_padded = torch.stack([
        torch.cat([seq, torch.full((tgt_max_len - len(seq),), devanagari2idx["<pad>"], dtype=torch.long)])
        for seq in tgt_seqs
    ])

    return src_padded, tgt_padded

train_ds = TransliterationDataset(
    "/mnt/e_disk/ch24s016/da6401_assignment3/dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv",
    src_char2idx=roman2idx,
    tgt_char2idx=devanagari2idx,
    max_len=32
)
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn)


train_dataset = TransliterationDataset(train_path, roman2idx, devanagari2idx, max_len=32)
dev_dataset = TransliterationDataset(dev_path, roman2idx, devanagari2idx, max_len=32)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)


In [12]:
import torch
import torch.nn as nn
import torch.optim as optim

# Hyperparameters

embed_size = 64
hidden_size = 128
num_layers = 1
cell_type = "LSTM"
batch_size = 32
epochs = 10
lr = 0.001

# Initialize model
encoder = Encoder(len(roman2idx), embed_size, hidden_size, num_layers, cell_type).to(device)
decoder = Decoder(len(devanagari2idx), embed_size, hidden_size, num_layers, cell_type).to(device)
model = Seq2Seq(encoder, decoder, cell_type).to(device)

# Optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss(ignore_index=devanagari2idx["<pad>"])

def evaluate_accuracy(model, dataloader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt, teacher_forcing_ratio=0.0)
            pred = output.argmax(dim=2)
            for i in range(tgt.size(0)):
                for j in range(1, tgt.size(1)):
                    if tgt[i, j].item() == devanagari2idx["<pad>"]:
                        break
                    if pred[i, j].item() == tgt[i, j].item():
                        correct += 1
                    total += 1
    return correct / total if total > 0 else 0.0


for epoch in range(epochs):
    model.train()
    total_loss = 0
    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()
        output = model(src, tgt)
        output = output[:, 1:].reshape(-1, output.shape[-1])
        tgt_flat = tgt[:, 1:].reshape(-1)
        loss = loss_function(output, tgt_flat)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    train_acc = evaluate_accuracy(model, train_loader)
    dev_acc = evaluate_accuracy(model, dev_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {dev_acc:.4f}")

Epoch 1/10, Loss: 3734.5742, Train Acc: 0.6121, Val Acc: 0.5876
Epoch 2/10, Loss: 1666.9953, Train Acc: 0.7547, Val Acc: 0.7093
Epoch 3/10, Loss: 1224.1711, Train Acc: 0.8027, Val Acc: 0.7403
Epoch 4/10, Loss: 1031.5823, Train Acc: 0.8336, Val Acc: 0.7648
Epoch 5/10, Loss: 900.4773, Train Acc: 0.8582, Val Acc: 0.7813
Epoch 6/10, Loss: 821.7708, Train Acc: 0.8678, Val Acc: 0.7822
Epoch 7/10, Loss: 747.3668, Train Acc: 0.8779, Val Acc: 0.7852
Epoch 8/10, Loss: 694.8574, Train Acc: 0.8881, Val Acc: 0.7858
Epoch 9/10, Loss: 650.6924, Train Acc: 0.8960, Val Acc: 0.7929
Epoch 10/10, Loss: 619.3768, Train Acc: 0.9005, Val Acc: 0.7892


## Wandb Sweep Run to Find Best Hyperparameter

In [6]:
import wandb
import os

sweep_config = {
    'method': 'bayes',
    'name': 'Seq2Seq Transliteration Sweep',
    'metric': {'name': "val_accuracy", 'goal': 'maximize'},
    'parameters': {
        'embed_size': {'values': [32, 64, 128]},
        'hidden_size': {'values': [64, 128, 256]},
        'num_layers': {'values': [1]},
        'cell_type': {'values': ['RNN', 'GRU', 'LSTM']},
        'optimizer': {'values': ['adam', 'adamw', 'sgd']},
        'lr': {'values': [0.01, 0.001, 0.0005]},
        'batch_size': {'values': [16, 32, 64]},
        'epochs': {'values': [5, 10]}
    },
}

def evaluate_accuracy(model, dataloader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt, teacher_forcing_ratio=0.0)
            pred = output.argmax(dim=2)
            for i in range(tgt.size(0)):
                for j in range(1, tgt.size(1)):
                    if tgt[i, j].item() == devanagari2idx["<pad>"]:
                        break
                    if pred[i, j].item() == tgt[i, j].item():
                        correct += 1
                    total += 1
    return correct / total if total > 0 else 0.0

def train_sweep():
    wandb.init()
    config = wandb.config

    # Update hyperparameters from sweep config
    embed_size = config.embed_size
    hidden_size = config.hidden_size
    num_layers = config.num_layers
    cell_type = config.cell_type
    batch_size = config.batch_size
    epochs = config.epochs
    lr = config.lr

    # Update data loader if batch_size changes
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    # Model setup
    encoder = Encoder(len(roman2idx), embed_size, hidden_size, num_layers, cell_type).to(device)
    decoder = Decoder(len(devanagari2idx), embed_size, hidden_size, num_layers, cell_type).to(device)
    model = Seq2Seq(encoder, decoder, cell_type).to(device)

    # Optimizer
    if config.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    elif config.optimizer == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    # 
    # elif config.optimizer == 'sgd':
    #     optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    # elif config.optimizer == 'rmsprop':
    #     optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
    else:
        raise ValueError("Unsupported optimizer")

    # Loss function
    loss_function = nn.CrossEntropyLoss(ignore_index=devanagari2idx["<pad>"])

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for src, tgt in train_loader:
            src, tgt = src.to(device), tgt.to(device)
            optimizer.zero_grad()
            output = model(src, tgt)
            output = output[:, 1:].reshape(-1, output.shape[-1])
            tgt_flat = tgt[:, 1:].reshape(-1)
            loss = loss_function(output, tgt_flat)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        train_acc = evaluate_accuracy(model, train_loader)
        val_acc = evaluate_accuracy(model, dev_loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")
        wandb.log({
            "epoch": epoch + 1,
            "loss": total_loss,
            "train_accuracy": train_acc,
            "val_accuracy": val_acc
        })
        
    model_dir = "./trained_models"
    os.makedirs(model_dir, exist_ok=True)

    # Unique file name using wandb run name or ID
    run_id = wandb.run.name  # or wandb.run.id
    model_path = os.path.join(model_dir, f"model_{run_id}.pt")
    torch.save(model.state_dict(), model_path)

    print(f"Model saved to {model_path}")

    wandb.finish()
    
sweep_id = wandb.sweep(sweep_config, project="Seq2SeqAssignment3")
wandb.agent(sweep_id, function=train_sweep, count=10)

Create sweep with ID: gy0rcmqo
Sweep URL: https://wandb.ai/ch24s016-iitm/Seq2SeqAssignment3/sweeps/gy0rcmqo


[34m[1mwandb[0m: Agent Starting Run: tidwg9cp with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: Currently logged in as: [33mch24s016[0m ([33mch24s016-iitm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/10, Loss: 5304.8279, Train Acc: 0.1832, Val Acc: 0.1831
Epoch 2/10, Loss: 5081.6520, Train Acc: 0.1621, Val Acc: 0.1320
Epoch 3/10, Loss: 4954.2290, Train Acc: 0.1751, Val Acc: 0.1641
Epoch 4/10, Loss: 4881.5742, Train Acc: 0.1910, Val Acc: 0.1687
Epoch 5/10, Loss: 4850.3568, Train Acc: 0.1862, Val Acc: 0.1599
Epoch 6/10, Loss: 4842.0607, Train Acc: 0.1910, Val Acc: 0.1791
Epoch 7/10, Loss: 4775.6646, Train Acc: 0.1962, Val Acc: 0.1757
Epoch 8/10, Loss: 4798.3265, Train Acc: 0.1768, Val Acc: 0.1628
Epoch 9/10, Loss: 4798.1818, Train Acc: 0.1846, Val Acc: 0.1805
Epoch 10/10, Loss: 4817.1717, Train Acc: 0.1957, Val Acc: 0.1781
Model saved to ./trained_models/model_whole-sweep-1.pt


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▅▃▂▂▂▁▁▁▂
train_accuracy,▅▁▄▇▆▇█▄▆█
val_accuracy,█▁▅▆▅▇▇▅█▇

0,1
epoch,10.0
loss,4817.17166
train_accuracy,0.19567
val_accuracy,0.17806


[34m[1mwandb[0m: Agent Starting Run: wod6ukdz with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	embed_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: adamw


Epoch 1/5, Loss: 2627.3083, Train Acc: 0.6320, Val Acc: 0.6144
Epoch 2/5, Loss: 1800.1574, Train Acc: 0.6844, Val Acc: 0.6539
Epoch 3/5, Loss: 1617.1992, Train Acc: 0.7126, Val Acc: 0.6816
Epoch 4/5, Loss: 1507.5024, Train Acc: 0.7283, Val Acc: 0.6831
Epoch 5/5, Loss: 1446.1775, Train Acc: 0.7350, Val Acc: 0.6927
Model saved to ./trained_models/model_electric-sweep-2.pt


0,1
epoch,▁▃▅▆█
loss,█▃▂▁▁
train_accuracy,▁▅▆██
val_accuracy,▁▅▇▇█

0,1
epoch,5.0
loss,1446.17755
train_accuracy,0.73504
val_accuracy,0.69271


[34m[1mwandb[0m: Agent Starting Run: skauucbb with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: sgd


[34m[1mwandb[0m: [32m[41mERROR[0m Run skauucbb errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/mnt/e_disk/ch24s016/da6401_assignment3/.venv/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_3752887/2263630927.py", line 70, in train_sweep
[34m[1mwandb[0m: [32m[41mERROR[0m     raise ValueError("Unsupported optimizer")
[34m[1mwandb[0m: [32m[41mERROR[0m ValueError: Unsupported optimizer
[34m[1mwandb[0m: [32m[41mERROR[0m 
[34m[1mwandb[0m: Agent Starting Run: 17b57hme with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb

[34m[1mwandb[0m: [32m[41mERROR[0m Run 17b57hme errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/mnt/e_disk/ch24s016/da6401_assignment3/.venv/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_3752887/2263630927.py", line 70, in train_sweep
[34m[1mwandb[0m: [32m[41mERROR[0m     raise ValueError("Unsupported optimizer")
[34m[1mwandb[0m: [32m[41mERROR[0m ValueError: Unsupported optimizer
[34m[1mwandb[0m: [32m[41mERROR[0m 
[34m[1mwandb[0m: Agent Starting Run: qdmwzeam with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	embed_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwan

[34m[1mwandb[0m: [32m[41mERROR[0m Run qdmwzeam errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/mnt/e_disk/ch24s016/da6401_assignment3/.venv/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_3752887/2263630927.py", line 70, in train_sweep
[34m[1mwandb[0m: [32m[41mERROR[0m     raise ValueError("Unsupported optimizer")
[34m[1mwandb[0m: [32m[41mERROR[0m ValueError: Unsupported optimizer
[34m[1mwandb[0m: [32m[41mERROR[0m 
[34m[1mwandb[0m: Agent Starting Run: znri238s with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb

Epoch 1/5, Loss: 4549.6156, Train Acc: 0.3918, Val Acc: 0.3865
Epoch 2/5, Loss: 2932.7173, Train Acc: 0.5204, Val Acc: 0.4976
Epoch 3/5, Loss: 2372.2580, Train Acc: 0.5759, Val Acc: 0.5430
Epoch 4/5, Loss: 2098.8817, Train Acc: 0.6185, Val Acc: 0.5858
Epoch 5/5, Loss: 1942.2648, Train Acc: 0.6489, Val Acc: 0.6099
Model saved to ./trained_models/model_trim-sweep-6.pt


0,1
epoch,▁▃▅▆█
loss,█▄▂▁▁
train_accuracy,▁▅▆▇█
val_accuracy,▁▄▆▇█

0,1
epoch,5.0
loss,1942.26477
train_accuracy,0.6489
val_accuracy,0.60989


[34m[1mwandb[0m: Agent Starting Run: frkzwuqy with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	embed_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: adamw


Epoch 1/10, Loss: 4840.6998, Train Acc: 0.3477, Val Acc: 0.3419
Epoch 2/10, Loss: 2782.0607, Train Acc: 0.5811, Val Acc: 0.5608
Epoch 3/10, Loss: 1866.0788, Train Acc: 0.7014, Val Acc: 0.6629
Epoch 4/10, Loss: 1473.4992, Train Acc: 0.7572, Val Acc: 0.7071
Epoch 5/10, Loss: 1254.7539, Train Acc: 0.7937, Val Acc: 0.7345
Epoch 6/10, Loss: 1112.7068, Train Acc: 0.8143, Val Acc: 0.7457
Epoch 7/10, Loss: 1008.2076, Train Acc: 0.8362, Val Acc: 0.7728
Epoch 8/10, Loss: 927.8417, Train Acc: 0.8413, Val Acc: 0.7703
Epoch 9/10, Loss: 863.1061, Train Acc: 0.8548, Val Acc: 0.7767
Epoch 10/10, Loss: 808.9390, Train Acc: 0.8648, Val Acc: 0.7816
Model saved to ./trained_models/model_honest-sweep-7.pt


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▄▃▂▂▂▁▁▁▁
train_accuracy,▁▄▆▇▇▇████
val_accuracy,▁▄▆▇▇▇████

0,1
epoch,10.0
loss,808.93899
train_accuracy,0.86479
val_accuracy,0.78159


[34m[1mwandb[0m: Agent Starting Run: w59gx9zd with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: adamw


Epoch 1/10, Loss: 5410.3484, Train Acc: 0.1564, Val Acc: 0.1530
Epoch 2/10, Loss: 5184.6143, Train Acc: 0.1466, Val Acc: 0.1511
Epoch 3/10, Loss: 5131.9613, Train Acc: 0.1744, Val Acc: 0.1712
Epoch 4/10, Loss: 5031.5812, Train Acc: 0.1813, Val Acc: 0.1736
Epoch 5/10, Loss: 4985.3394, Train Acc: 0.1797, Val Acc: 0.1696
Epoch 6/10, Loss: 4967.6654, Train Acc: 0.1864, Val Acc: 0.1794
Epoch 7/10, Loss: 4948.1991, Train Acc: 0.1911, Val Acc: 0.1805
Epoch 8/10, Loss: 4907.8394, Train Acc: 0.1985, Val Acc: 0.1925
Epoch 9/10, Loss: 4878.5995, Train Acc: 0.1798, Val Acc: 0.1734
Epoch 10/10, Loss: 4887.3298, Train Acc: 0.1682, Val Acc: 0.1627
Model saved to ./trained_models/model_morning-sweep-8.pt


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▅▄▃▂▂▂▁▁▁
train_accuracy,▂▁▅▆▅▆▇█▅▄
val_accuracy,▁▁▄▅▄▆▆█▅▃

0,1
epoch,10.0
loss,4887.32977
train_accuracy,0.16822
val_accuracy,0.16272


[34m[1mwandb[0m: Agent Starting Run: 2ywg20ia with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: adam


Epoch 1/5, Loss: 4292.6543, Train Acc: 0.4355, Val Acc: 0.4256
Epoch 2/5, Loss: 2276.1394, Train Acc: 0.6457, Val Acc: 0.6132
Epoch 3/5, Loss: 1620.9036, Train Acc: 0.7297, Val Acc: 0.6838
Epoch 4/5, Loss: 1323.9823, Train Acc: 0.7811, Val Acc: 0.7250
Epoch 5/5, Loss: 1148.9194, Train Acc: 0.8047, Val Acc: 0.7406
Model saved to ./trained_models/model_revived-sweep-9.pt


0,1
epoch,▁▃▅▆█
loss,█▄▂▁▁
train_accuracy,▁▅▇██
val_accuracy,▁▅▇██

0,1
epoch,5.0
loss,1148.9194
train_accuracy,0.80475
val_accuracy,0.74059


[34m[1mwandb[0m: Agent Starting Run: 3bs7bqxn with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	embed_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: sgd


[34m[1mwandb[0m: [32m[41mERROR[0m Run 3bs7bqxn errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/mnt/e_disk/ch24s016/da6401_assignment3/.venv/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_3752887/2263630927.py", line 70, in train_sweep
[34m[1mwandb[0m: [32m[41mERROR[0m     raise ValueError("Unsupported optimizer")
[34m[1mwandb[0m: [32m[41mERROR[0m ValueError: Unsupported optimizer
[34m[1mwandb[0m: [32m[41mERROR[0m 


Question 4 (10 Marks)
You will now apply your best model on the test data (You shouldn't have used test data so far. All the above experiments should have been done using train and val data only).

(a) Use the best model from your sweep and report the accuracy on the test set (the output is correct only if it exactly matches the reference output).

(b) Provide sample inputs from the test data and predictions made by your best model (more marks for presenting this grid creatively). Also upload all the predictions on the test set in a folder predictions_vanilla on your github project.

(c) Comment on the errors made by your model (simple insightful bullet points)

The model makes more errors on consonants than vowels
The model makes more errors on longer sequences
I am thinking confusion matrix but may be it's just me!

### the best model parameters that we got so far is 

In [20]:
wandb.init(project="Seq2SeqAssignment3",entity="ch24s016-iitm",name='bestsweepsofar')
api = wandb.Api()

# Fetch all runs in the sweep
sweep_runs = api.sweep(f"ch24s016-iitm/Seq2SeqAssignment3/gy0rcmqo/").runs


# Find the best model based on validation accuracy
best_run = max(sweep_runs, key=lambda run: run.summary.get("val_accuracy", 0))
print(best_run)
print(best_run.config)

<Run ch24s016-iitm/Seq2SeqAssignment3/frkzwuqy (finished)>
{'lr': 0.0005, 'epochs': 10, 'cell_type': 'LSTM', 'optimizer': 'adamw', 'batch_size': 32, 'embed_size': 32, 'num_layers': 1, 'hidden_size': 128}


we are going to use this model and generate the test outputs 

In [28]:
train_path = "/mnt/e_disk/ch24s016/da6401_assignment3/dataset/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv"
dev_path = "/mnt/e_disk/ch24s016/da6401_assignment3/dataset/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.dev.tsv"

char_set = build_vocab([train_path, dev_path])
roman2idx, idx2roman = make_char2idx(set(c for c in char_set if c.isascii()))
devanagari2idx, idx2devanagari = make_char2idx(set(c for c in char_set if not c.isascii()))


test_path = "/mnt/e_disk/ch24s016/da6401_assignment3/dataset/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.test.tsv"

test_dataset = TransliterationDataset(test_path, roman2idx, devanagari2idx, max_len=32)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_batch)


checkpoint = torch.load("/mnt/e_disk/ch24s016/da6401_assignment3/trained_models/model_honest-sweep-7.pt", map_location=device)

# print(checkpoint)?
embed_size = 32
hidden_size = 128
num_layers = 1
cell_type = 'LSTM'

src_vocab_size = len(roman2idx)
tgt_vocab_size = len(devanagari2idx)

encoder = Encoder(src_vocab_size, embed_size, hidden_size, num_layers, cell_type).to(device)
decoder = Decoder(tgt_vocab_size, embed_size, hidden_size, num_layers, cell_type).to(device)
model = Seq2Seq(encoder, decoder, cell_type).to(device)

# Load state dict directly from checkpoint dict
model.load_state_dict(checkpoint)
model.eval()

def predict(model, input_str, roman2idx, idx2devanagari, max_len=32):
    model.eval()

    # Convert input string to index tensor
    input_idxs = [roman2idx.get(c, roman2idx["<unk>"]) for c in input_str]
    input_tensor = torch.tensor(input_idxs, dtype=torch.long).unsqueeze(0).to(device)  # (1, seq_len)

    with torch.no_grad():
        hidden = model.encoder(input_tensor)

        # Start with <sos>
        input_dec = torch.tensor([[devanagari2idx["<sos>"]]], dtype=torch.long).to(device)

        output_tokens = []
        for _ in range(max_len):
            output, hidden = model.decoder(input_dec, hidden)
            top1 = output.argmax(1).item()

            if top1 == devanagari2idx["<eos>"]:
                break

            output_tokens.append(top1)
            input_dec = torch.tensor([[top1]], dtype=torch.long).to(device)

    # Convert indices back to characters
    return ''.join([idx2devanagari[i] for i in output_tokens])

# --- Transliterate test set and write predictions ---
output_file = "prediction_attention.tsv"
with open(test_path, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
    for line in f_in:
        parts = line.strip().split('\t')
        if len(parts) < 2:
            continue
        roman = parts[1]
        src_idxs = [roman2idx.get(c, roman2idx["<unk>"]) for c in roman]
        pred = predict(model, roman, roman2idx, idx2devanagari)
        f_out.write(f"{roman}\t{pred}\n")

print(f"Predictions written to {output_file}")



Predictions written to prediction_attention.tsv
