In [12]:
import wandb
wandb.login(key="fb4c8007ed0d1fb692b2279b11bb69081f2c698d")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Import Libraries

In [14]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import wandb
from tqdm import tqdm

In [15]:
# Dataset utilities
class TransliterationDataset(Dataset):
    def __init__(self, pairs, input_vocab, output_vocab):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        self.sos = output_vocab['<sos>']
        self.eos = output_vocab['<eos>']

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        input_ids = [self.input_vocab[c] for c in source]
        target_ids = [self.sos] + [self.output_vocab[c] for c in target] + [self.eos]
        return torch.tensor(input_ids), torch.tensor(target_ids)

In [16]:
def build_vocab(pairs):
    input_chars = set()
    output_chars = set()
    for source, target in pairs:
        input_chars.update(source)
        output_chars.update(target)
    input_vocab = {c: i + 1 for i, c in enumerate(sorted(input_chars))}
    input_vocab['<pad>'] = 0
    output_vocab = {c: i + 3 for i, c in enumerate(sorted(output_chars))}
    output_vocab.update({'<pad>': 0, '<sos>': 1, '<eos>': 2})
    return input_vocab, output_vocab

def load_pairs(path):
    df = pd.read_csv(path, sep="\t", header=None, names=["target", "source", "count"], dtype=str)
    df.dropna(subset=["source", "target"], inplace=True)
    return list(zip(df["source"], df["target"]))

def collate_fn(batch):
    inputs, targets = zip(*batch)
    input_lens = [len(seq) for seq in inputs]
    target_lens = [len(seq) for seq in targets]
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded, input_lens, target_lens

class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embed_size, padding_idx=0)
        rnn_class = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_class(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        x = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(output_size, embed_size, padding_idx=0)
        rnn_class = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_class(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_token, hidden):
        x = self.embedding(input_token.unsqueeze(1))
        output, hidden = self.rnn(x, hidden)
        output = self.fc(output.squeeze(1))
        return output, hidden

    def beam_search(self, hidden, max_len, sos_idx, eos_idx, beam_size=3):
        device = next(self.parameters()).device
        sequences = [[torch.tensor([sos_idx], device=device), hidden, 0.0]]
        completed = []

        for _ in range(max_len):
            new_sequences = []
            for seq, h, score in sequences:
                input_token = seq[-1].unsqueeze(0)
                output, new_hidden = self.forward(input_token, h)
                probs = torch.log_softmax(output, dim=-1).squeeze(0)
                topk_probs, topk_indices = probs.topk(beam_size)
                for i in range(beam_size):
                    next_token = topk_indices[i].item()
                    new_score = score + topk_probs[i].item()
                    new_seq = torch.cat([seq, torch.tensor([next_token], device=device)])
                    new_sequences.append([new_seq, new_hidden, new_score])
            sequences = sorted(new_sequences, key=lambda x: x[2], reverse=True)[:beam_size]
            completed.extend([seq for seq in sequences if seq[0][-1].item() == eos_idx])
            sequences = [seq for seq in sequences if seq[0][-1].item() != eos_idx]
            if not sequences:
                break
        completed = sorted(completed, key=lambda x: x[2], reverse=True)
        return completed[0][0] if completed else sequences[0][0]

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, src_lens, tgt=None, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        device = src.device
        hidden = self.encoder(src, src_lens)
        if tgt is not None:
            tgt_len = tgt.size(1)
            outputs = torch.zeros(batch_size, tgt_len, self.decoder.fc.out_features, device=device)
            input_token = tgt[:, 0]
            for t in range(1, tgt_len):
                output, hidden = self.decoder(input_token, hidden)
                outputs[:, t] = output
                teacher_force = torch.rand(1).item() < teacher_forcing_ratio
                input_token = tgt[:, t] if teacher_force else output.argmax(1)
            return outputs
        else:
            return [self.decoder.beam_search(hidden, max_len=20, sos_idx=1, eos_idx=2) for _ in range(batch_size)]

def accuracy(preds, targets, pad_idx=0):
    pred_tokens = preds.argmax(dim=-1)
    correct = ((pred_tokens == targets) & (targets != pad_idx)).sum().item()
    total = (targets != pad_idx).sum().item()
    return correct / total if total > 0 else 0.0

def train(model, loader, optimizer, criterion, device):
    model.train()
    total_loss, total_acc = 0, 0
    for src, tgt, src_lens, tgt_lens in tqdm(loader, desc="Training", leave=False):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, src_lens, tgt)
        loss = criterion(output[:, 1:].reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1))
        acc = accuracy(output[:, 1:], tgt[:, 1:])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_acc += acc
    return total_loss / len(loader), total_acc / len(loader)

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_acc = 0, 0
    for src, tgt, src_lens, tgt_lens in tqdm(loader, desc="Evaluating", leave=False):
        src, tgt = src.to(device), tgt.to(device)
        output = model(src, src_lens, tgt, teacher_forcing_ratio=0.0)
        loss = criterion(output[:, 1:].reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1))
        acc = accuracy(output[:, 1:], tgt[:, 1:])
        total_loss += loss.item()
        total_acc += acc
    return total_loss / len(loader), total_acc / len(loader)

def main():
    import wandb
    # Run name will be assigned after wandb.init with config
    def generate_run_name(config):
        return f"cell:{config.cell_type}_embed:{config.embed_size}_hid:{config.hidden_size}_layers:{config.num_layers}_beam:{config.beam_size}"

    # First initialize W&B run with placeholder name
    wandb.init(project="Dakshina-Translitration", config=wandb.config)
    config = wandb.config


    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    dev_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")

    input_vocab, output_vocab = build_vocab(train_pairs)
    train_dataset = TransliterationDataset(train_pairs, input_vocab, output_vocab)
    dev_dataset = TransliterationDataset(dev_pairs, input_vocab, output_vocab)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)

    encoder = Encoder(len(input_vocab), config.embed_size, config.hidden_size, config.num_layers, config.cell_type, config.dropout)
    decoder = Decoder(len(output_vocab), config.embed_size, config.hidden_size, config.num_layers, config.cell_type, config.dropout)
    model = Seq2Seq(encoder, decoder).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(10):
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device)
        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_accuracy": train_acc,
            "val_loss": val_loss,
            "val_accuracy": val_acc
        })


if __name__ == "__main__":
    sweep_config = {
        "method": "bayes",
        "metric": {"name": "val_accuracy", "goal": "maximize"},
        "parameters": {
            "embed_size": {"values": [32, 64, 128]},
            "hidden_size": {"values": [64, 128, 256]},
            "num_layers": {"values": [1,2,3]},
            "cell_type": {"values": ["RNN", "GRU", "LSTM"]},
            "dropout": {"values": [0.1,0.2, 0.3]},
            "lr": {"min": 0.0001, "max": 0.01},
            "batch_size": {"values": [16,32, 64]},
            "beam_size": {"values": [1, 3, 5]}  
        }
    }

    sweep_id = wandb.sweep(sweep_config, project="Dakshina-Translitration")
    wandb.agent(sweep_id, function=main, count=8)

Create sweep with ID: gycjj862
Sweep URL: https://wandb.ai/ma23c014-indian-institute-of-technology-madras/Dakshina-Translitration/sweeps/gycjj862


[34m[1mwandb[0m: Agent Starting Run: 8zjzrrw8 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	embed_size: 32
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.001922051055737968
[34m[1mwandb[0m: 	num_layers: 1


                                                              

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▅▆▇▇████
train_loss,█▅▄▃▂▂▁▁▁▁
val_accuracy,▁▂▄▅▇█▇▇▇█
val_loss,█▆▅▃▂▂▂▂▁▁

0,1
epoch,9.0
train_accuracy,0.42475
train_loss,2.06822
val_accuracy,0.36221
val_loss,2.3299


[34m[1mwandb[0m: Agent Starting Run: fx2h5q07 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 32
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.009732487749796489
[34m[1mwandb[0m: 	num_layers: 3


                                                              

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▁▁▁▁▁
val_accuracy,▁▄▆▆▇▇▇███
val_loss,█▅▃▃▂▂▂▂▁▁

0,1
epoch,9.0
train_accuracy,0.66975
train_loss,1.08515
val_accuracy,0.63397
val_loss,1.22073


[34m[1mwandb[0m: Agent Starting Run: cn34i4ff with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.0030315076281821774
[34m[1mwandb[0m: 	num_layers: 2


                                                              

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▄▆▇▇▇█▇██
val_loss,█▃▃▁▂▃▂▂▃▃

0,1
epoch,9.0
train_accuracy,0.85331
train_loss,0.48565
val_accuracy,0.69376
val_loss,1.14946


[34m[1mwandb[0m: Agent Starting Run: h2j00wf2 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.009383126656218993
[34m[1mwandb[0m: 	num_layers: 2


                                                              

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇▇█▇██
train_loss,█▃▂▂▂▁▁▂▁▁
val_accuracy,▁▃▄▄▄▆▇▇█▅
val_loss,█▇▆▃▅▆▃▃▁▃

0,1
epoch,9.0
train_accuracy,0.56472
train_loss,1.39958
val_accuracy,0.50193
val_loss,1.6292


[34m[1mwandb[0m: Agent Starting Run: x4ml5nq5 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.006911221668891748
[34m[1mwandb[0m: 	num_layers: 3


                                                              

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▃▄▃▄▆▇█▆▄
train_loss,█▆▄▅▄▂▂▁▃▄
val_accuracy,▂▁▁▃▃▃▇█▂▄
val_loss,▆▆█▅▅▅▁▁▄▃

0,1
epoch,9.0
train_accuracy,0.2508
train_loss,2.82917
val_accuracy,0.23053
val_loss,2.97825


[34m[1mwandb[0m: Agent Starting Run: vm2ffuwh with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.009947560970155997
[34m[1mwandb[0m: 	num_layers: 3


                                                              

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▇▇▇▇▇██▇█
train_loss,█▂▁▂▁▁▁▁▂▁
val_accuracy,▁▄▅▃▃▄█▂▅█
val_loss,▇█▄▇▃▃▃▅▄▁

0,1
epoch,9.0
train_accuracy,0.48319
train_loss,1.70215
val_accuracy,0.47558
val_loss,1.70191


[34m[1mwandb[0m: Agent Starting Run: u9f4zpds with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	embed_size: 32
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.005021942698844929
[34m[1mwandb[0m: 	num_layers: 2


                                                              

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▄▆▅█▇▆▇▇▇
val_loss,█▂▃▄▁▇▅▇▆▇

0,1
epoch,9.0
train_accuracy,0.86082
train_loss,0.4583
val_accuracy,0.69897
val_loss,1.16896


[34m[1mwandb[0m: Agent Starting Run: 4kcg1153 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.001518247469670356
[34m[1mwandb[0m: 	num_layers: 3


                                                              

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▄▇▇███▇▇▇
val_loss,▆▃▁▂▃▄▅▆▇█

0,1
epoch,9.0
train_accuracy,0.92089
train_loss,0.26191
val_accuracy,0.71718
val_loss,1.21477


## Test Data

In [17]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import csv

# ---------------- Dataset & Utils ----------------
class TransliterationDataset(Dataset):
    def __init__(self, pairs, input_vocab, output_vocab):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        self.sos = output_vocab['<sos>']
        self.eos = output_vocab['<eos>']

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        input_ids = [self.input_vocab[c] for c in source]
        target_ids = [self.sos] + [self.output_vocab[c] for c in target] + [self.eos]
        return torch.tensor(input_ids), torch.tensor(target_ids)

def load_pairs(path):
    df = pd.read_csv(path, sep='\t', header=None, names=['target', 'source', 'count'], dtype=str)
    df.dropna(subset=["source", "target"], inplace=True)
    return list(zip(df['source'], df['target']))

def build_vocab(pairs):
    input_chars = set()
    output_chars = set()
    for src, tgt in pairs:
        input_chars.update(src)
        output_chars.update(tgt)
    input_vocab = {c: i+1 for i, c in enumerate(sorted(input_chars))}
    input_vocab['<pad>'] = 0
    output_vocab = {c: i+3 for i, c in enumerate(sorted(output_chars))}
    output_vocab.update({'<pad>': 0, '<sos>': 1, '<eos>': 2})
    return input_vocab, output_vocab

def collate_fn(batch):
    inputs, targets = zip(*batch)
    input_lens = [len(x) for x in inputs]
    target_lens = [len(x) for x in targets]
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded, input_lens, target_lens

# ---------------- Models ----------------
class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embed_size, padding_idx=0)
        rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_cls(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(output_size, embed_size, padding_idx=0)
        rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_cls(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, token, hidden):
        x = self.embedding(token.unsqueeze(1))
        output, hidden = self.rnn(x, hidden)
        output = self.fc(output.squeeze(1))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, src_lens, tgt=None, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        hidden = self.encoder(src, src_lens)
        tgt_len = tgt.size(1)
        outputs = torch.zeros(batch_size, tgt_len, self.decoder.fc.out_features).to(src.device)
        input_token = tgt[:, 0]
        for t in range(1, tgt_len):
            output, hidden = self.decoder(input_token, hidden)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            input_token = tgt[:, t] if teacher_force else output.argmax(1)
        return outputs

# ---------------- Train + Eval ----------------
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for src, tgt, src_lens, _ in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, src_lens, tgt)
        loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_and_save(model, dataloader, input_vocab, output_vocab, device, csv_path=None):
    model.eval()
    inv_input_vocab = {v: k for k, v in input_vocab.items()}
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    correct = 0
    total = 0
    results = []

    with torch.no_grad():
        for src, tgt, src_lens, _ in dataloader:
            src = src.to(device)
            hidden = model.encoder(src, src_lens)
            input_token = torch.tensor([output_vocab['<sos>']] * src.size(0)).to(device)
            decoded = []
            for _ in range(20):
                output, hidden = model.decoder(input_token, hidden)
                input_token = output.argmax(1)
                decoded.append(input_token)
            decoded = torch.stack(decoded, dim=1)

            for i in range(src.size(0)):
                pred = ''.join([inv_output_vocab[t.item()] for t in decoded[i] if t.item() not in [output_vocab['<eos>'], 0]])
                truth = ''.join([inv_output_vocab[t.item()] for t in tgt[i][1:-1]])
                inp = ''.join([inv_input_vocab[t.item()] for t in src[i] if t.item() != 0])
                results.append((inp, pred, truth))
                if pred == truth:
                    correct += 1
                total += 1

    acc = correct / total * 100
    print(f"\n Test Accuracy: {acc:.2f}%")
    for inp, pred, truth in results[:10]:
        print(f"{inp:<15} | Pred: {pred:<20} | Truth: {truth}")

    if csv_path is not None:
        with open(csv_path, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Input', 'Prediction', 'GroundTruth'])
            writer.writerows(results)
        print(f"\n Predictions saved to: {csv_path}")

    return acc, results


# ---------------- Run ----------------
if __name__ == "__main__":
    config = {
        "embed_size": 128,
        "hidden_size": 256,
        "num_layers": 3,
        "cell_type": "LSTM",
        "dropout": 0.1,
        "batch_size": 32,
        "lr": 0.001518,
        "epochs": 5,
    }


    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    test_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv")
    input_vocab, output_vocab = build_vocab(train_pairs)
    train_dataset = TransliterationDataset(train_pairs, input_vocab, output_vocab)
    test_dataset = TransliterationDataset(test_pairs, input_vocab, output_vocab)

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

    encoder = Encoder(len(input_vocab), config["embed_size"], config["hidden_size"],
                      config["num_layers"], config["cell_type"], config["dropout"])
    decoder = Decoder(len(output_vocab), config["embed_size"], config["hidden_size"],
                      config["num_layers"], config["cell_type"], config["dropout"])
    model = Seq2Seq(encoder, decoder).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_acc = 0
    for epoch in range(config["epochs"]):
        train_loss = train_model(model, train_loader, optimizer, criterion, device)
        print(f"Epoch {epoch+1} Train Loss: {train_loss:.4f}")
        acc, results = evaluate_and_save(model, test_loader, input_vocab, output_vocab, device, csv_path=None)
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), "best_model.pth")

    print("\n Loading best model for final evaluation...")
    model.load_state_dict(torch.load("best_model.pth"))

    # Save predictions CSV here
    evaluate_and_save(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")

Epoch 1 Train Loss: 1.6284

 Test Accuracy: 24.19%
ank             | Pred: आंक                  | Truth: अंक
anka            | Pred: अंका                 | Truth: अंक
ankit           | Pred: अंकित                | Truth: अंकित
anakon          | Pred: अनाकों               | Truth: अंकों
ankhon          | Pred: अंखों                | Truth: अंकों
ankon           | Pred: अंकों                | Truth: अंकों
angkor          | Pred: अंगकर                | Truth: अंकोर
ankor           | Pred: अंकोर                | Truth: अंकोर
angaarak        | Pred: अंगारक               | Truth: अंगारक
angarak         | Pred: अंगरक                | Truth: अंगारक
Epoch 2 Train Loss: 0.7540

 Test Accuracy: 32.61%
ank             | Pred: आंक                  | Truth: अंक
anka            | Pred: अंका                 | Truth: अंक
ankit           | Pred: अंकित                | Truth: अंकित
anakon          | Pred: अनकों                | Truth: अंकों
ankhon          | Pred: अंखों                | Truth: अंकों
anko

#  Transformer Model

In [19]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import wandb
from tqdm import tqdm
import math
import csv
from collections import namedtuple

# ---------------- Data Processing and Utilities ----------------

class TransliterationDataset(Dataset):
    def __init__(self, pairs, input_vocab, output_vocab):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        self.sos = output_vocab['<sos>']
        self.eos = output_vocab['<eos>']
        self.unk_in = input_vocab.get('<unk>', 1)
        self.unk_out = output_vocab.get('<unk>', 3)

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        input_ids = [self.input_vocab.get(c, self.unk_in) for c in source]
        target_ids = [self.sos] + [self.output_vocab.get(c, self.unk_out) for c in target] + [self.eos]
        return torch.tensor(input_ids), torch.tensor(target_ids)

def build_vocab(pairs):
    input_chars = set()
    output_chars = set()
    for src, tgt in pairs:
        input_chars.update(src)
        output_chars.update(tgt)
    
    input_vocab = {c: i + 2 for i, c in enumerate(sorted(input_chars))}
    input_vocab['<pad>'] = 0
    input_vocab['<unk>'] = 1
    
    output_vocab = {c: i + 4 for i, c in enumerate(sorted(output_chars))}
    output_vocab.update({'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3})
    
    return input_vocab, output_vocab

def load_pairs(path):
    df = pd.read_csv(path, sep='\t', header=None, names=['target', 'source', 'count'], dtype=str)
    df.dropna(subset=["source", "target"], inplace=True)
    return list(zip(df['source'], df['target']))

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded

# ---------------- Transformer Specific Components ----------------

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers,
                 num_decoder_layers, dim_feedforward, dropout):
        super().__init__()
        
        self.d_model = d_model
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model, padding_idx=0)
        self.decoder_embedding = nn.Embedding(output_vocab_size, d_model, padding_idx=0)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        
        self.fc_out = nn.Linear(d_model, output_vocab_size)
        self.output_vocab_size = output_vocab_size
        self.sos_idx = 1
        self.eos_idx = 2

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_padding_mask=None, tgt_padding_mask=None):
        src_embedded = self.positional_encoding(self.encoder_embedding(src) * math.sqrt(self.d_model))
        tgt_embedded = self.positional_encoding(self.decoder_embedding(tgt) * math.sqrt(self.d_model))
        
        transformer_out = self.transformer(
            src_embedded, tgt_embedded,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask
        )
        
        output = self.fc_out(transformer_out)
        return output

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def create_padding_mask(self, seq, pad_idx=0):
        return (seq == pad_idx)

# ---------------- Training and Evaluation Functions ----------------

def accuracy(preds, targets, pad_idx=0):
    pred_tokens = preds.argmax(dim=-1)
    correct = ((pred_tokens == targets) & (targets != pad_idx)).sum().item()
    total = (targets != pad_idx).sum().item()
    return correct / total if total > 0 else 0.0

@torch.no_grad()
def evaluate_word_accuracy(model, dataloader, device, output_vocab):
    model.eval()
    correct_words = 0
    total_words = 0
    
    for src, tgt in tqdm(dataloader, desc="Evaluating"):
        src, tgt = src.to(device), tgt.to(device)

        src_padding_mask = model.create_padding_mask(src).to(device)
        batch_size = src.size(0)
        max_len = 20
        
        # Inference loop for the decoder
        generated_tokens = torch.full((batch_size, 1), model.sos_idx, dtype=torch.long, device=device)
        
        for t in range(max_len):
            tgt_mask = model.generate_square_subsequent_mask(generated_tokens.size(1)).to(device)
            tgt_padding_mask = model.create_padding_mask(generated_tokens).to(device)
            
            output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
            
            next_token = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
            generated_tokens = torch.cat([generated_tokens, next_token], dim=1)
            
            if (next_token == model.eos_idx).all():
                break

        for i in range(batch_size):
            pred_seq = generated_tokens[i]
            target_seq = tgt[i]
            
            pred_end = (pred_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            target_end = (target_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            
            pred_word = pred_seq[1:pred_end[0] if pred_end.numel() > 0 else len(pred_seq)]
            target_word = target_seq[1:target_end[0] if target_end.numel() > 0 else len(target_seq)]

            if torch.equal(pred_word, target_word):
                correct_words += 1
            total_words += 1
            
    return correct_words / total_words if total_words > 0 else 0.0

def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss, total_acc = 0, 0
    for src, tgt in tqdm(loader, desc="Training", leave=False):
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()
        
        src_padding_mask = model.create_padding_mask(src).to(device)
        tgt_padding_mask = model.create_padding_mask(tgt).to(device)
        
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        
        output = model(src, tgt_input, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_mask=tgt_mask)
        
        loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
        acc = accuracy(output, tgt_output)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_acc += acc
        
    return total_loss / len(loader), total_acc / len(loader)

# ---------------- Main Function for W&B Sweep ----------------

def main():
    import wandb
    
    def generate_run_name(config):
        return f"transformer_d:{config.d_model}_nhead:{config.nhead}_layers:{config.num_encoder_layers}"

    wandb.init(project="Dakshina-Translitration-Transformer", config=wandb.config)
    config = wandb.config
    wandb.run.name = generate_run_name(config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    dev_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")

    # Build vocab on train + dev pairs for consistency
    input_vocab, output_vocab = build_vocab(train_pairs + dev_pairs)
    train_dataset = TransliterationDataset(train_pairs, input_vocab, output_vocab)
    dev_dataset = TransliterationDataset(dev_pairs, input_vocab, output_vocab)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)

    model = TransformerModel(
        input_vocab_size=len(input_vocab),
        output_vocab_size=len(output_vocab),
        d_model=config.d_model,
        nhead=config.nhead,
        num_encoder_layers=config.num_encoder_layers,
        num_decoder_layers=config.num_decoder_layers,
        dim_feedforward=config.dim_feedforward,
        dropout=config.dropout
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, betas=(0.9, 0.98), eps=1e-9)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_dev_acc = 0
    for epoch in range(10):
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
        dev_acc = evaluate_word_accuracy(model, dev_loader, device, output_vocab)
        
        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            torch.save(model.state_dict(), 'best_transformer_model.pth')
            print(f"Epoch {epoch+1} - New best model saved with dev word accuracy: {best_dev_acc:.4f}")

        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_accuracy": train_acc,
            "dev_word_accuracy": dev_acc
        })

if __name__ == "__main__":
    sweep_config = {
        "method": "bayes",
        "metric": {"name": "dev_word_accuracy", "goal": "maximize"},
        "parameters": {
            "d_model": {"values": [128, 256, 512]},
            "nhead": {"values": [4, 8, 16]},
            "num_encoder_layers": {"values": [2, 4]},
            "num_decoder_layers": {"values": [2, 4]},
            "dim_feedforward": {"values": [512, 1024, 2048]},
            "dropout": {"values": [0.1, 0.2, 0.3]},
            "lr": {"min": 0.0001, "max": 0.001},
            "batch_size": {"values": [16, 32, 64]}
        }
    }
    
    sweep_id = wandb.sweep(sweep_config, project="Dakshina-Translitration-Transformer")
    wandb.agent(sweep_id, function=main, count=8)

Create sweep with ID: f7bfyaft
Sweep URL: https://wandb.ai/ma23c014-indian-institute-of-technology-madras/Dakshina-Translitration-Transformer/sweeps/f7bfyaft


[34m[1mwandb[0m: Agent Starting Run: drxj6rvz with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 256
[34m[1mwandb[0m: 	dim_feedforward: 2048
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	lr: 0.0006462285506752879
[34m[1mwandb[0m: 	nhead: 16
[34m[1mwandb[0m: 	num_decoder_layers: 4
[34m[1mwandb[0m: 	num_encoder_layers: 2


                                                 

[34m[1mwandb[0m: [32m[41mERROR[0m Run drxj6rvz errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 302, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/2098532379.py", line 248, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
[34m[1mwandb[0m: [32m[41mERROR[0m                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/2098532379.py", line 194, in train_epoch
[34m[1mwandb[0m: [32m[41mERROR[0m     output = model(src, tgt_input, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_mask=tgt_mask)
[34m[1mwandb[0m: [32m[41mERR

                                                 

[34m[1mwandb[0m: [32m[41mERROR[0m Run qzdp52xb errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 302, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/2098532379.py", line 248, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
[34m[1mwandb[0m: [32m[41mERROR[0m                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/2098532379.py", line 194, in train_epoch
[34m[1mwandb[0m: [32m[41mERROR[0m     output = model(src, tgt_input, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_mask=tgt_mask)
[34m[1mwandb[0m: [32m[41mERR

                                                  

[34m[1mwandb[0m: [32m[41mERROR[0m Run pc59y6ge errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 302, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/2098532379.py", line 248, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
[34m[1mwandb[0m: [32m[41mERROR[0m                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/2098532379.py", line 194, in train_epoch
[34m[1mwandb[0m: [32m[41mERROR[0m     output = model(src, tgt_input, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_mask=tgt_mask)
[34m[1mwandb[0m: [32m[41mERR

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import wandb
from tqdm import tqdm
import math
import csv
from collections import namedtuple

# ---------------- Data Processing and Utilities ----------------

class TransliterationDataset(Dataset):
    def __init__(self, pairs, input_vocab, output_vocab):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        self.sos = output_vocab['<sos>']
        self.eos = output_vocab['<eos>']
        # Robustly get unk indices, with fallbacks
        self.unk_in = input_vocab.get('<unk>', 1) 
        self.unk_out = output_vocab.get('<unk>', 3)

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        # Use .get() with unk_in/unk_out for handling unseen characters
        input_ids = [self.input_vocab.get(c, self.unk_in) for c in source]
        target_ids = [self.sos] + [self.output_vocab.get(c, self.unk_out) for c in target] + [self.eos]
        return torch.tensor(input_ids), torch.tensor(target_ids)

def build_vocab(pairs):
    input_chars = set()
    output_chars = set()
    for src, tgt in pairs:
        input_chars.update(src)
        output_chars.update(tgt)
    
    # Vocab indexing: <pad>:0, <unk>:1, then sorted chars
    input_vocab = {c: i + 2 for i, c in enumerate(sorted(input_chars))}
    input_vocab['<pad>'] = 0
    input_vocab['<unk>'] = 1
    
    # Vocab indexing: <pad>:0, <sos>:1, <eos>:2, <unk>:3, then sorted chars
    output_vocab = {c: i + 4 for i, c in enumerate(sorted(output_chars))}
    output_vocab.update({'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3})
    
    return input_vocab, output_vocab

def load_pairs(path):
    # Ensure the path is correct for your environment (e.g., Kaggle, local, Colab)
    # Common issue: FileNotFoundError if path is wrong.
    df = pd.read_csv(path, sep='\t', header=None, names=['target', 'source', 'count'], dtype=str)
    df.dropna(subset=["source", "target"], inplace=True)
    return list(zip(df['source'], df['target']))

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded

# ---------------- Transformer Specific Components ----------------

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0) # Add batch dimension
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Add positional encoding to input embeddings
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers,
                 num_decoder_layers, dim_feedforward, dropout):
        super().__init__()
        
        self.d_model = d_model
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model, padding_idx=0)
        self.decoder_embedding = nn.Embedding(output_vocab_size, d_model, padding_idx=0)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True # Important: Use batch_first for convenience
        )
        
        self.fc_out = nn.Linear(d_model, output_vocab_size)
        self.output_vocab_size = output_vocab_size
        self.sos_idx = 1
        self.eos_idx = 2

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_padding_mask=None, tgt_padding_mask=None):
        # Embed and add positional encoding
        src_embedded = self.positional_encoding(self.encoder_embedding(src) * math.sqrt(self.d_model))
        tgt_embedded = self.positional_encoding(self.decoder_embedding(tgt) * math.sqrt(self.d_model))
        
        # Pass through Transformer layers
        transformer_out = self.transformer(
            src_embedded, tgt_embedded,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask
        )
        
        # Linear layer to get vocabulary logits
        output = self.fc_out(transformer_out)
        return output

    def generate_square_subsequent_mask(self, sz):
        # Generates a mask to prevent attention to future tokens in the decoder
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def create_padding_mask(self, seq, pad_idx=0):
        # Generates a boolean mask for padding tokens
        return (seq == pad_idx)

# ---------------- Training and Evaluation Functions ----------------

def accuracy(preds, targets, pad_idx=0):
    # Calculates character-level accuracy, ignoring padding
    pred_tokens = preds.argmax(dim=-1)
    correct = ((pred_tokens == targets) & (targets != pad_idx)).sum().item()
    total = (targets != pad_idx).sum().item()
    return correct / total if total > 0 else 0.0

@torch.no_grad()
def evaluate_word_accuracy(model, dataloader, device, output_vocab):
    model.eval()
    correct_words = 0
    total_words = 0
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    
    for src, tgt in tqdm(dataloader, desc="Evaluating", leave=False):
        src, tgt = src.to(device), tgt.to(device)

        src_padding_mask = model.create_padding_mask(src).to(device)
        batch_size = src.size(0)
        max_len = 20 # Max length for generated output (could be dynamically set based on input length if needed)
        
        # Initialize decoder input with <sos> tokens for greedy decoding
        generated_tokens = torch.full((batch_size, 1), model.sos_idx, dtype=torch.long, device=device)
        
        for t in range(max_len):
            # Create masks for the current generated sequence length
            tgt_mask = model.generate_square_subsequent_mask(generated_tokens.size(1)).to(device)
            tgt_padding_mask = model.create_padding_mask(generated_tokens).to(device)
            
            # Forward pass to get next token predictions
            output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
            
            # Get the token with the highest probability
            next_token = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
            
            # Append the predicted token to the generated sequence
            generated_tokens = torch.cat([generated_tokens, next_token], dim=1)
            
            # Stop if all sequences in the batch have generated the <eos> token
            if (next_token == model.eos_idx).all():
                break

        # Calculate word-level accuracy
        for i in range(batch_size):
            pred_seq = generated_tokens[i]
            target_seq = tgt[i]
            
            # Find the first <eos> token to trim the sequence (excluding <sos> and <eos> itself)
            pred_end = (pred_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            target_end = (target_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            
            # Extract the actual word tokens, excluding <sos> and <eos>
            pred_word = pred_seq[1:pred_end[0] if pred_end.numel() > 0 else len(pred_seq)]
            target_word = target_seq[1:target_end[0] if target_end.numel() > 0 else len(target_seq)]

            if torch.equal(pred_word, target_word):
                correct_words += 1
            total_words += 1
            
    return correct_words / total_words if total_words > 0 else 0.0

def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss, total_char_acc = 0, 0
    for src, tgt in tqdm(loader, desc="Training", leave=False):
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()
        
        src_padding_mask = model.create_padding_mask(src).to(device)
        
        tgt_input = tgt[:, :-1] # Input for decoder, excludes the last token
        tgt_output = tgt[:, 1:]  # Target for loss, excludes the first token (<sos>)

        # ************ CRITICAL FIX ************
        # Create tgt_padding_mask from tgt_input to match its length
        tgt_padding_mask = model.create_padding_mask(tgt_input).to(device)
       
        
        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        
        output = model(src, tgt_input, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_mask=tgt_mask)
        
        # Reshape output and target for CrossEntropyLoss
        loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
        char_acc = accuracy(output, tgt_output) # Character-level accuracy
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_char_acc += char_acc
        
    return total_loss / len(loader), total_char_acc / len(loader)

def generate_predictions_csv(model, dataloader, input_vocab, output_vocab, device, csv_path):
    model.eval()
    inv_input_vocab = {v: k for k, v in input_vocab.items()}
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    results = []

    with torch.no_grad():
        for src, tgt in tqdm(dataloader, desc="Generating Test Predictions"):
            src = src.to(device)
            batch_size = src.size(0)
            max_len = 20 # Max length for generated output

            # Inference loop for the decoder (similar to evaluate_word_accuracy)
            generated_tokens = torch.full((batch_size, 1), model.sos_idx, dtype=torch.long, device=device)
            
            for t in range(max_len):
                tgt_mask = model.generate_square_subsequent_mask(generated_tokens.size(1)).to(device)
                tgt_padding_mask = model.create_padding_mask(generated_tokens).to(device)
                
                output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
                
                next_token = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
                generated_tokens = torch.cat([generated_tokens, next_token], dim=1)
                
                if (next_token == model.eos_idx).all():
                    break

            for i in range(batch_size):
                pred_seq = generated_tokens[i]
                target_seq = tgt[i]
                
                pred_end = (pred_seq == model.eos_idx).nonzero(as_tuple=True)[0]
                target_end = (target_seq == model.eos_idx).nonzero(as_tuple=True)[0]
                
                pred_word_tokens = pred_seq[1:pred_end[0] if pred_end.numel() > 0 else len(pred_seq)]
                # Ensure truth_word_tokens also excludes any potential padding if it's shorter than predicted length
                truth_word_tokens = target_seq[1:target_end[0] if target_end.numel() > 0 else len(target_seq)]

                pred_str = ''.join([inv_output_vocab[t.item()] for t in pred_word_tokens if t.item() not in [model.sos_idx, model.eos_idx, 0]])
                truth_str = ''.join([inv_output_vocab[t.item()] for t in truth_word_tokens if t.item() not in [model.sos_idx, model.eos_idx, 0]])
                inp_str = ''.join([inv_input_vocab[t.item()] for t in src[i] if t.item() != 0])
                results.append((inp_str, pred_str, truth_str))
    
    with open(csv_path, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Input', 'Prediction', 'GroundTruth'])
        writer.writerows(results)
    print(f"\nPredictions saved to: {csv_path}")

# ---------------- Main Function for W&B Sweep ----------------

def main():
    import wandb
    
    def generate_run_name(config):
        return f"transformer_d:{config.d_model}_nhead:{config.nhead}_layers:{config.num_encoder_layers}"

    wandb.init(project="Dakshina-Translitration-Transformer", config=wandb.config)
    config = wandb.config
    wandb.run.name = generate_run_name(config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    dev_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")
    test_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv")
    # *******************************************************************

    # Build vocab on train + dev pairs for consistency
    input_vocab, output_vocab = build_vocab(train_pairs + dev_pairs)
    train_dataset = TransliterationDataset(train_pairs, input_vocab, output_vocab)
    dev_dataset = TransliterationDataset(dev_pairs, input_vocab, output_vocab)
    test_dataset = TransliterationDataset(test_pairs, input_vocab, output_vocab) # Prepare test dataset here

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn) # Batch size 1 for individual prediction

    model = TransformerModel(
        input_vocab_size=len(input_vocab),
        output_vocab_size=len(output_vocab),
        d_model=config.d_model,
        nhead=config.nhead,
        num_encoder_layers=config.num_encoder_layers,
        num_decoder_layers=config.num_decoder_layers,
        dim_feedforward=config.dim_feedforward,
        dropout=config.dropout
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, betas=(0.9, 0.98), eps=1e-9)
    criterion = nn.CrossEntropyLoss(ignore_index=0) # ignore_index=0 for <pad> token

    best_dev_acc = 0
    # Training loop
    for epoch in range(10): # Looping for 10 epochs
        train_loss, train_char_acc = train_epoch(model, train_loader, optimizer, criterion, device)
        dev_word_acc = evaluate_word_accuracy(model, dev_loader, device, output_vocab)
        
        print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Train Char Acc: {train_char_acc:.4f} | Dev Word Acc: {dev_word_acc:.4f}")
        
        if dev_word_acc > best_dev_acc:
            best_dev_acc = dev_word_acc
            torch.save(model.state_dict(), 'best_transformer_model.pth')
            print(f" -> New best model saved with dev word accuracy: {best_dev_acc:.4f}")

        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_char_accuracy": train_char_acc,
            "dev_word_accuracy": dev_word_acc
        })

    print("\nTraining complete. Loading best model for final evaluation on test set...")
    # Load the best model found during training
    try:
        model.load_state_dict(torch.load('best_transformer_model.pth'))
    except FileNotFoundError:
        print("Error: 'best_transformer_model.pth' not found. Ensure training completed successfully and model was saved.")
        return # Exit main if model not found

    # Final evaluation on the test set (using the best saved model)
    final_test_word_acc = evaluate_word_accuracy(model, test_loader, device, output_vocab)
    print(f"\n--- Final Test Set Evaluation Results ---")
    print(f"Word-level Accuracy on Test Set: {final_test_word_acc:.4f}")
    
    # Generate and save predictions to CSV using the best model
    generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")
    print("Test predictions saved to test_predictions.csv")


if __name__ == "__main__":
    # Define your W&B sweep configuration
    sweep_config = {
        "method": "bayes", # Bayesian optimization
        "metric": {"name": "dev_word_accuracy", "goal": "maximize"},
        "parameters": {
            "d_model": {"values": [128, 256, 512]},
            "nhead": {"values": [4, 8, 16]},
            "num_encoder_layers": {"values": [2, 4]},
            "num_decoder_layers": {"values": [2, 4]},
            "dim_feedforward": {"values": [512, 1024, 2048]},
            "dropout": {"values": [0.1, 0.2, 0.3]},
            "lr": {"min": 0.0001, "max": 0.001},
            "batch_size": {"values": [16, 32, 64]}
        }
    }
    
    # Initialize and run the W&B agent
    sweep_id = wandb.sweep(sweep_config, project="Dakshina-Translitration-Transformer")
    wandb.agent(sweep_id, function=main, count=8) 

Create sweep with ID: ilxyaevh
Sweep URL: https://wandb.ai/ma23c014-indian-institute-of-technology-madras/Dakshina-Translitration-Transformer/sweeps/ilxyaevh


[34m[1mwandb[0m: Agent Starting Run: cxwjmo2x with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 128
[34m[1mwandb[0m: 	dim_feedforward: 2048
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	lr: 0.0006925786628377431
[34m[1mwandb[0m: 	nhead: 8
[34m[1mwandb[0m: 	num_decoder_layers: 2
[34m[1mwandb[0m: 	num_encoder_layers: 4


  output = torch._nested_tensor_from_mask(
                                                           

Epoch 1 | Train Loss: 1.7770 | Train Char Acc: 0.4555 | Dev Word Acc: 0.0016
 -> New best model saved with dev word accuracy: 0.0016


                                                           

Epoch 2 | Train Loss: 1.3094 | Train Char Acc: 0.5634 | Dev Word Acc: 0.0092
 -> New best model saved with dev word accuracy: 0.0092


                                                           

Epoch 3 | Train Loss: 1.1540 | Train Char Acc: 0.6119 | Dev Word Acc: 0.0190
 -> New best model saved with dev word accuracy: 0.0190


                                                           

Epoch 4 | Train Loss: 1.0516 | Train Char Acc: 0.6457 | Dev Word Acc: 0.0294
 -> New best model saved with dev word accuracy: 0.0294


                                                           

Epoch 5 | Train Loss: 0.9663 | Train Char Acc: 0.6739 | Dev Word Acc: 0.0399
 -> New best model saved with dev word accuracy: 0.0399


                                                           

Epoch 6 | Train Loss: 0.8824 | Train Char Acc: 0.7017 | Dev Word Acc: 0.0441
 -> New best model saved with dev word accuracy: 0.0441


                                                           

Epoch 7 | Train Loss: 0.8021 | Train Char Acc: 0.7289 | Dev Word Acc: 0.0624
 -> New best model saved with dev word accuracy: 0.0624


                                                           

Epoch 8 | Train Loss: 0.7341 | Train Char Acc: 0.7507 | Dev Word Acc: 0.0665
 -> New best model saved with dev word accuracy: 0.0665


                                                           

Epoch 9 | Train Loss: 0.6814 | Train Char Acc: 0.7693 | Dev Word Acc: 0.0909
 -> New best model saved with dev word accuracy: 0.0909


                                                           

Epoch 10 | Train Loss: 0.6320 | Train Char Acc: 0.7855 | Dev Word Acc: 0.0808

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.0760


Generating Test Predictions:   0%|          | 0/4502 [00:00<?, ?it/s]


0,1
dev_word_accuracy,▁▂▂▃▄▄▆▆█▇
epoch,▁▂▃▃▄▅▆▆▇█
train_char_accuracy,▁▃▄▅▆▆▇▇██
train_loss,█▅▄▄▃▃▂▂▁▁

0,1
dev_word_accuracy,0.08077
epoch,9.0
train_char_accuracy,0.78549
train_loss,0.63196


[34m[1mwandb[0m: [32m[41mERROR[0m Run cxwjmo2x errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 302, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/3770549514.py", line 361, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/3770549514.py", line 254, in generate_predictions_csv
[34m[1mwandb[0m: [32m[41mERROR[0m     output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
[34m[1mwandb[0m: [32m[41mERROR[0m                                                            ^^^^^^^^^^^^^^^^
[

                                                             

Epoch 1 | Train Loss: 2.0899 | Train Char Acc: 0.3747 | Dev Word Acc: 0.0083
 -> New best model saved with dev word accuracy: 0.0083


                                                             

Epoch 2 | Train Loss: 1.6927 | Train Char Acc: 0.4573 | Dev Word Acc: 0.0108
 -> New best model saved with dev word accuracy: 0.0108


                                                             

Epoch 3 | Train Loss: 1.5802 | Train Char Acc: 0.4862 | Dev Word Acc: 0.0147
 -> New best model saved with dev word accuracy: 0.0147


                                                             

Epoch 4 | Train Loss: 1.5191 | Train Char Acc: 0.5036 | Dev Word Acc: 0.0145


                                                             

Epoch 5 | Train Loss: 1.4771 | Train Char Acc: 0.5147 | Dev Word Acc: 0.0099


                                                             

Epoch 6 | Train Loss: 1.4439 | Train Char Acc: 0.5251 | Dev Word Acc: 0.0122


                                                             

Epoch 7 | Train Loss: 1.4197 | Train Char Acc: 0.5312 | Dev Word Acc: 0.0172
 -> New best model saved with dev word accuracy: 0.0172


                                                             

Epoch 8 | Train Loss: 1.4027 | Train Char Acc: 0.5381 | Dev Word Acc: 0.0172


                                                             

Epoch 9 | Train Loss: 1.3864 | Train Char Acc: 0.5433 | Dev Word Acc: 0.0177
 -> New best model saved with dev word accuracy: 0.0177


                                                             

Epoch 10 | Train Loss: 1.3726 | Train Char Acc: 0.5494 | Dev Word Acc: 0.0170

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.0284


Generating Test Predictions:   0%|          | 0/4502 [00:00<?, ?it/s]


0,1
dev_word_accuracy,▁▃▆▆▂▄███▇
epoch,▁▂▃▃▄▅▆▆▇█
train_char_accuracy,▁▄▅▆▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁

0,1
dev_word_accuracy,0.01698
epoch,9.0
train_char_accuracy,0.5494
train_loss,1.37263


[34m[1mwandb[0m: [32m[41mERROR[0m Run cas7wbb8 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 302, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/3770549514.py", line 361, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/3770549514.py", line 254, in generate_predictions_csv
[34m[1mwandb[0m: [32m[41mERROR[0m     output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
[34m[1mwandb[0m: [32m[41mERROR[0m                                                            ^^^^^^^^^^^^^^^^
[

                                                           

Epoch 1 | Train Loss: 1.9912 | Train Char Acc: 0.4046 | Dev Word Acc: 0.0011
 -> New best model saved with dev word accuracy: 0.0011


                                                           

Epoch 2 | Train Loss: 1.4863 | Train Char Acc: 0.5148 | Dev Word Acc: 0.0034
 -> New best model saved with dev word accuracy: 0.0034


                                                           

Epoch 3 | Train Loss: 1.3464 | Train Char Acc: 0.5530 | Dev Word Acc: 0.0053
 -> New best model saved with dev word accuracy: 0.0053


                                                           

Epoch 4 | Train Loss: 1.2606 | Train Char Acc: 0.5787 | Dev Word Acc: 0.0064
 -> New best model saved with dev word accuracy: 0.0064


                                                           

Epoch 5 | Train Loss: 1.1978 | Train Char Acc: 0.5976 | Dev Word Acc: 0.0050


                                                           

Epoch 6 | Train Loss: 1.1471 | Train Char Acc: 0.6131 | Dev Word Acc: 0.0103
 -> New best model saved with dev word accuracy: 0.0103


                                                           

Epoch 7 | Train Loss: 1.1096 | Train Char Acc: 0.6248 | Dev Word Acc: 0.0147
 -> New best model saved with dev word accuracy: 0.0147


                                                           

Epoch 8 | Train Loss: 1.0693 | Train Char Acc: 0.6387 | Dev Word Acc: 0.0172
 -> New best model saved with dev word accuracy: 0.0172


                                                           

Epoch 9 | Train Loss: 1.0282 | Train Char Acc: 0.6527 | Dev Word Acc: 0.0154


                                                           

Epoch 10 | Train Loss: 0.9830 | Train Char Acc: 0.6682 | Dev Word Acc: 0.0259
 -> New best model saved with dev word accuracy: 0.0259

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.0213


Generating Test Predictions:   0%|          | 0/4502 [00:00<?, ?it/s]


0,1
dev_word_accuracy,▁▂▂▂▂▄▅▆▅█
epoch,▁▂▃▃▄▅▆▆▇█
train_char_accuracy,▁▄▅▆▆▇▇▇██
train_loss,█▄▄▃▂▂▂▂▁▁

0,1
dev_word_accuracy,0.02593
epoch,9.0
train_char_accuracy,0.6682
train_loss,0.98304


[34m[1mwandb[0m: [32m[41mERROR[0m Run s9983zfs errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 302, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/3770549514.py", line 361, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_36/3770549514.py", line 254, in generate_predictions_csv
[34m[1mwandb[0m: [32m[41mERROR[0m     output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
[34m[1mwandb[0m: [32m[41mERROR[0m                                                            ^^^^^^^^^^^^^^^^
[

                                                             

Epoch 1 | Train Loss: 1.9718 | Train Char Acc: 0.3977 | Dev Word Acc: 0.0085
 -> New best model saved with dev word accuracy: 0.0085


                                                             

Epoch 2 | Train Loss: 1.5602 | Train Char Acc: 0.4907 | Dev Word Acc: 0.0133
 -> New best model saved with dev word accuracy: 0.0133


                                                             

Epoch 3 | Train Loss: 1.4514 | Train Char Acc: 0.5214 | Dev Word Acc: 0.0138
 -> New best model saved with dev word accuracy: 0.0138


Training:  28%|██▊       | 780/2763 [00:17<00:44, 44.87it/s]

## For Test data

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import pandas as pd
import csv
from collections import namedtuple
import math

# --- Copy all classes and utility functions from the training script ---
# This includes:
# - TransliterationDataset, build_vocab, load_pairs, collate_fn
# - PositionalEncoding, TransformerModel

# ----------------- (Paste all the above classes and functions here) -----------------
# For brevity, I'm not duplicating them, but you should copy them exactly as they are in the training code.
# The TransformerModel class must be exactly the same as the one used for training.

# ----------------- Evaluation Function for Final Test Set -----------------

@torch.no_grad()
def evaluate_test_set(model, test_loader, device, output_vocab):
    model.eval()
    correct_words = 0
    total_words = 0
    results = []
    inv_input_vocab = {v: k for k, v in output_vocab.items()}
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    
    for src, tgt in tqdm(test_loader, desc="Evaluating Test Set"):
        src, tgt = src.to(device), tgt.to(device)

        src_padding_mask = model.create_padding_mask(src).to(device)
        batch_size = src.size(0)
        max_len = 20
        
        generated_tokens = torch.full((batch_size, 1), model.sos_idx, dtype=torch.long, device=device)
        
        for t in range(max_len):
            tgt_mask = model.generate_square_subsequent_mask(generated_tokens.size(1)).to(device)
            tgt_padding_mask = model.create_padding_mask(generated_tokens).to(device)
            
            output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
            
            next_token = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
            generated_tokens = torch.cat([generated_tokens, next_token], dim=1)
            
            if (next_token == model.eos_idx).all():
                break

        for i in range(batch_size):
            pred_seq = generated_tokens[i]
            target_seq = tgt[i]
            
            pred_end = (pred_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            target_end = (target_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            
            pred_word = pred_seq[1:pred_end[0] if pred_end.numel() > 0 else len(pred_seq)]
            target_word = target_seq[1:target_end[0] if target_end.numel() > 0 else len(target_seq)]

            pred_str = ''.join([inv_output_vocab[t.item()] for t in pred_word])
            truth_str = ''.join([inv_output_vocab[t.item()] for t in target_word])
            inp_str = ''.join([inv_input_vocab[t.item()] for t in src[i] if t.item() != 0])
            results.append((inp_str, pred_str, truth_str))

            if torch.equal(pred_word, target_word):
                correct_words += 1
            total_words += 1
            
    acc = correct_words / total_words if total_words > 0 else 0.0
    
    # Save predictions to CSV
    with open("test_predictions.csv", mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Input', 'Prediction', 'GroundTruth'])
        writer.writerows(results)
    
    return acc

if __name__ == "__0": # Renamed to avoid running accidentally
    # --- Load Data and Vocabularies ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_pairs = load_pairs("/kaggle/input/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    dev_pairs = load_pairs("/kaggle/input/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")
    test_pairs = load_pairs("/kaggle/input/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv")

    input_vocab, output_vocab = build_vocab(train_pairs + dev_pairs)
    test_dataset = TransliterationDataset(test_pairs, input_vocab, output_vocab)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    # --- Define Model and Load Best Weights ---
    config = namedtuple('Config', ['d_model', 'nhead', 'num_encoder_layers', 'num_decoder_layers', 'dim_feedforward', 'dropout'])
    # Replace these values with the best ones from your W&B sweep
    best_config = config(d_model=256, nhead=8, num_encoder_layers=4, num_decoder_layers=4, dim_feedforward=1024, dropout=0.1)

    model = TransformerModel(
        input_vocab_size=len(input_vocab),
        output_vocab_size=len(output_vocab),
        d_model=best_config.d_model,
        nhead=best_config.nhead,
        num_encoder_layers=best_config.num_encoder_layers,
        num_decoder_layers=best_config.num_decoder_layers,
        dim_feedforward=best_config.dim_feedforward,
        dropout=best_config.dropout
    ).to(device)

    try:
        model.load_state_dict(torch.load('best_transformer_model.pth'))
        print("Successfully loaded the trained model.")
    except FileNotFoundError:
        print("Model file 'best_transformer_model.pth' not found. Please train a model and save it first.")
        exit()

    # --- Run Final Evaluation ---
    final_test_acc = evaluate_test_set(model, test_loader, device, output_vocab)
    print("\n--- Final Test Set Evaluation Results ---")
    print(f"Word-level Accuracy: {final_test_acc:.4f}")
    print("Predictions saved to test_predictions.csv")