In [1]:
import wandb
wandb.login(key="fb4c8007ed0d1fb692b2279b11bb69081f2c698d")

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mma23c014[0m ([33mma23c014-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Import Libraries

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import wandb
from tqdm import tqdm

In [4]:
# Dataset utilities
class TransliterationDataset(Dataset):
    def __init__(self, pairs, input_vocab, output_vocab):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        self.sos = output_vocab['<sos>']
        self.eos = output_vocab['<eos>']

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        input_ids = [self.input_vocab[c] for c in source]
        target_ids = [self.sos] + [self.output_vocab[c] for c in target] + [self.eos]
        return torch.tensor(input_ids), torch.tensor(target_ids)

In [4]:
def build_vocab(pairs):
    input_chars = set()
    output_chars = set()
    for source, target in pairs:
        input_chars.update(source)
        output_chars.update(target)
    input_vocab = {c: i + 1 for i, c in enumerate(sorted(input_chars))}
    input_vocab['<pad>'] = 0
    output_vocab = {c: i + 3 for i, c in enumerate(sorted(output_chars))}
    output_vocab.update({'<pad>': 0, '<sos>': 1, '<eos>': 2})
    return input_vocab, output_vocab

def load_pairs(path):
    df = pd.read_csv(path, sep="\t", header=None, names=["target", "source", "count"], dtype=str)
    df.dropna(subset=["source", "target"], inplace=True)
    return list(zip(df["source"], df["target"]))

def collate_fn(batch):
    inputs, targets = zip(*batch)
    input_lens = [len(seq) for seq in inputs]
    target_lens = [len(seq) for seq in targets]
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded, input_lens, target_lens

class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embed_size, padding_idx=0)
        rnn_class = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_class(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        x = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(output_size, embed_size, padding_idx=0)
        rnn_class = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_class(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_token, hidden):
        x = self.embedding(input_token.unsqueeze(1))
        output, hidden = self.rnn(x, hidden)
        output = self.fc(output.squeeze(1))
        return output, hidden

    def beam_search(self, hidden, max_len, sos_idx, eos_idx, beam_size=3):
        device = next(self.parameters()).device
        sequences = [[torch.tensor([sos_idx], device=device), hidden, 0.0]]
        completed = []

        for _ in range(max_len):
            new_sequences = []
            for seq, h, score in sequences:
                input_token = seq[-1].unsqueeze(0)
                output, new_hidden = self.forward(input_token, h)
                probs = torch.log_softmax(output, dim=-1).squeeze(0)
                topk_probs, topk_indices = probs.topk(beam_size)
                for i in range(beam_size):
                    next_token = topk_indices[i].item()
                    new_score = score + topk_probs[i].item()
                    new_seq = torch.cat([seq, torch.tensor([next_token], device=device)])
                    new_sequences.append([new_seq, new_hidden, new_score])
            sequences = sorted(new_sequences, key=lambda x: x[2], reverse=True)[:beam_size]
            completed.extend([seq for seq in sequences if seq[0][-1].item() == eos_idx])
            sequences = [seq for seq in sequences if seq[0][-1].item() != eos_idx]
            if not sequences:
                break
        completed = sorted(completed, key=lambda x: x[2], reverse=True)
        return completed[0][0] if completed else sequences[0][0]

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, src_lens, tgt=None, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        device = src.device
        hidden = self.encoder(src, src_lens)
        if tgt is not None:
            tgt_len = tgt.size(1)
            outputs = torch.zeros(batch_size, tgt_len, self.decoder.fc.out_features, device=device)
            input_token = tgt[:, 0]
            for t in range(1, tgt_len):
                output, hidden = self.decoder(input_token, hidden)
                outputs[:, t] = output
                teacher_force = torch.rand(1).item() < teacher_forcing_ratio
                input_token = tgt[:, t] if teacher_force else output.argmax(1)
            return outputs
        else:
            return [self.decoder.beam_search(hidden, max_len=20, sos_idx=1, eos_idx=2) for _ in range(batch_size)]

def accuracy(preds, targets, pad_idx=0):
    pred_tokens = preds.argmax(dim=-1)
    correct = ((pred_tokens == targets) & (targets != pad_idx)).sum().item()
    total = (targets != pad_idx).sum().item()
    return correct / total if total > 0 else 0.0

def train(model, loader, optimizer, criterion, device):
    model.train()
    total_loss, total_acc = 0, 0
    for src, tgt, src_lens, tgt_lens in tqdm(loader, desc="Training", leave=False):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, src_lens, tgt)
        loss = criterion(output[:, 1:].reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1))
        acc = accuracy(output[:, 1:], tgt[:, 1:])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_acc += acc
    return total_loss / len(loader), total_acc / len(loader)

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_acc = 0, 0
    for src, tgt, src_lens, tgt_lens in tqdm(loader, desc="Evaluating", leave=False):
        src, tgt = src.to(device), tgt.to(device)
        output = model(src, src_lens, tgt, teacher_forcing_ratio=0.0)
        loss = criterion(output[:, 1:].reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1))
        acc = accuracy(output[:, 1:], tgt[:, 1:])
        total_loss += loss.item()
        total_acc += acc
    return total_loss / len(loader), total_acc / len(loader)

def main():
    import wandb
    # Run name will be assigned after wandb.init with config
    def generate_run_name(config):
        return f"cell:{config.cell_type}_embed:{config.embed_size}_hid:{config.hidden_size}_layers:{config.num_layers}_beam:{config.beam_size}"

    # First initialize W&B run with placeholder name
    wandb.init(project="Dakshina-Translitration", config=wandb.config)
    config = wandb.config


    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    dev_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")

    input_vocab, output_vocab = build_vocab(train_pairs)
    train_dataset = TransliterationDataset(train_pairs, input_vocab, output_vocab)
    dev_dataset = TransliterationDataset(dev_pairs, input_vocab, output_vocab)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)

    encoder = Encoder(len(input_vocab), config.embed_size, config.hidden_size, config.num_layers, config.cell_type, config.dropout)
    decoder = Decoder(len(output_vocab), config.embed_size, config.hidden_size, config.num_layers, config.cell_type, config.dropout)
    model = Seq2Seq(encoder, decoder).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(10):
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device)
        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_accuracy": train_acc,
            "val_loss": val_loss,
            "val_accuracy": val_acc
        })


if __name__ == "__main__":
    sweep_config = {
        "method": "bayes",
        "metric": {"name": "val_accuracy", "goal": "maximize"},
        "parameters": {
            "embed_size": {"values": [32, 64, 128]},
            "hidden_size": {"values": [64, 128, 256]},
            "num_layers": {"values": [1,2,3]},
            "cell_type": {"values": ["RNN", "GRU", "LSTM"]},
            "dropout": {"values": [0.1,0.2, 0.3]},
            "lr": {"min": 0.0001, "max": 0.01},
            "batch_size": {"values": [16,32, 64]},
            "beam_size": {"values": [1, 3, 5]}  
        }
    }

    sweep_id = wandb.sweep(sweep_config, project="Dakshina-Translitration")
    wandb.agent(sweep_id, function=main, count=8)

Create sweep with ID: 2tyssrla
Sweep URL: https://wandb.ai/ma23c014-indian-institute-of-technology-madras/Dakshina-Translitration/sweeps/2tyssrla


[34m[1mwandb[0m: Agent Starting Run: 17tszmhk with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.001304138743761311
[34m[1mwandb[0m: 	num_layers: 1


                                                            

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▅▆▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▄▅▆▇▇▇███
val_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,9.0
train_accuracy,0.72436
train_loss,0.91199
val_accuracy,0.61568
val_loss,1.33007


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: kr1oziud with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	embed_size: 32
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.008928603235359364
[34m[1mwandb[0m: 	num_layers: 2


                                                            

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▃▅▁▂▅▇█▇▇▇
train_loss,▇▄█▇▄▂▁▁▂▂
val_accuracy,▆▄▁▃▆▆█▇▇▇
val_loss,▄▅█▆▂▂▁▁▂▁

0,1
epoch,9.0
train_accuracy,0.61779
train_loss,1.23448
val_accuracy,0.54762
val_loss,1.50012


[34m[1mwandb[0m: Agent Starting Run: gtm2ijqi with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.007252656727280787
[34m[1mwandb[0m: 	num_layers: 2


                                                            

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▇██▇▇▇▇▇
train_loss,█▄▂▁▁▂▂▂▂▂
val_accuracy,▁▄▅█▆▂▄▆▅▄
val_loss,█▄▃▁▃▅▃▄▄▃

0,1
epoch,9.0
train_accuracy,0.28978
train_loss,2.71986
val_accuracy,0.24745
val_loss,2.9461


[34m[1mwandb[0m: Agent Starting Run: ol2iohhk with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.0071306615646824344
[34m[1mwandb[0m: 	num_layers: 1


                                                              

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇███████
train_loss,█▃▂▁▁▁▁▁▁▁
val_accuracy,▁▅▅█▇██▇▇▇
val_loss,█▄▃▁▂▂▂▂▁▁

0,1
epoch,9.0
train_accuracy,0.64433
train_loss,1.1502
val_accuracy,0.56188
val_loss,1.4729


[34m[1mwandb[0m: Agent Starting Run: 5urv4w9d with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.005812507068956667
[34m[1mwandb[0m: 	num_layers: 2


                                                              

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▄▆▆▇▇▇███
val_loss,█▄▃▂▂▁▂▁▁▁

0,1
epoch,9.0
train_accuracy,0.78461
train_loss,0.7041
val_accuracy,0.67656
val_loss,1.13033


[34m[1mwandb[0m: Agent Starting Run: az09bijt with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.007155022857331171
[34m[1mwandb[0m: 	num_layers: 3


                                                            

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▅▆▇▇▇████
val_loss,█▃▃▂▂▂▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.7529
train_loss,0.80252
val_accuracy,0.68392
val_loss,1.10677


[34m[1mwandb[0m: Agent Starting Run: kqw98w3c with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.0024613924036533076
[34m[1mwandb[0m: 	num_layers: 2


                                                            

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▅▆▇▇█████
val_loss,█▄▂▁▁▂▂▁▂▂

0,1
epoch,9.0
train_accuracy,0.84783
train_loss,0.49896
val_accuracy,0.70821
val_loss,1.11037


[34m[1mwandb[0m: Agent Starting Run: fzz4hix9 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 32
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.0005545456743751991
[34m[1mwandb[0m: 	num_layers: 2


                                                              

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▅▆▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▄▅▆▇▇▇███
val_loss,█▅▃▃▂▂▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.7178
train_loss,0.91093
val_accuracy,0.64961
val_loss,1.16323


## Test Data

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import csv

# ---------------- Dataset & Utils ----------------
class TransliterationDataset(Dataset):
    def __init__(self, pairs, input_vocab, output_vocab):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        self.sos = output_vocab['<sos>']
        self.eos = output_vocab['<eos>']

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        input_ids = [self.input_vocab[c] for c in source]
        target_ids = [self.sos] + [self.output_vocab[c] for c in target] + [self.eos]
        return torch.tensor(input_ids), torch.tensor(target_ids)

def load_pairs(path):
    df = pd.read_csv(path, sep='\t', header=None, names=['target', 'source', 'count'], dtype=str)
    df.dropna(subset=["source", "target"], inplace=True)
    return list(zip(df['source'], df['target']))

def build_vocab(pairs):
    input_chars = set()
    output_chars = set()
    for src, tgt in pairs:
        input_chars.update(src)
        output_chars.update(tgt)
    input_vocab = {c: i+1 for i, c in enumerate(sorted(input_chars))}
    input_vocab['<pad>'] = 0
    output_vocab = {c: i+3 for i, c in enumerate(sorted(output_chars))}
    output_vocab.update({'<pad>': 0, '<sos>': 1, '<eos>': 2})
    return input_vocab, output_vocab

def collate_fn(batch):
    inputs, targets = zip(*batch)
    input_lens = [len(x) for x in inputs]
    target_lens = [len(x) for x in targets]
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded, input_lens, target_lens

# ---------------- Models ----------------
class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embed_size, padding_idx=0)
        rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_cls(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(output_size, embed_size, padding_idx=0)
        rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_cls(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, token, hidden):
        x = self.embedding(token.unsqueeze(1))
        output, hidden = self.rnn(x, hidden)
        output = self.fc(output.squeeze(1))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, src_lens, tgt=None, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        hidden = self.encoder(src, src_lens)
        tgt_len = tgt.size(1)
        outputs = torch.zeros(batch_size, tgt_len, self.decoder.fc.out_features).to(src.device)
        input_token = tgt[:, 0]
        for t in range(1, tgt_len):
            output, hidden = self.decoder(input_token, hidden)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            input_token = tgt[:, t] if teacher_force else output.argmax(1)
        return outputs

# ---------------- Train + Eval ----------------
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for src, tgt, src_lens, _ in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, src_lens, tgt)
        loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_and_save(model, dataloader, input_vocab, output_vocab, device, csv_path=None):
    model.eval()
    inv_input_vocab = {v: k for k, v in input_vocab.items()}
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    correct = 0
    total = 0
    results = []

    with torch.no_grad():
        for src, tgt, src_lens, _ in dataloader:
            src = src.to(device)
            hidden = model.encoder(src, src_lens)
            input_token = torch.tensor([output_vocab['<sos>']] * src.size(0)).to(device)
            decoded = []
            for _ in range(20):
                output, hidden = model.decoder(input_token, hidden)
                input_token = output.argmax(1)
                decoded.append(input_token)
            decoded = torch.stack(decoded, dim=1)

            for i in range(src.size(0)):
                pred = ''.join([inv_output_vocab[t.item()] for t in decoded[i] if t.item() not in [output_vocab['<eos>'], 0]])
                truth = ''.join([inv_output_vocab[t.item()] for t in tgt[i][1:-1]])
                inp = ''.join([inv_input_vocab[t.item()] for t in src[i] if t.item() != 0])
                results.append((inp, pred, truth))
                if pred == truth:
                    correct += 1
                total += 1

    acc = correct / total * 100
    print(f"\n Test Accuracy: {acc:.2f}%")
    for inp, pred, truth in results[:10]:
        print(f"{inp:<15} | Pred: {pred:<20} | Truth: {truth}")

    if csv_path is not None:
        with open(csv_path, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Input', 'Prediction', 'GroundTruth'])
            writer.writerows(results)
        print(f"\n Predictions saved to: {csv_path}")

    return acc, results


# ---------------- Run ----------------
if __name__ == "__main__":
    config = {
        "embed_size": 128,
        "hidden_size": 256,
        "num_layers": 2,
        "cell_type": "LSTM",
        "dropout": 0.3,
        "batch_size": 64,
        "lr": 0.002461,
        "epochs": 10,
    }


    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    test_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv")
    input_vocab, output_vocab = build_vocab(train_pairs)
    train_dataset = TransliterationDataset(train_pairs, input_vocab, output_vocab)
    test_dataset = TransliterationDataset(test_pairs, input_vocab, output_vocab)

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

    encoder = Encoder(len(input_vocab), config["embed_size"], config["hidden_size"],
                      config["num_layers"], config["cell_type"], config["dropout"])
    decoder = Decoder(len(output_vocab), config["embed_size"], config["hidden_size"],
                      config["num_layers"], config["cell_type"], config["dropout"])
    model = Seq2Seq(encoder, decoder).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_acc = 0
    for epoch in range(config["epochs"]):
        train_loss = train_model(model, train_loader, optimizer, criterion, device)
        print(f"Epoch {epoch+1} Train Loss: {train_loss:.4f}")
        acc, results = evaluate_and_save(model, test_loader, input_vocab, output_vocab, device, csv_path=None)
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), "best_model.pth")

    print("\n Loading best model for final evaluation...")
    model.load_state_dict(torch.load("best_model.pth"))

    # Save predictions CSV here
    evaluate_and_save(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")

Epoch 1 Train Loss: 1.4576

 Test Accuracy: 23.01%
ank             | Pred: अंक                  | Truth: अंक
anka            | Pred: अंका                 | Truth: अंक
ankit           | Pred: अंकित                | Truth: अंकित
anakon          | Pred: अनाकों               | Truth: अंकों
ankhon          | Pred: अंखों                | Truth: अंकों
ankon           | Pred: अंकों                | Truth: अंकों
angkor          | Pred: अंगोकर               | Truth: अंकोर
ankor           | Pred: अंकोर                | Truth: अंकोर
angaarak        | Pred: अंगारक               | Truth: अंगारक
angarak         | Pred: अंगररक               | Truth: अंगारक
Epoch 2 Train Loss: 0.7836

 Test Accuracy: 30.90%
ank             | Pred: अंक                  | Truth: अंक
anka            | Pred: अंका                 | Truth: अंक
ankit           | Pred: अनकित                | Truth: अंकित
anakon          | Pred: अनकों                | Truth: अंकों
ankhon          | Pred: अंखों                | Truth: अंकों
anko

___
___

#  **$$Transformer-Model$$**

In [1]:
import numpy as np

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import wandb
from tqdm import tqdm
import math
import csv
from collections import namedtuple

In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import wandb
from tqdm import tqdm
import math
import csv
from collections import namedtuple

# ---------------- Data Processing and Utilities ----------------

class TransliterationDataset(Dataset):
    def __init__(self, pairs, input_vocab, output_vocab):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        self.sos = output_vocab['<sos>']
        self.eos = output_vocab['<eos>']
        # Robustly get unk indices, with fallbacks
        self.unk_in = input_vocab.get('<unk>', 1) 
        self.unk_out = output_vocab.get('<unk>', 3)

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        # Use .get() with unk_in/unk_out for handling unseen characters
        input_ids = [self.input_vocab.get(c, self.unk_in) for c in source]
        target_ids = [self.sos] + [self.output_vocab.get(c, self.unk_out) for c in target] + [self.eos]
        return torch.tensor(input_ids), torch.tensor(target_ids)

def build_vocab(pairs):
    input_chars = set()
    output_chars = set()
    for src, tgt in pairs:
        input_chars.update(src)
        output_chars.update(tgt)
    
    # Vocab indexing: <pad>:0, <unk>:1, then sorted chars
    input_vocab = {c: i + 2 for i, c in enumerate(sorted(input_chars))}
    input_vocab['<pad>'] = 0
    input_vocab['<unk>'] = 1
    
    # Vocab indexing: <pad>:0, <sos>:1, <eos>:2, <unk>:3, then sorted chars
    output_vocab = {c: i + 4 for i, c in enumerate(sorted(output_chars))}
    output_vocab.update({'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3})
    
    return input_vocab, output_vocab

def load_pairs(path):
    # Ensure the path is correct for your environment (e.g., Kaggle, local, Colab)
    # Common issue: FileNotFoundError if path is wrong.
    df = pd.read_csv(path, sep='\t', header=None, names=['target', 'source', 'count'], dtype=str)
    df.dropna(subset=["source", "target"], inplace=True)
    return list(zip(df['source'], df['target']))

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded

# ---------------- Transformer Specific Components ----------------

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0) # Add batch dimension
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Add positional encoding to input embeddings
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers,
                 num_decoder_layers, dim_feedforward, dropout):
        super().__init__()
        
        self.d_model = d_model
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model, padding_idx=0)
        self.decoder_embedding = nn.Embedding(output_vocab_size, d_model, padding_idx=0)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True # Important: Use batch_first for convenience
        )
        
        self.fc_out = nn.Linear(d_model, output_vocab_size)
        self.output_vocab_size = output_vocab_size
        self.sos_idx = 1
        self.eos_idx = 2

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_padding_mask=None, tgt_padding_mask=None):
        # Embed and add positional encoding
        src_embedded = self.positional_encoding(self.encoder_embedding(src) * math.sqrt(self.d_model))
        tgt_embedded = self.positional_encoding(self.decoder_embedding(tgt) * math.sqrt(self.d_model))
        
        # Pass through Transformer layers
        transformer_out = self.transformer(
            src_embedded, tgt_embedded,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask
        )
        
        # Linear layer to get vocabulary logits
        output = self.fc_out(transformer_out)
        return output

    def generate_square_subsequent_mask(self, sz):
        # Generates a mask to prevent attention to future tokens in the decoder
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def create_padding_mask(self, seq, pad_idx=0):
        # Generates a boolean mask for padding tokens
        return (seq == pad_idx)

# ---------------- Training and Evaluation Functions ----------------

def accuracy(preds, targets, pad_idx=0):
    # Calculates character-level accuracy, ignoring padding
    pred_tokens = preds.argmax(dim=-1)
    correct = ((pred_tokens == targets) & (targets != pad_idx)).sum().item()
    total = (targets != pad_idx).sum().item()
    return correct / total if total > 0 else 0.0

@torch.no_grad()
def evaluate_word_accuracy(model, dataloader, device, output_vocab):
    model.eval()
    correct_words = 0
    total_words = 0
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    
    for src, tgt in tqdm(dataloader, desc="Evaluating", leave=False):
        src, tgt = src.to(device), tgt.to(device)

        src_padding_mask = model.create_padding_mask(src).to(device)
        batch_size = src.size(0)
        max_len = 20 # Max length for generated output (could be dynamically set based on input length if needed)
        
        # Initialize decoder input with <sos> tokens for greedy decoding
        generated_tokens = torch.full((batch_size, 1), model.sos_idx, dtype=torch.long, device=device)
        
        for t in range(max_len):
            # Create masks for the current generated sequence length
            tgt_mask = model.generate_square_subsequent_mask(generated_tokens.size(1)).to(device)
            tgt_padding_mask = model.create_padding_mask(generated_tokens).to(device)
            
            # Forward pass to get next token predictions
            output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
            
            # Get the token with the highest probability
            next_token = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
            
            # Append the predicted token to the generated sequence
            generated_tokens = torch.cat([generated_tokens, next_token], dim=1)
            
            # Stop if all sequences in the batch have generated the <eos> token
            if (next_token == model.eos_idx).all():
                break

        # Calculate word-level accuracy
        for i in range(batch_size):
            pred_seq = generated_tokens[i]
            target_seq = tgt[i]
            
            # Find the first <eos> token to trim the sequence (excluding <sos> and <eos> itself)
            pred_end = (pred_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            target_end = (target_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            
            # Extract the actual word tokens, excluding <sos> and <eos>
            pred_word = pred_seq[1:pred_end[0] if pred_end.numel() > 0 else len(pred_seq)]
            target_word = target_seq[1:target_end[0] if target_end.numel() > 0 else len(target_seq)]

            if torch.equal(pred_word, target_word):
                correct_words += 1
            total_words += 1
            
    return correct_words / total_words if total_words > 0 else 0.0

def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss, total_char_acc = 0, 0
    for src, tgt in tqdm(loader, desc="Training", leave=False):
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()
        
        src_padding_mask = model.create_padding_mask(src).to(device)
        
        tgt_input = tgt[:, :-1] # Input for decoder, excludes the last token
        tgt_output = tgt[:, 1:]  # Target for loss, excludes the first token (<sos>)

        # ************ CRITICAL FIX ************
        # Create tgt_padding_mask from tgt_input to match its length
        tgt_padding_mask = model.create_padding_mask(tgt_input).to(device)
       
        
        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        
        output = model(src, tgt_input, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_mask=tgt_mask)
        
        # Reshape output and target for CrossEntropyLoss
        loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
        char_acc = accuracy(output, tgt_output) # Character-level accuracy
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_char_acc += char_acc
        
    return total_loss / len(loader), total_char_acc / len(loader)

def generate_predictions_csv(model, dataloader, input_vocab, output_vocab, device, csv_path):
    model.eval()
    inv_input_vocab = {v: k for k, v in input_vocab.items()}
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    results = []

    with torch.no_grad():
        for src, tgt in tqdm(dataloader, desc="Generating Test Predictions"):
            src = src.to(device)
            batch_size = src.size(0)
            max_len = 20 # Max length for generated output

            # Inference loop for the decoder (similar to evaluate_word_accuracy)
            generated_tokens = torch.full((batch_size, 1), model.sos_idx, dtype=torch.long, device=device)
            
            for t in range(max_len):
                tgt_mask = model.generate_square_subsequent_mask(generated_tokens.size(1)).to(device)
                tgt_padding_mask = model.create_padding_mask(generated_tokens).to(device)
                
                output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
                
                next_token = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
                generated_tokens = torch.cat([generated_tokens, next_token], dim=1)
                
                if (next_token == model.eos_idx).all():
                    break

            for i in range(batch_size):
                pred_seq = generated_tokens[i]
                target_seq = tgt[i]
                
                pred_end = (pred_seq == model.eos_idx).nonzero(as_tuple=True)[0]
                target_end = (target_seq == model.eos_idx).nonzero(as_tuple=True)[0]
                
                pred_word_tokens = pred_seq[1:pred_end[0] if pred_end.numel() > 0 else len(pred_seq)]
                # Ensure truth_word_tokens also excludes any potential padding if it's shorter than predicted length
                truth_word_tokens = target_seq[1:target_end[0] if target_end.numel() > 0 else len(target_seq)]

                pred_str = ''.join([inv_output_vocab[t.item()] for t in pred_word_tokens if t.item() not in [model.sos_idx, model.eos_idx, 0]])
                truth_str = ''.join([inv_output_vocab[t.item()] for t in truth_word_tokens if t.item() not in [model.sos_idx, model.eos_idx, 0]])
                inp_str = ''.join([inv_input_vocab[t.item()] for t in src[i] if t.item() != 0])
                results.append((inp_str, pred_str, truth_str))
    
    with open(csv_path, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Input', 'Prediction', 'GroundTruth'])
        writer.writerows(results)
    print(f"\nPredictions saved to: {csv_path}")

# ---------------- Main Function for W&B Sweep ----------------

def main():
    import wandb
    
    def generate_run_name(config):
        return f"transformer_d:{config.d_model}_nhead:{config.nhead}_layers:{config.num_encoder_layers}"

    wandb.init(project="Dakshina-Translitration-Transformer", config=wandb.config)
    config = wandb.config
    wandb.run.name = generate_run_name(config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    dev_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")
    test_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv")
    # *******************************************************************

    # Build vocab on train + dev pairs for consistency
    input_vocab, output_vocab = build_vocab(train_pairs + dev_pairs)
    train_dataset = TransliterationDataset(train_pairs, input_vocab, output_vocab)
    dev_dataset = TransliterationDataset(dev_pairs, input_vocab, output_vocab)
    test_dataset = TransliterationDataset(test_pairs, input_vocab, output_vocab) # Prepare test dataset here

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn) # Batch size 1 for individual prediction

    model = TransformerModel(
        input_vocab_size=len(input_vocab),
        output_vocab_size=len(output_vocab),
        d_model=config.d_model,
        nhead=config.nhead,
        num_encoder_layers=config.num_encoder_layers,
        num_decoder_layers=config.num_decoder_layers,
        dim_feedforward=config.dim_feedforward,
        dropout=config.dropout
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, betas=(0.9, 0.98), eps=1e-9)
    criterion = nn.CrossEntropyLoss(ignore_index=0) # ignore_index=0 for <pad> token

    best_dev_acc = 0
    # Training loop
    for epoch in range(10): -
        train_loss, train_char_acc = train_epoch(model, train_loader, optimizer, criterion, device)
        dev_word_acc = evaluate_word_accuracy(model, dev_loader, device, output_vocab)
        
        print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Train Char Acc: {train_char_acc:.4f} | Dev Word Acc: {dev_word_acc:.4f}")
        
        if dev_word_acc > best_dev_acc:
            best_dev_acc = dev_word_acc
            torch.save(model.state_dict(), 'best_transformer_model.pth')
            print(f" -> New best model saved with dev word accuracy: {best_dev_acc:.4f}")

        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_char_accuracy": train_char_acc,
            "dev_word_accuracy": dev_word_acc
        })

    print("\nTraining complete. Loading best model for final evaluation on test set...")
    # Load the best model found during training
    try:
        model.load_state_dict(torch.load('best_transformer_model.pth'))
    except FileNotFoundError:
        print("Error: 'best_transformer_model.pth' not found. Ensure training completed successfully and model was saved.")
        return # Exit main if model not found

    # Final evaluation on the test set (using the best saved model)
    final_test_word_acc = evaluate_word_accuracy(model, test_loader, device, output_vocab)
    print(f"\n--- Final Test Set Evaluation Results ---")
    print(f"Word-level Accuracy on Test Set: {final_test_word_acc:.4f}")
    
    # Generate and save predictions to CSV using the best model
    generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")
    print("Test predictions saved to test_predictions.csv")


if __name__ == "__main__":
    # Define your W&B sweep configuration
    sweep_config = {
        "method": "bayes", # Bayesian optimization
        "metric": {"name": "dev_word_accuracy", "goal": "maximize"},
        "parameters": {
            "d_model": {"values": [128, 256, 512]},
            "nhead": {"values": [4, 8, 16]},
            "num_encoder_layers": {"values": [2, 4]},
            "num_decoder_layers": {"values": [2, 4]},
            "dim_feedforward": {"values": [512, 1024, 2048]},
            "dropout": {"values": [0.1, 0.2, 0.3]},
            "lr": {"min": 0.0001, "max": 0.001},
            "batch_size": {"values": [16, 32, 64]}
        }
    }
    
    # Initialize and run the W&B agent
    sweep_id = wandb.sweep(sweep_config, project="Dakshina-Translitration-Transformer")
    wandb.agent(sweep_id, function=main, count=5) 

Create sweep with ID: 3x0zlqsl
Sweep URL: https://wandb.ai/ma23c014-indian-institute-of-technology-madras/Dakshina-Translitration-Transformer/sweeps/3x0zlqsl


[34m[1mwandb[0m: Agent Starting Run: 5vnrk5u5 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	d_model: 128
[34m[1mwandb[0m: 	dim_feedforward: 512
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	lr: 0.0006418468204446201
[34m[1mwandb[0m: 	nhead: 4
[34m[1mwandb[0m: 	num_decoder_layers: 2
[34m[1mwandb[0m: 	num_encoder_layers: 4


  output = torch._nested_tensor_from_mask(
                                                             

Epoch 1 | Train Loss: 1.9561 | Train Char Acc: 0.4013 | Dev Word Acc: 0.0083
 -> New best model saved with dev word accuracy: 0.0083


                                                             

Epoch 2 | Train Loss: 1.5652 | Train Char Acc: 0.4881 | Dev Word Acc: 0.0133
 -> New best model saved with dev word accuracy: 0.0133


                                                             

Epoch 3 | Train Loss: 1.4650 | Train Char Acc: 0.5164 | Dev Word Acc: 0.0135
 -> New best model saved with dev word accuracy: 0.0135


                                                             

Epoch 4 | Train Loss: 1.4066 | Train Char Acc: 0.5319 | Dev Word Acc: 0.0174
 -> New best model saved with dev word accuracy: 0.0174


                                                             

Epoch 5 | Train Loss: 1.3683 | Train Char Acc: 0.5441 | Dev Word Acc: 0.0195
 -> New best model saved with dev word accuracy: 0.0195


                                                             

Epoch 6 | Train Loss: 1.3418 | Train Char Acc: 0.5523 | Dev Word Acc: 0.0186


                                                             

Epoch 7 | Train Loss: 1.3187 | Train Char Acc: 0.5576 | Dev Word Acc: 0.0202
 -> New best model saved with dev word accuracy: 0.0202


                                                             

Epoch 8 | Train Loss: 1.3004 | Train Char Acc: 0.5644 | Dev Word Acc: 0.0243
 -> New best model saved with dev word accuracy: 0.0243


                                                             

Epoch 9 | Train Loss: 1.2874 | Train Char Acc: 0.5682 | Dev Word Acc: 0.0243


                                                             

Epoch 10 | Train Loss: 1.2714 | Train Char Acc: 0.5736 | Dev Word Acc: 0.0172

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.0293


Generating Test Predictions:   0%|          | 0/4502 [00:00<?, ?it/s]


0,1
dev_word_accuracy,▁▃▃▅▆▅▆██▅
epoch,▁▂▃▃▄▅▆▆▇█
train_char_accuracy,▁▅▆▆▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁

0,1
dev_word_accuracy,0.01721
epoch,9.0
train_char_accuracy,0.57364
train_loss,1.27137


[34m[1mwandb[0m: [32m[41mERROR[0m Run 5vnrk5u5 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 302, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_56/1848154019.py", line 361, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_56/1848154019.py", line 254, in generate_predictions_csv
[34m[1mwandb[0m: [32m[41mERROR[0m     output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
[34m[1mwandb[0m: [32m[41mERROR[0m                                                            ^^^^^^^^^^^^^^^^
[

                                                           

Epoch 1 | Train Loss: 2.4921 | Train Char Acc: 0.2538 | Dev Word Acc: 0.0002
 -> New best model saved with dev word accuracy: 0.0002


                                                           

Epoch 2 | Train Loss: 1.9518 | Train Char Acc: 0.3652 | Dev Word Acc: 0.0014
 -> New best model saved with dev word accuracy: 0.0014


                                                           

Epoch 3 | Train Loss: 1.7862 | Train Char Acc: 0.4170 | Dev Word Acc: 0.0055
 -> New best model saved with dev word accuracy: 0.0055


                                                           

Epoch 4 | Train Loss: 1.6590 | Train Char Acc: 0.4572 | Dev Word Acc: 0.0032


                                                           

Epoch 5 | Train Loss: 1.5579 | Train Char Acc: 0.4885 | Dev Word Acc: 0.0044


                                                           

Epoch 6 | Train Loss: 1.4941 | Train Char Acc: 0.5073 | Dev Word Acc: 0.0062
 -> New best model saved with dev word accuracy: 0.0062


                                                           

Epoch 7 | Train Loss: 1.4401 | Train Char Acc: 0.5241 | Dev Word Acc: 0.0078
 -> New best model saved with dev word accuracy: 0.0078


                                                           

Epoch 8 | Train Loss: 1.4008 | Train Char Acc: 0.5364 | Dev Word Acc: 0.0087
 -> New best model saved with dev word accuracy: 0.0087


                                                           

Epoch 9 | Train Loss: 1.3695 | Train Char Acc: 0.5451 | Dev Word Acc: 0.0099
 -> New best model saved with dev word accuracy: 0.0099


                                                           

Epoch 10 | Train Loss: 1.3366 | Train Char Acc: 0.5561 | Dev Word Acc: 0.0060

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.0078


Generating Test Predictions:   0%|          | 0/4502 [00:00<?, ?it/s]


0,1
dev_word_accuracy,▁▂▅▃▄▅▇▇█▅
epoch,▁▂▃▃▄▅▆▆▇█
train_char_accuracy,▁▄▅▆▆▇▇███
train_loss,█▅▄▃▂▂▂▁▁▁

0,1
dev_word_accuracy,0.00597
epoch,9.0
train_char_accuracy,0.55605
train_loss,1.33656


[34m[1mwandb[0m: [32m[41mERROR[0m Run 56pt3olf errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 302, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_56/1848154019.py", line 361, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_56/1848154019.py", line 254, in generate_predictions_csv
[34m[1mwandb[0m: [32m[41mERROR[0m     output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
[34m[1mwandb[0m: [32m[41mERROR[0m                                                            ^^^^^^^^^^^^^^^^
[

                                                             

Epoch 1 | Train Loss: 1.6366 | Train Char Acc: 0.4832 | Dev Word Acc: 0.0076
 -> New best model saved with dev word accuracy: 0.0076


                                                             

Epoch 2 | Train Loss: 1.2648 | Train Char Acc: 0.5743 | Dev Word Acc: 0.0101
 -> New best model saved with dev word accuracy: 0.0101


                                                             

Epoch 3 | Train Loss: 1.1536 | Train Char Acc: 0.6090 | Dev Word Acc: 0.0154
 -> New best model saved with dev word accuracy: 0.0154


                                                             

Epoch 4 | Train Loss: 1.0744 | Train Char Acc: 0.6347 | Dev Word Acc: 0.0156
 -> New best model saved with dev word accuracy: 0.0156


                                                             

Epoch 5 | Train Loss: 1.0096 | Train Char Acc: 0.6555 | Dev Word Acc: 0.0193
 -> New best model saved with dev word accuracy: 0.0193


                                                             

Epoch 6 | Train Loss: 0.9556 | Train Char Acc: 0.6742 | Dev Word Acc: 0.0278
 -> New best model saved with dev word accuracy: 0.0278


                                                             

Epoch 7 | Train Loss: 0.9077 | Train Char Acc: 0.6910 | Dev Word Acc: 0.0248


                                                             

Epoch 8 | Train Loss: 0.8675 | Train Char Acc: 0.7042 | Dev Word Acc: 0.0321
 -> New best model saved with dev word accuracy: 0.0321


                                                             

Epoch 9 | Train Loss: 0.8348 | Train Char Acc: 0.7148 | Dev Word Acc: 0.0365
 -> New best model saved with dev word accuracy: 0.0365


                                                             

Epoch 10 | Train Loss: 0.8006 | Train Char Acc: 0.7263 | Dev Word Acc: 0.0328

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.0271


Generating Test Predictions:   0%|          | 0/4502 [00:00<?, ?it/s]


0,1
dev_word_accuracy,▁▂▃▃▄▆▅▇█▇
epoch,▁▂▃▃▄▅▆▆▇█
train_char_accuracy,▁▄▅▅▆▇▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁

0,1
dev_word_accuracy,0.03281
epoch,9.0
train_char_accuracy,0.72634
train_loss,0.80065


[34m[1mwandb[0m: [32m[41mERROR[0m Run rnzdibxb errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 302, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_56/1848154019.py", line 361, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_56/1848154019.py", line 254, in generate_predictions_csv
[34m[1mwandb[0m: [32m[41mERROR[0m     output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
[34m[1mwandb[0m: [32m[41mERROR[0m                                                            ^^^^^^^^^^^^^^^^
[

                                                             

Epoch 1 | Train Loss: 1.6696 | Train Char Acc: 0.4764 | Dev Word Acc: 0.0092
 -> New best model saved with dev word accuracy: 0.0092


                                                             

Epoch 2 | Train Loss: 1.2850 | Train Char Acc: 0.5752 | Dev Word Acc: 0.0218
 -> New best model saved with dev word accuracy: 0.0218


                                                             

Epoch 3 | Train Loss: 1.1436 | Train Char Acc: 0.6209 | Dev Word Acc: 0.0301
 -> New best model saved with dev word accuracy: 0.0301


                                                             

Epoch 4 | Train Loss: 1.0230 | Train Char Acc: 0.6599 | Dev Word Acc: 0.0468
 -> New best model saved with dev word accuracy: 0.0468


                                                             

Epoch 5 | Train Loss: 0.9055 | Train Char Acc: 0.6980 | Dev Word Acc: 0.0863
 -> New best model saved with dev word accuracy: 0.0863


                                                             

Epoch 6 | Train Loss: 0.8051 | Train Char Acc: 0.7310 | Dev Word Acc: 0.0950
 -> New best model saved with dev word accuracy: 0.0950


                                                             

Epoch 7 | Train Loss: 0.7260 | Train Char Acc: 0.7573 | Dev Word Acc: 0.1065
 -> New best model saved with dev word accuracy: 0.1065


                                                             

Epoch 8 | Train Loss: 0.6621 | Train Char Acc: 0.7771 | Dev Word Acc: 0.1487
 -> New best model saved with dev word accuracy: 0.1487


                                                             

Epoch 9 | Train Loss: 0.6129 | Train Char Acc: 0.7949 | Dev Word Acc: 0.1333


                                                             

Epoch 10 | Train Loss: 0.5739 | Train Char Acc: 0.8080 | Dev Word Acc: 0.1592
 -> New best model saved with dev word accuracy: 0.1592

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.1357


Generating Test Predictions:   0%|          | 0/4502 [00:00<?, ?it/s]


0,1
dev_word_accuracy,▁▂▂▃▅▅▆█▇█
epoch,▁▂▃▃▄▅▆▆▇█
train_char_accuracy,▁▃▄▅▆▆▇▇██
train_loss,█▆▅▄▃▂▂▂▁▁

0,1
dev_word_accuracy,0.15925
epoch,9.0
train_char_accuracy,0.80795
train_loss,0.57392


[34m[1mwandb[0m: [32m[41mERROR[0m Run fm9x0atm errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 302, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_56/1848154019.py", line 361, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_56/1848154019.py", line 254, in generate_predictions_csv
[34m[1mwandb[0m: [32m[41mERROR[0m     output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
[34m[1mwandb[0m: [32m[41mERROR[0m                                                            ^^^^^^^^^^^^^^^^
[

                                                             

Epoch 1 | Train Loss: 2.2757 | Train Char Acc: 0.3377 | Dev Word Acc: 0.0009
 -> New best model saved with dev word accuracy: 0.0009


                                                             

Epoch 2 | Train Loss: 1.6777 | Train Char Acc: 0.4626 | Dev Word Acc: 0.0025
 -> New best model saved with dev word accuracy: 0.0025


                                                             

Epoch 3 | Train Loss: 1.5013 | Train Char Acc: 0.5086 | Dev Word Acc: 0.0037
 -> New best model saved with dev word accuracy: 0.0037


                                                             

Epoch 4 | Train Loss: 1.4154 | Train Char Acc: 0.5313 | Dev Word Acc: 0.0046
 -> New best model saved with dev word accuracy: 0.0046


                                                             

Epoch 5 | Train Loss: 1.3549 | Train Char Acc: 0.5491 | Dev Word Acc: 0.0050
 -> New best model saved with dev word accuracy: 0.0050


                                                             

Epoch 6 | Train Loss: 1.3092 | Train Char Acc: 0.5622 | Dev Word Acc: 0.0034


                                                             

Epoch 7 | Train Loss: 1.2727 | Train Char Acc: 0.5721 | Dev Word Acc: 0.0053
 -> New best model saved with dev word accuracy: 0.0053


                                                             

Epoch 8 | Train Loss: 1.2431 | Train Char Acc: 0.5833 | Dev Word Acc: 0.0032


                                                             

Epoch 9 | Train Loss: 1.2183 | Train Char Acc: 0.5903 | Dev Word Acc: 0.0071
 -> New best model saved with dev word accuracy: 0.0071


                                                             

Epoch 10 | Train Loss: 1.1971 | Train Char Acc: 0.5961 | Dev Word Acc: 0.0071

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.0104


Generating Test Predictions:   0%|          | 0/4502 [00:00<?, ?it/s]


0,1
dev_word_accuracy,▁▃▄▅▆▄▆▄██
epoch,▁▂▃▃▄▅▆▆▇█
train_char_accuracy,▁▄▆▆▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁

0,1
dev_word_accuracy,0.00711
epoch,9.0
train_char_accuracy,0.5961
train_loss,1.19713


[34m[1mwandb[0m: [32m[41mERROR[0m Run c2zi2lbu errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 302, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_56/1848154019.py", line 361, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_56/1848154019.py", line 254, in generate_predictions_csv
[34m[1mwandb[0m: [32m[41mERROR[0m     output = model(src, generated_tokens, src_padding_mask=src_padding_mask, tgt_mask=tgt_mask, tgt_padding_mask=tgt_padding_mask)
[34m[1mwandb[0m: [32m[41mERROR[0m                                                            ^^^^^^^^^^^^^^^^
[

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import wandb
from tqdm import tqdm
import math
import csv
from collections import namedtuple

# ---------------- Data Processing and Utilities ----------------

class TransliterationDataset(Dataset):
    def __init__(self, pairs, input_vocab, output_vocab):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        self.sos = output_vocab['<sos>']
        self.eos = output_vocab['<eos>']
        self.unk_in = input_vocab.get('<unk>', 1)
        self.unk_out = output_vocab.get('<unk>', 3)

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        input_ids = [self.input_vocab.get(c, self.unk_in) for c in source]
        target_ids = [self.sos] + [self.output_vocab.get(c, self.unk_out) for c in target] + [self.eos]
        return torch.tensor(input_ids), torch.tensor(target_ids)

def build_vocab(pairs):
    input_chars = set()
    output_chars = set()
    for src, tgt in pairs:
        input_chars.update(src)
        output_chars.update(tgt)
    
    input_vocab = {c: i + 2 for i, c in enumerate(sorted(input_chars))}
    input_vocab['<pad>'] = 0
    input_vocab['<unk>'] = 1
    
    output_vocab = {c: i + 4 for i, c in enumerate(sorted(output_chars))}
    output_vocab.update({'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3})
    
    return input_vocab, output_vocab

def load_pairs(path):
    df = pd.read_csv(path, sep='\t', header=None, names=['target', 'source', 'count'], dtype=str)
    df.dropna(subset=["source", "target"], inplace=True)
    return list(zip(df['source'], df['target']))

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded

# ---------------- Transformer Specific Components ----------------

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers,
                 num_decoder_layers, dim_feedforward, dropout):
        super().__init__()
        
        self.d_model = d_model
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model, padding_idx=0)
        self.decoder_embedding = nn.Embedding(output_vocab_size, d_model, padding_idx=0)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        
        self.fc_out = nn.Linear(d_model, output_vocab_size)
        self.output_vocab_size = output_vocab_size
        self.sos_idx = 1
        self.eos_idx = 2

    # Corrected forward method signature
    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None):
        src_embedded = self.positional_encoding(self.encoder_embedding(src) * math.sqrt(self.d_model))
        tgt_embedded = self.positional_encoding(self.decoder_embedding(tgt) * math.sqrt(self.d_model))
        
        transformer_out = self.transformer(
            src_embedded, tgt_embedded,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )
        
        output = self.fc_out(transformer_out)
        return output

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def create_padding_mask(self, seq, pad_idx=0):
        return (seq == pad_idx)

# ---------------- Training and Evaluation Functions ----------------

def accuracy(preds, targets, pad_idx=0):
    pred_tokens = preds.argmax(dim=-1)
    correct = ((pred_tokens == targets) & (targets != pad_idx)).sum().item()
    total = (targets != pad_idx).sum().item()
    return correct / total if total > 0 else 0.0

@torch.no_grad()
def evaluate_word_accuracy(model, dataloader, device, output_vocab):
    model.eval()
    correct_words = 0
    total_words = 0
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    
    for src, tgt in tqdm(dataloader, desc="Evaluating", leave=False):
        src, tgt = src.to(device), tgt.to(device)

        src_padding_mask = model.create_padding_mask(src).to(device)
        batch_size = src.size(0)
        max_len = 20
        
        generated_tokens = torch.full((batch_size, 1), model.sos_idx, dtype=torch.long, device=device)
        
        for t in range(max_len):
            tgt_mask = model.generate_square_subsequent_mask(generated_tokens.size(1)).to(device)
            tgt_padding_mask = model.create_padding_mask(generated_tokens).to(device)
            
            # Corrected keyword arguments
            output = model(src, generated_tokens, 
                           src_key_padding_mask=src_padding_mask, 
                           tgt_key_padding_mask=tgt_padding_mask, 
                           tgt_mask=tgt_mask)
            
            next_token = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
            generated_tokens = torch.cat([generated_tokens, next_token], dim=1)
            
            if (next_token == model.eos_idx).all():
                break

        for i in range(batch_size):
            pred_seq = generated_tokens[i]
            target_seq = tgt[i]
            
            pred_end = (pred_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            target_end = (target_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            
            pred_word = pred_seq[1:pred_end[0] if pred_end.numel() > 0 else len(pred_seq)]
            target_word = target_seq[1:target_end[0] if target_end.numel() > 0 else len(target_seq)]

            if torch.equal(pred_word, target_word):
                correct_words += 1
            total_words += 1
            
    return correct_words / total_words if total_words > 0 else 0.0

def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss, total_char_acc = 0, 0
    for src, tgt in tqdm(loader, desc="Training", leave=False):
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()
        
        src_padding_mask = model.create_padding_mask(src).to(device)
        tgt_input = tgt[:, :-1]
        
        tgt_padding_mask = model.create_padding_mask(tgt_input).to(device)
        
        tgt_output = tgt[:, 1:]

        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        
        # Corrected keyword arguments
        output = model(src, tgt_input, 
                       src_key_padding_mask=src_padding_mask, 
                       tgt_key_padding_mask=tgt_padding_mask, 
                       tgt_mask=tgt_mask)
        
        loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
        char_acc = accuracy(output, tgt_output)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_char_acc += char_acc
        
    return total_loss / len(loader), total_char_acc / len(loader)

def generate_predictions_csv(model, dataloader, input_vocab, output_vocab, device, csv_path):
    model.eval()
    inv_input_vocab = {v: k for k, v in input_vocab.items()}
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    results = []

    with torch.no_grad():
        for src, tgt in tqdm(dataloader, desc="Generating Test Predictions"):
            src = src.to(device)
            batch_size = src.size(0)
            max_len = 20

            src_padding_mask = model.create_padding_mask(src).to(device)
            generated_tokens = torch.full((batch_size, 1), model.sos_idx, dtype=torch.long, device=device)
            
            for t in range(max_len):
                tgt_mask = model.generate_square_subsequent_mask(generated_tokens.size(1)).to(device)
                tgt_padding_mask = model.create_padding_mask(generated_tokens).to(device)
                
                # Corrected keyword arguments
                output = model(src, generated_tokens, 
                               src_key_padding_mask=src_padding_mask, 
                               tgt_key_padding_mask=tgt_padding_mask, 
                               tgt_mask=tgt_mask)
                
                next_token = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
                generated_tokens = torch.cat([generated_tokens, next_token], dim=1)
                
                if (next_token == model.eos_idx).all():
                    break

            for i in range(batch_size):
                pred_seq = generated_tokens[i]
                target_seq = tgt[i]
                
                pred_end = (pred_seq == model.eos_idx).nonzero(as_tuple=True)[0]
                target_end = (target_seq == model.eos_idx).nonzero(as_tuple=True)[0]
                
                pred_word_tokens = pred_seq[1:pred_end[0] if pred_end.numel() > 0 else len(pred_seq)]
                truth_word_tokens = target_seq[1:target_end[0] if target_end.numel() > 0 else len(target_seq)]

                pred_str = ''.join([inv_output_vocab[t.item()] for t in pred_word_tokens if t.item() not in [model.sos_idx, model.eos_idx, 0]])
                truth_str = ''.join([inv_output_vocab[t.item()] for t in truth_word_tokens if t.item() not in [model.sos_idx, model.eos_idx, 0]])
                inp_str = ''.join([inv_input_vocab[t.item()] for t in src[i] if t.item() != 0])
                results.append((inp_str, pred_str, truth_str))
    
    with open(csv_path, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Input', 'Prediction', 'GroundTruth'])
        writer.writerows(results)
    print(f"\nPredictions saved to: {csv_path}")

# ---------------- Main Function for W&B Sweep ----------------

def main():
    import wandb
    
    def generate_run_name(config):
        return f"transformer_d:{config.d_model}_nhead:{config.nhead}_layers:{config.num_encoder_layers}"

    wandb.init(project="Dakshina-Translitration-Transformer", config=wandb.config)
    config = wandb.config
    wandb.run.name = generate_run_name(config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    dev_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")
    test_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv")

    # Build vocab on train + dev pairs for consistency
    input_vocab, output_vocab = build_vocab(train_pairs + dev_pairs)
    train_dataset = TransliterationDataset(train_pairs, input_vocab, output_vocab)
    dev_dataset = TransliterationDataset(dev_pairs, input_vocab, output_vocab)
    test_dataset = TransliterationDataset(test_pairs, input_vocab, output_vocab)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

    model = TransformerModel(
        input_vocab_size=len(input_vocab),
        output_vocab_size=len(output_vocab),
        d_model=config.d_model,
        nhead=config.nhead,
        num_encoder_layers=config.num_encoder_layers,
        num_decoder_layers=config.num_decoder_layers,
        dim_feedforward=config.dim_feedforward,
        dropout=config.dropout
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, betas=(0.9, 0.98), eps=1e-9)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_dev_acc = 0
    # Training loop
    for epoch in range(8):
        train_loss, train_char_acc = train_epoch(model, train_loader, optimizer, criterion, device)
        dev_word_acc = evaluate_word_accuracy(model, dev_loader, device, output_vocab)
        
        print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Train Char Acc: {train_char_acc:.4f} | Dev Word Acc: {dev_word_acc:.4f}")
        
        if dev_word_acc > best_dev_acc:
            best_dev_acc = dev_word_acc
            # Save the model with a unique filename using the W&B run ID
            model_path = f'best_transformer_model_{wandb.run.id}.pth'
            torch.save(model.state_dict(), model_path)
            print(f" -> New best model saved to {model_path} with dev word accuracy: {best_dev_acc:.4f}")

        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_char_accuracy": train_char_acc,
            "dev_word_accuracy": dev_word_acc
        })

    print("\nTraining complete. Loading best model for final evaluation on test set...")
    
    # Load the best model found during this run
    try:
        model_path = f'best_transformer_model_{wandb.run.id}.pth'
        model.load_state_dict(torch.load(model_path))
    except FileNotFoundError:
        print("Error: 'best_transformer_model.pth' not found. Ensure training completed successfully and model was saved.")
        return

    final_test_word_acc = evaluate_word_accuracy(model, test_loader, device, output_vocab)
    print(f"\n--- Final Test Set Evaluation Results ---")
    print(f"Word-level Accuracy on Test Set: {final_test_word_acc:.4f}")
    
    generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")
    print("Test predictions saved to test_predictions.csv")

if __name__ == "__main__":
    sweep_config = {
        "method": "bayes",
        "metric": {"name": "dev_word_accuracy", "goal": "maximize"},
        "parameters": {
            "d_model": {"values": [128, 256, 512]},
            "nhead": {"values": [4, 8, 16]},
            "num_encoder_layers": {"values": [2, 4]},
            "num_decoder_layers": {"values": [2, 4]},
            "dim_feedforward": {"values": [512, 1024, 2048]},
            "dropout": {"values": [0.1, 0.2, 0.3]},
            "lr": {"min": 0.0001, "max": 0.001},
            "batch_size": {"values": [16, 32, 64]}
        }
    }
    
    sweep_id = wandb.sweep(sweep_config, project="Dakshina-Translitration-Transformer")
    wandb.agent(sweep_id, function=main, count=4)

Create sweep with ID: 1ht7n35v
Sweep URL: https://wandb.ai/ma23c014-indian-institute-of-technology-madras/Dakshina-Translitration-Transformer/sweeps/1ht7n35v


[34m[1mwandb[0m: Agent Starting Run: 45pzphlu with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	d_model: 512
[34m[1mwandb[0m: 	dim_feedforward: 512
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	lr: 0.00043491182926006633
[34m[1mwandb[0m: 	nhead: 16
[34m[1mwandb[0m: 	num_decoder_layers: 2
[34m[1mwandb[0m: 	num_encoder_layers: 2


  output = torch._nested_tensor_from_mask(
                                                             

Epoch 1 | Train Loss: 1.6346 | Train Char Acc: 0.4752 | Dev Word Acc: 0.0126
 -> New best model saved to best_transformer_model_45pzphlu.pth with dev word accuracy: 0.0126


                                                             

Epoch 2 | Train Loss: 1.3989 | Train Char Acc: 0.5331 | Dev Word Acc: 0.0108


                                                             

Epoch 3 | Train Loss: 1.3117 | Train Char Acc: 0.5576 | Dev Word Acc: 0.0223
 -> New best model saved to best_transformer_model_45pzphlu.pth with dev word accuracy: 0.0223


                                                             

Epoch 4 | Train Loss: 1.2548 | Train Char Acc: 0.5751 | Dev Word Acc: 0.0154


                                                             

Epoch 5 | Train Loss: 1.2121 | Train Char Acc: 0.5881 | Dev Word Acc: 0.0193


                                                             

Epoch 6 | Train Loss: 1.1810 | Train Char Acc: 0.5984 | Dev Word Acc: 0.0179


                                                             

Epoch 7 | Train Loss: 1.1485 | Train Char Acc: 0.6089 | Dev Word Acc: 0.0252
 -> New best model saved to best_transformer_model_45pzphlu.pth with dev word accuracy: 0.0252


                                                             

Epoch 8 | Train Loss: 1.1241 | Train Char Acc: 0.6167 | Dev Word Acc: 0.0151

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.0295


Generating Test Predictions: 100%|██████████| 4502/4502 [02:59<00:00, 25.12it/s]



Predictions saved to: test_predictions.csv
Test predictions saved to test_predictions.csv


0,1
dev_word_accuracy,▂▁▇▃▅▄█▃
epoch,▁▂▃▄▅▆▇█
train_char_accuracy,▁▄▅▆▇▇██
train_loss,█▅▄▃▂▂▁▁

0,1
dev_word_accuracy,0.01514
epoch,7.0
train_char_accuracy,0.61672
train_loss,1.12414


[34m[1mwandb[0m: Agent Starting Run: d9h46hb7 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	d_model: 256
[34m[1mwandb[0m: 	dim_feedforward: 1024
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	lr: 0.0005251776156309645
[34m[1mwandb[0m: 	nhead: 16
[34m[1mwandb[0m: 	num_decoder_layers: 4
[34m[1mwandb[0m: 	num_encoder_layers: 4


                                                             

Epoch 1 | Train Loss: 1.6748 | Train Char Acc: 0.4716 | Dev Word Acc: 0.0057
 -> New best model saved to best_transformer_model_d9h46hb7.pth with dev word accuracy: 0.0057


                                                             

Epoch 2 | Train Loss: 1.2871 | Train Char Acc: 0.5692 | Dev Word Acc: 0.0122
 -> New best model saved to best_transformer_model_d9h46hb7.pth with dev word accuracy: 0.0122


                                                             

Epoch 3 | Train Loss: 1.1822 | Train Char Acc: 0.6012 | Dev Word Acc: 0.0227
 -> New best model saved to best_transformer_model_d9h46hb7.pth with dev word accuracy: 0.0227


                                                             

Epoch 4 | Train Loss: 1.1137 | Train Char Acc: 0.6234 | Dev Word Acc: 0.0241
 -> New best model saved to best_transformer_model_d9h46hb7.pth with dev word accuracy: 0.0241


                                                             

Epoch 5 | Train Loss: 1.0556 | Train Char Acc: 0.6410 | Dev Word Acc: 0.0186


                                                             

Epoch 6 | Train Loss: 1.0144 | Train Char Acc: 0.6548 | Dev Word Acc: 0.0273
 -> New best model saved to best_transformer_model_d9h46hb7.pth with dev word accuracy: 0.0273


                                                             

Epoch 7 | Train Loss: 0.9776 | Train Char Acc: 0.6673 | Dev Word Acc: 0.0202


                                                             

Epoch 8 | Train Loss: 0.9483 | Train Char Acc: 0.6770 | Dev Word Acc: 0.0211

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.0198


Generating Test Predictions: 100%|██████████| 4502/4502 [05:12<00:00, 14.41it/s]



Predictions saved to: test_predictions.csv
Test predictions saved to test_predictions.csv


0,1
dev_word_accuracy,▁▃▇▇▅█▆▆
epoch,▁▂▃▄▅▆▇█
train_char_accuracy,▁▄▅▆▇▇██
train_loss,█▄▃▃▂▂▁▁

0,1
dev_word_accuracy,0.02111
epoch,7.0
train_char_accuracy,0.677
train_loss,0.94827


[34m[1mwandb[0m: Agent Starting Run: 46au1jj8 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 512
[34m[1mwandb[0m: 	dim_feedforward: 1024
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	lr: 0.0005160100581465826
[34m[1mwandb[0m: 	nhead: 8
[34m[1mwandb[0m: 	num_decoder_layers: 4
[34m[1mwandb[0m: 	num_encoder_layers: 4


                                                           

Epoch 1 | Train Loss: 2.6923 | Train Char Acc: 0.2304 | Dev Word Acc: 0.0007
 -> New best model saved to best_transformer_model_46au1jj8.pth with dev word accuracy: 0.0007


                                                           

Epoch 2 | Train Loss: 2.1088 | Train Char Acc: 0.3287 | Dev Word Acc: 0.0016
 -> New best model saved to best_transformer_model_46au1jj8.pth with dev word accuracy: 0.0016


                                                           

Epoch 3 | Train Loss: 2.0317 | Train Char Acc: 0.3453 | Dev Word Acc: 0.0041
 -> New best model saved to best_transformer_model_46au1jj8.pth with dev word accuracy: 0.0041


                                                           

Epoch 4 | Train Loss: 1.9745 | Train Char Acc: 0.3592 | Dev Word Acc: 0.0085
 -> New best model saved to best_transformer_model_46au1jj8.pth with dev word accuracy: 0.0085


                                                           

Epoch 5 | Train Loss: 1.9373 | Train Char Acc: 0.3679 | Dev Word Acc: 0.0060


                                                           

Epoch 6 | Train Loss: 1.9193 | Train Char Acc: 0.3732 | Dev Word Acc: 0.0048


                                                           

Epoch 7 | Train Loss: 1.8913 | Train Char Acc: 0.3809 | Dev Word Acc: 0.0076


                                                           

Epoch 8 | Train Loss: 1.8595 | Train Char Acc: 0.3888 | Dev Word Acc: 0.0108
 -> New best model saved to best_transformer_model_46au1jj8.pth with dev word accuracy: 0.0108

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.0102


Generating Test Predictions: 100%|██████████| 4502/4502 [04:02<00:00, 18.55it/s]



Predictions saved to: test_predictions.csv
Test predictions saved to test_predictions.csv


0,1
dev_word_accuracy,▁▂▃▆▅▄▆█
epoch,▁▂▃▄▅▆▇█
train_char_accuracy,▁▅▆▇▇▇██
train_loss,█▃▂▂▂▂▁▁

0,1
dev_word_accuracy,0.01078
epoch,7.0
train_char_accuracy,0.38881
train_loss,1.8595


[34m[1mwandb[0m: Agent Starting Run: 6dlrfv2c with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 512
[34m[1mwandb[0m: 	dim_feedforward: 1024
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	lr: 0.00026237109157088695
[34m[1mwandb[0m: 	nhead: 16
[34m[1mwandb[0m: 	num_decoder_layers: 2
[34m[1mwandb[0m: 	num_encoder_layers: 2


                                                           

Epoch 1 | Train Loss: 1.9560 | Train Char Acc: 0.4012 | Dev Word Acc: 0.0021
 -> New best model saved to best_transformer_model_6dlrfv2c.pth with dev word accuracy: 0.0021


                                                           

Epoch 2 | Train Loss: 1.5340 | Train Char Acc: 0.4958 | Dev Word Acc: 0.0057
 -> New best model saved to best_transformer_model_6dlrfv2c.pth with dev word accuracy: 0.0057


                                                           

Epoch 3 | Train Loss: 1.4163 | Train Char Acc: 0.5292 | Dev Word Acc: 0.0089
 -> New best model saved to best_transformer_model_6dlrfv2c.pth with dev word accuracy: 0.0089


                                                           

Epoch 4 | Train Loss: 1.3438 | Train Char Acc: 0.5491 | Dev Word Acc: 0.0067


                                                           

Epoch 5 | Train Loss: 1.2897 | Train Char Acc: 0.5663 | Dev Word Acc: 0.0078


                                                           

Epoch 6 | Train Loss: 1.2462 | Train Char Acc: 0.5790 | Dev Word Acc: 0.0096
 -> New best model saved to best_transformer_model_6dlrfv2c.pth with dev word accuracy: 0.0096


                                                           

Epoch 7 | Train Loss: 1.2146 | Train Char Acc: 0.5894 | Dev Word Acc: 0.0106
 -> New best model saved to best_transformer_model_6dlrfv2c.pth with dev word accuracy: 0.0106


                                                           

Epoch 8 | Train Loss: 1.1851 | Train Char Acc: 0.5984 | Dev Word Acc: 0.0128
 -> New best model saved to best_transformer_model_6dlrfv2c.pth with dev word accuracy: 0.0128

Training complete. Loading best model for final evaluation on test set...


                                                               


--- Final Test Set Evaluation Results ---
Word-level Accuracy on Test Set: 0.0164


Generating Test Predictions:  36%|███▋      | 1641/4502 [01:10<02:01, 23.51it/s]

## For Test data

In [1]:
import numpy as np

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import math
import csv
from collections import namedtuple

# ---------------- Data Processing and Utilities ----------------

class TransliterationDataset(Dataset):
    """
    A PyTorch Dataset for transliteration data.
    """
    def __init__(self, pairs, input_vocab, output_vocab):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        self.sos = output_vocab['<sos>']
        self.eos = output_vocab['<eos>']
        self.unk_in = input_vocab.get('<unk>', 1)
        self.unk_out = output_vocab.get('<unk>', 3)

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        input_ids = [self.input_vocab.get(c, self.unk_in) for c in source]
        target_ids = [self.sos] + [self.output_vocab.get(c, self.unk_out) for c in target] + [self.eos]
        return torch.tensor(input_ids), torch.tensor(target_ids)

def build_vocab(pairs):
    """
    Builds character-level vocabularies from a list of (source, target) pairs.
    """
    input_chars = set()
    output_chars = set()
    for src, tgt in pairs:
        input_chars.update(src)
        output_chars.update(tgt)
    
    input_vocab = {c: i + 2 for i, c in enumerate(sorted(input_chars))}
    input_vocab['<pad>'] = 0
    input_vocab['<unk>'] = 1
    
    output_vocab = {c: i + 4 for i, c in enumerate(sorted(output_chars))}
    output_vocab.update({'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3})
    
    return input_vocab, output_vocab

def load_pairs(path):
    """
    Loads transliteration pairs from a TSV file.
    """
    df = pd.read_csv(path, sep='\t', header=None, names=['target', 'source', 'count'], dtype=str)
    df.dropna(subset=["source", "target"], inplace=True)
    return list(zip(df['source'], df['target']))

def collate_fn(batch):
    """
    Pads sequences in a batch to the same length.
    """
    inputs, targets = zip(*batch)
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded

# ---------------- Transformer Specific Components ----------------

class PositionalEncoding(nn.Module):
    """
    Injects positional information into the input embeddings.
    """
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """
    The main Transformer model for sequence-to-sequence transliteration.
    """
    def __init__(self, input_vocab_size, output_vocab_size, d_model, nhead, num_encoder_layers,
                 num_decoder_layers, dim_feedforward, dropout):
        super().__init__()
        
        self.d_model = d_model
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model, padding_idx=0)
        self.decoder_embedding = nn.Embedding(output_vocab_size, d_model, padding_idx=0)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        
        self.fc_out = nn.Linear(d_model, output_vocab_size)
        self.output_vocab_size = output_vocab_size
        self.sos_idx = 1
        self.eos_idx = 2

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None):
        src_embedded = self.positional_encoding(self.encoder_embedding(src) * math.sqrt(self.d_model))
        tgt_embedded = self.positional_encoding(self.decoder_embedding(tgt) * math.sqrt(self.d_model))
        
        transformer_out = self.transformer(
            src_embedded, tgt_embedded,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )
        
        output = self.fc_out(transformer_out)
        return output

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def create_padding_mask(self, seq, pad_idx=0):
        return (seq == pad_idx)

# ---------------- Training and Evaluation Functions ----------------

def accuracy(preds, targets, pad_idx=0):
    """
    Calculates character-level accuracy, ignoring padding tokens.
    """
    pred_tokens = preds.argmax(dim=-1)
    correct = ((pred_tokens == targets) & (targets != pad_idx)).sum().item()
    total = (targets != pad_idx).sum().item()
    return correct / total if total > 0 else 0.0

@torch.no_grad()
def evaluate_and_sample(model, dataloader, device, input_vocab, output_vocab, num_samples=10):
    """
    Evaluates the model's word-level accuracy and returns a sample of predictions.
    """
    model.eval()
    correct_words = 0
    total_words = 0
    
    inv_input_vocab = {v: k for k, v in input_vocab.items()}
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    sample_predictions = []
    samples_collected = 0

    for src, tgt in tqdm(dataloader, desc="Evaluating", leave=False):
        src, tgt = src.to(device), tgt.to(device)

        src_padding_mask = model.create_padding_mask(src).to(device)
        batch_size = src.size(0)
        max_len = 20
        
        generated_tokens = torch.full((batch_size, 1), model.sos_idx, dtype=torch.long, device=device)
        
        for t in range(max_len):
            tgt_mask = model.generate_square_subsequent_mask(generated_tokens.size(1)).to(device)
            tgt_padding_mask = model.create_padding_mask(generated_tokens).to(device)
            
            output = model(src, generated_tokens, 
                           src_key_padding_mask=src_padding_mask, 
                           tgt_key_padding_mask=tgt_padding_mask, 
                           tgt_mask=tgt_mask)
            
            next_token = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
            generated_tokens = torch.cat([generated_tokens, next_token], dim=1)
            
            if (next_token == model.eos_idx).all():
                break

        for i in range(batch_size):
            pred_seq = generated_tokens[i]
            target_seq = tgt[i]
            
            pred_end = (pred_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            target_end = (target_seq == model.eos_idx).nonzero(as_tuple=True)[0]
            
            pred_word = pred_seq[1:pred_end[0] if pred_end.numel() > 0 else len(pred_seq)]
            target_word = target_seq[1:target_end[0] if target_end.numel() > 0 else len(target_seq)]

            if torch.equal(pred_word, target_word):
                correct_words += 1
            total_words += 1
            
            if samples_collected < num_samples:
                inp_str = ''.join([inv_input_vocab[t.item()] for t in src[i] if t.item() != 0])
                pred_str = ''.join([inv_output_vocab.get(t.item(), '<unk>') for t in pred_word if t.item() not in [model.sos_idx, model.eos_idx, 0]])
                truth_str = ''.join([inv_output_vocab.get(t.item(), '<unk>') for t in target_word if t.item() not in [model.sos_idx, model.eos_idx, 0]])
                sample_predictions.append((inp_str, pred_str, truth_str))
                samples_collected += 1
            
    return correct_words / total_words if total_words > 0 else 0.0, sample_predictions

def train_epoch(model, loader, optimizer, criterion, device):
    """
    Trains the model for one epoch.
    """
    model.train()
    total_loss, total_char_acc = 0, 0
    for src, tgt in tqdm(loader, desc="Training", leave=False):
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()
        
        src_padding_mask = model.create_padding_mask(src).to(device)
        tgt_input = tgt[:, :-1]
        
        tgt_padding_mask = model.create_padding_mask(tgt_input).to(device)
        
        tgt_output = tgt[:, 1:]

        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        
        output = model(src, tgt_input, 
                       src_key_padding_mask=src_padding_mask, 
                       tgt_key_padding_mask=tgt_padding_mask, 
                       tgt_mask=tgt_mask)
        
        loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
        char_acc = accuracy(output, tgt_output)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_char_acc += char_acc
        
    return total_loss / len(loader), total_char_acc / len(loader)

def generate_predictions_csv(model, dataloader, input_vocab, output_vocab, device, csv_path):
    """
    Generates predictions for a test set and saves them to a CSV file.
    """
    model.eval()
    inv_input_vocab = {v: k for k, v in input_vocab.items()}
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    results = []

    with torch.no_grad():
        for src, tgt in tqdm(dataloader, desc="Generating Test Predictions"):
            src = src.to(device)
            batch_size = src.size(0)
            max_len = 20

            src_padding_mask = model.create_padding_mask(src).to(device)
            generated_tokens = torch.full((batch_size, 1), model.sos_idx, dtype=torch.long, device=device)
            
            for t in range(max_len):
                tgt_mask = model.generate_square_subsequent_mask(generated_tokens.size(1)).to(device)
                tgt_padding_mask = model.create_padding_mask(generated_tokens).to(device)
                
                output = model(src, generated_tokens, 
                               src_key_padding_mask=src_padding_mask, 
                               tgt_key_padding_mask=tgt_padding_mask, 
                               tgt_mask=tgt_mask)
                
                next_token = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
                generated_tokens = torch.cat([generated_tokens, next_token], dim=1)
                
                if (next_token == model.eos_idx).all():
                    break

            for i in range(batch_size):
                pred_seq = generated_tokens[i]
                target_seq = tgt[i]
                
                pred_end = (pred_seq == model.eos_idx).nonzero(as_tuple=True)[0]
                target_end = (target_seq == model.eos_idx).nonzero(as_tuple=True)[0]
                
                pred_word_tokens = pred_seq[1:pred_end[0] if pred_end.numel() > 0 else len(pred_seq)]
                truth_word_tokens = target_seq[1:target_end[0] if target_end.numel() > 0 else len(target_seq)]

                pred_str = ''.join([inv_output_vocab.get(t.item(), '<unk>') for t in pred_word_tokens if t.item() not in [model.sos_idx, model.eos_idx, 0]])
                truth_str = ''.join([inv_output_vocab.get(t.item(), '<unk>') for t in truth_word_tokens if t.item() not in [model.sos_idx, model.eos_idx, 0]])
                inp_str = ''.join([inv_input_vocab.get(t.item(), '<unk>') for t in src[i] if t.item() != 0])
                results.append((inp_str, pred_str, truth_str))
    
    with open(csv_path, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Input', 'Prediction', 'GroundTruth'])
        writer.writerows(results)
    print(f"\nPredictions saved to: {csv_path}")

# ---------------- Main Function ----------------

def main():
    # Model Hyperparameters
    # These are fixed values for a simple run.
    # You can change them to explore different configurations.
    config = namedtuple("Config", [
        "d_model", "nhead", "num_encoder_layers", "num_decoder_layers",
        "dim_feedforward", "dropout", "lr", "batch_size", "num_epochs"
    ])(
        d_model=256,
        nhead=4,
        num_encoder_layers=4,
        num_decoder_layers=2,
        dim_feedforward=1024,
        dropout=0.1,
        lr=0.0005,
        batch_size=32,
        num_epochs=10
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load data
    train_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    dev_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")
    test_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv")

    # Build vocab on train + dev pairs for consistency
    input_vocab, output_vocab = build_vocab(train_pairs + dev_pairs)
    train_dataset = TransliterationDataset(train_pairs, input_vocab, output_vocab)
    dev_dataset = TransliterationDataset(dev_pairs, input_vocab, output_vocab)
    test_dataset = TransliterationDataset(test_pairs, input_vocab, output_vocab)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

    model = TransformerModel(
        input_vocab_size=len(input_vocab),
        output_vocab_size=len(output_vocab),
        d_model=config.d_model,
        nhead=config.nhead,
        num_encoder_layers=config.num_encoder_layers,
        num_decoder_layers=config.num_decoder_layers,
        dim_feedforward=config.dim_feedforward,
        dropout=config.dropout
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, betas=(0.9, 0.98), eps=1e-9)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_dev_acc = 0
    best_model_path = 'best_transformer_model.pth'

    # Training loop
    for epoch in range(config.num_epochs):
        train_loss, _ = train_epoch(model, train_loader, optimizer, criterion, device)
        dev_word_acc, dev_samples = evaluate_and_sample(model, dev_loader, device, input_vocab, output_vocab, num_samples=10)
        
        print(f"\nEpoch {epoch+1} Train Loss: {train_loss:.4f}\n")
        print(f" Test Accuracy: {dev_word_acc:.2%}")
        for inp, pred, truth in dev_samples:
            print(f"{inp:<15}| Pred: {pred:<20}| Truth: {truth}")
        
        if dev_word_acc > best_dev_acc:
            best_dev_acc = dev_word_acc
            torch.save(model.state_dict(), best_model_path)
            print(f"\n -> New best model saved to {best_model_path} with dev word accuracy: {best_dev_acc:.4f}")

    print("\n Loading best model for final evaluation...")
    
    # Load the best model found during this run
    try:
        model.load_state_dict(torch.load(best_model_path))
    except FileNotFoundError:
        print("Error: Best model checkpoint not found. Using the last trained model.")
        
    final_test_word_acc, test_samples = evaluate_and_sample(model, test_loader, device, input_vocab, output_vocab, num_samples=10)
    print(f"\n Final Test Accuracy: {final_test_word_acc:.2%}")
    for inp, pred, truth in test_samples:
        print(f"{inp:<15}| Pred: {pred:<20}| Truth: {truth}")
    
    generate_predictions_csv(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")

if __name__ == "__main__":
    main()

Using device: cuda


  output = torch._nested_tensor_from_mask(
                                                             


Epoch 1 Train Loss: 1.6923

 Test Accuracy: 0.80%
ankan          | Pred: काना                | Truth: अंकन
angkor         | Pred: कार्णग              | Truth: अंगकोर
angira         | Pred: अंग्री              | Truth: अंगिरा
angithi        | Pred: इंग्थिया            | Truth: अंगीठी
angrej         | Pred: जरेंग               | Truth: अंग्रेज
angrejon       | Pred: जारों               | Truth: अंग्रेजों
anjaam         | Pred: मजना                | Truth: अंजाम
anjam          | Pred: मजना                | Truth: अंजाम
antakaran      | Pred: नात्रक              | Truth: अंतकरण
antkaran       | Pred: नात्रक              | Truth: अंतकरण

 -> New best model saved to best_transformer_model.pth with dev word accuracy: 0.0080


                                                             


Epoch 2 Train Loss: 1.3372

 Test Accuracy: 1.49%
ankan          | Pred: कानार्ण             | Truth: अंकन
angkor         | Pred: कर्गों              | Truth: अंगकोर
angira         | Pred: गरणिर्णा            | Truth: अंगिरा
angithi        | Pred: गित्नि              | Truth: अंगीठी
angrej         | Pred: जार्गंजी            | Truth: अंग्रेज
angrejon       | Pred: जानोर्गं            | Truth: अंग्रेजों
anjaam         | Pred: माजन                | Truth: अंजाम
anjam          | Pred: मजान                | Truth: अंजाम
antakaran      | Pred: नतर्कतारण           | Truth: अंतकरण
antkaran       | Pred: नात्रकरण            | Truth: अंतकरण

 -> New best model saved to best_transformer_model.pth with dev word accuracy: 0.0149


                                                             


Epoch 3 Train Loss: 1.2232

 Test Accuracy: 1.65%
ankan          | Pred: कान्नायक            | Truth: अंकन
angkor         | Pred: कार्गणोंकर          | Truth: अंगकोर
angira         | Pred: निग्रारण            | Truth: अंगिरा
angithi        | Pred: निघट                | Truth: अंगीठी
angrej         | Pred: राजनगीर             | Truth: अंग्रेज
angrejon       | Pred: रांगजन              | Truth: अंग्रेजों
anjaam         | Pred: जमन्मान             | Truth: अंजाम
anjam          | Pred: जमन्मान             | Truth: अंजाम
antakaran      | Pred: नाकार्तानक          | Truth: अंतकरण
antkaran       | Pred: नाक्रतान            | Truth: अंतकरण

 -> New best model saved to best_transformer_model.pth with dev word accuracy: 0.0165


Training:  19%|█▉        | 264/1382 [00:06<00:24, 46.34it/s]