In [1]:
import wandb
wandb.login(key="fb4c8007ed0d1fb692b2279b11bb69081f2c698d")

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mma23c014[0m ([33mma23c014-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [3]:
# Repro 
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# =============== Data ====================
class TransliterationDataset(Dataset):
    def __init__(self, pairs, input_vocab, output_vocab):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        self.sos = output_vocab['<sos>']
        self.eos = output_vocab['<eos>']

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        # map OOV chars to <pad>=0 silently
        input_ids = [self.input_vocab.get(c, 0) for c in source]
        target_ids = [self.sos] + [self.output_vocab.get(c, 0) for c in target] + [self.eos]
        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(target_ids, dtype=torch.long)

def build_vocab(pairs):
    input_chars = set()
    output_chars = set()
    for source, target in pairs:
        input_chars.update(list(source))
        output_chars.update(list(target))
    # 0: <pad>
    input_vocab = {c: i + 1 for i, c in enumerate(sorted(input_chars))}
    input_vocab['<pad>'] = 0
    # 0:<pad> 1:<sos> 2:<eos>
    output_vocab = {c: i + 3 for i, c in enumerate(sorted(output_chars))}
    output_vocab.update({'<pad>': 0, '<sos>': 1, '<eos>': 2})
    return input_vocab, output_vocab

def invert_vocab(v):
    return {i: c for c, i in v.items()}

def load_pairs(path):
    # Dakshina TSV: target \t source \t count
    df = pd.read_csv(path, sep="\t", header=None, names=["target", "source", "count"], dtype=str)
    df.dropna(subset=["source", "target"], inplace=True)
    # Strip whitespace just in case
    df["source"] = df["source"].astype(str).str.strip()
    df["target"] = df["target"].astype(str).str.strip()
    return list(zip(df["source"], df["target"]))

def collate_fn(batch):
    inputs, targets = zip(*batch)
    input_lens = [len(seq) for seq in inputs]
    target_lens = [len(seq) for seq in targets]
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded, input_lens, target_lens

# =============== Models ==================
class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(input_size, embed_size, padding_idx=0)
        rnn_class = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_class(
            embed_size, hidden_size, num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

    def forward(self, x, lengths):
        x = self.embedding(x)  # (B, T, E)
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        # We only need the final hidden(s) to initialize decoder
        return hidden  # GRU/RNN: (num_layers, B, H). LSTM: tuple((num_layers, B, H), (num_layers, B, H))

class Decoder(nn.Module):
    def __init__(self, output_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(output_size, embed_size, padding_idx=0)
        rnn_class = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_class(
            embed_size, hidden_size, num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_token, hidden):
        """
        input_token: (B,) longs
        hidden: same type/shape as encoder hidden
        returns: logits (B, V), new_hidden
        """
        x = self.embedding(input_token).unsqueeze(1)  # (B, 1, E)
        output, hidden = self.rnn(x, hidden)          # output: (B, 1, H)
        logits = self.fc(output.squeeze(1))           # (B, V)
        return logits, hidden

    @torch.no_grad()
    def beam_search(self, hidden, max_len, sos_idx, eos_idx, beam_size=3):
        """
        Non-batched beam search (runs per sample). Handles LSTM/GRU hidden.
        hidden is either:
          - Tensor (num_layers, 1, H) OR
          - Tuple(h, c) with each (num_layers, 1, H)
        """
        device = next(self.parameters()).device

        def clone_hidden(h):
            if isinstance(h, tuple):
                return (h[0].clone(), h[1].clone())
            else:
                return h.clone()

        # Each item: (seq[LongTensor], hidden, log_prob)
        start_seq = torch.tensor([sos_idx], device=device, dtype=torch.long)
        sequences = [(start_seq, clone_hidden(hidden), 0.0)]
        completed = []

        for _ in range(max_len):
            new_sequences = []
            for seq, h, score in sequences:
                last_token = seq[-1].view(1)  # (1,)
                logits, new_h = self.forward(last_token, h)
                log_probs = torch.log_softmax(logits, dim=-1).squeeze(0)  # (V,)
                topk_logp, topk_idx = torch.topk(log_probs, beam_size)

                for lp, idx in zip(topk_logp, topk_idx):
                    idx = idx.item()
                    new_seq = torch.cat([seq, torch.tensor([idx], device=device)])
                    new_score = score + lp.item()
                    new_sequences.append((new_seq, clone_hidden(new_h), new_score))

            # Keep top-k
            new_sequences.sort(key=lambda x: x[2], reverse=True)
            sequences = new_sequences[:beam_size]

            # Move completed to list
            still_running = []
            for seq, h, score in sequences:
                if seq[-1].item() == eos_idx:
                    completed.append((seq, h, score))
                else:
                    still_running.append((seq, h, score))
            sequences = still_running
            if not sequences:
                break

        if not completed:
            completed = sequences
        completed.sort(key=lambda x: x[2], reverse=True)
        return completed[0][0]  # best seq

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, sos_idx=1, eos_idx=2, max_len=40):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.max_len = max_len

    def forward(self, src, src_lens, tgt=None, teacher_forcing_ratio=0.5):
        """
        If tgt is provided: training mode (returns logits over time).
        Else: returns list of token sequences (beam search per sample).
        """
        batch_size = src.size(0)
        device = src.device

        # Encode
        hidden = self.encoder(src, src_lens)

        if tgt is not None:
            tgt_len = tgt.size(1)
            vocab_size = self.decoder.fc.out_features
            outputs = torch.zeros(batch_size, tgt_len, vocab_size, device=device)

            input_token = tgt[:, 0]  # <sos>
            dec_hidden = hidden

            for t in range(1, tgt_len):
                logits, dec_hidden = self.decoder(input_token, dec_hidden)
                outputs[:, t] = logits
                teacher_force = torch.rand(1, device=device).item() < teacher_forcing_ratio
                next_token = tgt[:, t] if teacher_force else torch.argmax(logits, dim=-1)
                input_token = next_token
            return outputs
        else:
            # Inference: beam search per example with batch_size=1 hidden slices
            sequences = []
            # Split hidden for each item in batch
            for b in range(batch_size):
                if isinstance(hidden, tuple):
                    h_b = tuple(h[:, b:b+1, :].contiguous() for h in hidden)
                else:
                    h_b = hidden[:, b:b+1, :].contiguous()
                seq = self.decoder.beam_search(
                    h_b, max_len=self.max_len, sos_idx=self.sos_idx, eos_idx=self.eos_idx, beam_size=3
                )
                sequences.append(seq)
            return sequences

# =============== Metrics =================
def char_accuracy(logits, targets, pad_idx=0):
    """
    logits: (B, T, V), targets: (B, T)
    """
    with torch.no_grad():
        preds = logits.argmax(dim=-1)
        mask = (targets != pad_idx)
        correct = ((preds == targets) & mask).sum().item()
        total = mask.sum().item()
        return (correct / total) if total > 0 else 0.0

def levenshtein(a, b):
    """
    Simple DP Levenshtein distance between two strings.
    """
    n, m = len(a), len(b)
    if n == 0: return m
    if m == 0: return n
    dp = [[0]*(m+1) for _ in range(n+1)]
    for i in range(n+1): dp[i][0] = i
    for j in range(m+1): dp[0][j] = j
    for i in range(1, n+1):
        for j in range(1, m+1):
            cost = 0 if a[i-1] == b[j-1] else 1
            dp[i][j] = min(
                dp[i-1][j] + 1,      # delete
                dp[i][j-1] + 1,      # insert
                dp[i-1][j-1] + cost  # substitute
            )
    return dp[n][m]

def decode_greedy(model, src, src_lens, output_ivocab, max_len=40, sos_idx=1, eos_idx=2):
    """
    Greedy decoding for batch (faster than beam for eval metrics).
    Returns list of strings.
    """
    model.eval()
    device = src.device
    batch_size = src.size(0)

    # Encode
    hidden = model.encoder(src, src_lens)

    # Initialize
    input_token = torch.full((batch_size,), sos_idx, dtype=torch.long, device=device)
    dec_hidden = hidden
    outputs = [[] for _ in range(batch_size)]

    for _ in range(max_len):
        logits, dec_hidden = model.decoder(input_token, dec_hidden)  # (B, V)
        next_token = torch.argmax(logits, dim=-1)                    # (B,)
        for b in range(batch_size):
            outputs[b].append(next_token[b].item())
        input_token = next_token

    # Convert ids to strings, stopping at eos
    decoded = []
    for seq in outputs:
        chars = []
        for tok in seq:
            if tok == eos_idx:
                break
            if tok in output_ivocab:
                ch = output_ivocab[tok]
                if ch not in ['<pad>', '<sos>', '<eos>']:
                    chars.append(ch)
        decoded.append(''.join(chars))
    return decoded

def batch_word_accuracy_and_cer(pred_strs, tgt_strs):
    """
    pred_strs, tgt_strs: lists of strings length B
    Returns (word_acc, cer)
    """
    assert len(pred_strs) == len(tgt_strs)
    exact = 0
    total_char_err = 0
    total_char = 0
    for p, t in zip(pred_strs, tgt_strs):
        if p == t:
            exact += 1
        dist = levenshtein(p, t)
        total_char_err += dist
        total_char += max(len(t), 1)
    word_acc = exact / len(pred_strs) if pred_strs else 0.0
    cer = total_char_err / total_char if total_char > 0 else 0.0
    return word_acc, cer

# =============== Train/Eval ==============
def train_one_epoch(model, loader, optimizer, criterion, device, clip_norm=5.0, teacher_forcing_ratio=0.5):
    model.train()
    total_loss, total_acc = 0.0, 0.0
    for src, tgt, src_lens, tgt_lens in tqdm(loader, desc="Training", leave=False):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        logits = model(src, src_lens, tgt, teacher_forcing_ratio=teacher_forcing_ratio)  # (B, T, V)
        # shift to ignore first token (<sos> position 0)
        loss = criterion(logits[:, 1:].reshape(-1, logits.size(-1)), tgt[:, 1:].reshape(-1))
        acc = char_accuracy(logits[:, 1:], tgt[:, 1:])
        loss.backward()
        if clip_norm is not None:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
        optimizer.step()
        total_loss += loss.item()
        total_acc += acc
    n = len(loader)
    return total_loss / n, total_acc / n

@torch.no_grad()
def evaluate(model, loader, criterion, device, output_ivocab, eos_idx=2, sos_idx=1, max_len=40):
    model.eval()
    total_loss, total_char_acc = 0.0, 0.0
    all_pred, all_gold = [], []
    for src, tgt, src_lens, tgt_lens in tqdm(loader, desc="Evaluating", leave=False):
        src, tgt = src.to(device), tgt.to(device)

        # Teacher forcing OFF for loss/char-acc to mimic inference distribution
        logits = model(src, src_lens, tgt, teacher_forcing_ratio=0.0)
        loss = criterion(logits[:, 1:].reshape(-1, logits.size(-1)), tgt[:, 1:].reshape(-1))
        acc = char_accuracy(logits[:, 1:], tgt[:, 1:])
        total_loss += loss.item()
        total_char_acc += acc

        # Word-level metrics via greedy decode
        batch_pred = decode_greedy(model, src, src_lens, output_ivocab, max_len=max_len, sos_idx=sos_idx, eos_idx=eos_idx)

        # Convert gold to string for the same batch
        gold_strs = []
        for seq in tgt.cpu().numpy():
            chars = []
            for tok in seq[1:]:  # skip <sos>
                if tok == eos_idx or tok == 0:
                    break
                ch = output_ivocab.get(int(tok), '')
                if ch not in ['<pad>', '<sos>', '<eos>']:
                    chars.append(ch)
            gold_strs.append(''.join(chars))

        all_pred.extend(batch_pred)
        all_gold.extend(gold_strs)

    n = len(loader)
    avg_loss = total_loss / n if n > 0 else 0.0
    avg_char_acc = total_char_acc / n if n > 0 else 0.0
    word_acc, cer = batch_word_accuracy_and_cer(all_pred, all_gold)
    return avg_loss, avg_char_acc, word_acc, cer

# =============== Main / W&B =============
def main():
    # This function is called by wandb.agent in a sweep
    config = wandb.config

    # Give each run a readable name
    run_name = f"cell:{config.cell_type}_emb:{config.embed_size}_hid:{config.hidden_size}_L:{config.num_layers}_bs:{config.batch_size}_lr:{config.lr}"
    if hasattr(wandb.run, "name") and (wandb.run.name is None or wandb.run.name == ""):
        wandb.run.name = run_name

    set_seed(42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ---- Data ----
    train_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    dev_pairs   = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")

    input_vocab, output_vocab = build_vocab(train_pairs)
    output_ivocab = invert_vocab(output_vocab)

    train_dataset = TransliterationDataset(train_pairs, input_vocab, output_vocab)
    dev_dataset   = TransliterationDataset(dev_pairs,   input_vocab, output_vocab)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True,  collate_fn=collate_fn)
    dev_loader   = DataLoader(dev_dataset,   batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)

    # ---- Model ----
    encoder = Encoder(
        input_size=len(input_vocab),
        embed_size=config.embed_size,
        hidden_size=config.hidden_size,
        num_layers=config.num_layers,
        cell_type=config.cell_type,
        dropout=config.dropout
    )
    decoder = Decoder(
        output_size=len(output_vocab),
        embed_size=config.embed_size,
        hidden_size=config.hidden_size,
        num_layers=config.num_layers,
        cell_type=config.cell_type,
        dropout=config.dropout
    )
    model = Seq2Seq(
        encoder=encoder,
        decoder=decoder,
        sos_idx=output_vocab['<sos>'],
        eos_idx=output_vocab['<eos>'],
        max_len=40
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_val_word_acc = -1.0

    for epoch in range(config.epochs):
        train_loss, train_char_acc = train_one_epoch(
            model, train_loader, optimizer, criterion, device,
            clip_norm=5.0,
            teacher_forcing_ratio=getattr(config, "teacher_forcing", 0.5)
        )
        val_loss, val_char_acc, val_word_acc, val_cer = evaluate(
            model, dev_loader, criterion, device, output_ivocab,
            eos_idx=output_vocab['<eos>'], sos_idx=output_vocab['<sos>'], max_len=40
        )

        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_char_accuracy": train_char_acc,
            "val_loss": val_loss,
            "val_char_accuracy": val_char_acc,
            "val_word_accuracy": val_word_acc,
            "val_CER": val_cer
        })

        # Save best on word accuracy
        if val_word_acc > best_val_word_acc:
            best_val_word_acc = val_word_acc
            save_path = os.path.join(wandb.run.dir, "best_model.pt")
            torch.save({
                "model_state": model.state_dict(),
                "config": dict(config),
                "input_vocab": input_vocab,
                "output_vocab": output_vocab
            }, save_path)
            wandb.log({"best_model_path": save_path, "best_val_word_accuracy": best_val_word_acc})

# =============== Entry ================
if __name__ == "__main__":
    # Define sweep if running this file directly.
    sweep_config = {
        "method": "bayes",
        "metric": {"name": "val_word_accuracy", "goal": "maximize"},
        "parameters": {
            "embed_size":  {"values": [64, 128]},
            "hidden_size": {"values": [128, 256]},
            "num_layers":  {"values": [1, 2]},
            "cell_type":   {"values": ["GRU", "LSTM"]},
            "dropout":     {"values": [0.1, 0.2, 0.3]},
            "lr":          {"min": 1e-4, "max": 5e-3},
            "batch_size":  {"values": [32, 64]},
            "epochs":      {"values": [8]},                  
            "teacher_forcing": {"values": [0.5, 0.6, 0.7]},
        }
    }

    # Initialize sweep
    sweep_id = wandb.sweep(sweep_config, project="Dakshina-Transliteration")

    # The function given here must call wandb.init() internally; we do that implicitly when wandb.agent starts a run.
    # We'll initialize the run at the top of main via wandb.config access.
    def sweep_main():
        # Make sure a run is created and config is readable
        wandb.init(project="Dakshina-Transliteration")
        main()
        wandb.finish()

    # Launch N runs (adjust count as you like)
    wandb.agent(sweep_id, function=sweep_main, count=8)

Create sweep with ID: r72d84fg
Sweep URL: https://wandb.ai/ma23c014-indian-institute-of-technology-madras/dakshina-transliteration/sweeps/r72d84fg


[34m[1mwandb[0m: Agent Starting Run: vfoo6bub with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.0003808171759622872
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	teacher_forcing: 0.6


                                                           

0,1
best_val_word_accuracy,▁▄▆▇▇▇██
epoch,▁▂▃▄▅▆▇█
train_char_accuracy,▁▅▆▇▇███
train_loss,█▄▃▂▂▁▁▁
val_CER,█▄▃▂▂▂▁▁
val_char_accuracy,▁▅▆▇▇▇██
val_loss,█▄▂▂▁▁▁▁
val_word_accuracy,▁▄▆▇▇▇██

0,1
best_model_path,/kaggle/working/wand...
best_val_word_accuracy,0.36508
epoch,7
train_char_accuracy,0.85783
train_loss,0.48068
val_CER,0.19339
val_char_accuracy,0.71221
val_loss,1.10285
val_word_accuracy,0.36508


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: t8o82lp7 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.004006031483882168
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	teacher_forcing: 0.7


                                                             

0,1
best_val_word_accuracy,▁▄▆▇█
epoch,▁▂▃▄▅▆▇█
train_char_accuracy,▁▆▇▇▇███
train_loss,█▃▂▂▁▁▁▁
val_CER,█▄▂▂▁▁▁▁
val_char_accuracy,▁▅▇▇█▇██
val_loss,█▁▂▄▂▆▄▃
val_word_accuracy,▁▄▆▇█▇█▇

0,1
best_model_path,/kaggle/working/wand...
best_val_word_accuracy,0.34901
epoch,7
train_char_accuracy,0.85539
train_loss,0.47978
val_CER,0.21462
val_char_accuracy,0.69458
val_loss,1.27966
val_word_accuracy,0.33846


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7yjwxans with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.0040177054683746575
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	teacher_forcing: 0.7


                                                             

0,1
best_val_word_accuracy,▁▅█
epoch,▁▂▃▄▅▆▇█
train_char_accuracy,▁▇██████
train_loss,█▂▂▁▁▁▁▁
val_CER,█▃▂▁▃▂▄▃
val_char_accuracy,▁██▇▆▆▇▆
val_loss,▂▁▄▆▇▇▆█
val_word_accuracy,▁▅▄█▃▄▃▅

0,1
best_model_path,/kaggle/working/wand...
best_val_word_accuracy,0.23612
epoch,7
train_char_accuracy,0.78217
train_loss,0.72892
val_CER,0.29513
val_char_accuracy,0.61144
val_loss,1.57366
val_word_accuracy,0.22602


[34m[1mwandb[0m: Agent Starting Run: ai2v3o7k with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.00343904249198481
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	teacher_forcing: 0.6


                                                             

0,1
best_val_word_accuracy,▁▅▆▆▇██
epoch,▁▂▃▄▅▆▇█
train_char_accuracy,▁▅▆▇▇███
train_loss,█▄▂▂▂▁▁▁
val_CER,█▄▃▃▁▁▁▁
val_char_accuracy,▁▄▅▆▇▇██
val_loss,█▃▅▄▂▃▁▂
val_word_accuracy,▁▅▆▆▇█▇█

0,1
best_model_path,/kaggle/working/wand...
best_val_word_accuracy,0.2726
epoch,7
train_char_accuracy,0.80944
train_loss,0.6347
val_CER,0.26142
val_char_accuracy,0.65155
val_loss,1.32382
val_word_accuracy,0.2726


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: a0euzo9m with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.004647932765344562
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	teacher_forcing: 0.6


                                                           

0,1
best_val_word_accuracy,▁▅▆▇▇█
epoch,▁▂▃▄▅▆▇█
train_char_accuracy,▁▆▇▇████
train_loss,█▃▂▂▁▁▁▁
val_CER,█▃▂▂▂▁▁▁
val_char_accuracy,▁▅▇▇▇▇▇█
val_loss,██▃▃▃▂▂▁
val_word_accuracy,▁▅▆▇▇▇▇█

0,1
best_model_path,/kaggle/working/wand...
best_val_word_accuracy,0.26801
epoch,7
train_char_accuracy,0.75671
train_loss,0.80208
val_CER,0.25849
val_char_accuracy,0.64901
val_loss,1.28939
val_word_accuracy,0.26801


[34m[1mwandb[0m: Agent Starting Run: nflv14df with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.0017376366776770415
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	teacher_forcing: 0.5


                                                             

0,1
best_val_word_accuracy,▁▅▆▇▇█
epoch,▁▂▃▄▅▆▇█
train_char_accuracy,▁▆▇▇▇███
train_loss,█▃▂▂▁▁▁▁
val_CER,█▃▂▂▃▂▁▂
val_char_accuracy,▁▄▆▇▆▇█▇
val_loss,▆▁▂▂▃▅▅█
val_word_accuracy,▁▅▆▇▇▇█▇

0,1
best_model_path,/kaggle/working/wand...
best_val_word_accuracy,0.32859
epoch,7
train_char_accuracy,0.86687
train_loss,0.44526
val_CER,0.21435
val_char_accuracy,0.69214
val_loss,1.22419
val_word_accuracy,0.31643


[34m[1mwandb[0m: Agent Starting Run: dt051nwy with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.004869115139146405
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	teacher_forcing: 0.7


                                                             

0,1
best_val_word_accuracy,▁▄▆▇█
epoch,▁▂▃▄▅▆▇█
train_char_accuracy,▁▇▇█████
train_loss,█▂▂▁▁▁▁▁
val_CER,█▄▂▂▂▁▁▁
val_char_accuracy,▁▅▆▇▇▇▇█
val_loss,█▆▄▄▃▃▂▁
val_word_accuracy,▁▄▆▇▆▇██

0,1
best_model_path,/kaggle/working/wand...
best_val_word_accuracy,0.22625
epoch,7
train_char_accuracy,0.72255
train_loss,0.91394
val_CER,0.29833
val_char_accuracy,0.61142
val_loss,1.46013
val_word_accuracy,0.22327


[34m[1mwandb[0m: Agent Starting Run: 4266uag1 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	lr: 0.000820041416310915
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	teacher_forcing: 0.6


                                                             

0,1
best_val_word_accuracy,▁▅▇▇█
epoch,▁▂▃▄▅▆▇█
train_char_accuracy,▁▅▆▇▇███
train_loss,█▄▃▂▂▁▁▁
val_CER,█▄▂▂▁▁▁▁
val_char_accuracy,▁▅▇▇████
val_loss,█▂▁▁▁▂▆▆
val_word_accuracy,▁▅▇▇██▇█

0,1
best_model_path,/kaggle/working/wand...
best_val_word_accuracy,0.37907
epoch,7
train_char_accuracy,0.92017
train_loss,0.27098
val_CER,0.192
val_char_accuracy,0.71744
val_loss,1.22975
val_word_accuracy,0.37219


## Test

In [5]:
class TransliterationDataset(Dataset):
    def __init__(self, pairs, input_vocab, output_vocab):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        self.sos = output_vocab['<sos>']
        self.eos = output_vocab['<eos>']

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source, target = self.pairs[idx]
        input_ids = [self.input_vocab[c] for c in source]
        target_ids = [self.sos] + [self.output_vocab[c] for c in target] + [self.eos]
        return torch.tensor(input_ids), torch.tensor(target_ids)

def load_pairs(path):
    df = pd.read_csv(path, sep='\t', header=None, names=['target', 'source', 'count'], dtype=str)
    df.dropna(subset=["source", "target"], inplace=True)
    return list(zip(df['source'], df['target']))

def build_vocab(pairs):
    input_chars = set()
    output_chars = set()
    for src, tgt in pairs:
        input_chars.update(src)
        output_chars.update(tgt)
    input_vocab = {c: i+1 for i, c in enumerate(sorted(input_chars))}
    input_vocab['<pad>'] = 0
    output_vocab = {c: i+3 for i, c in enumerate(sorted(output_chars))}
    output_vocab.update({'<pad>': 0, '<sos>': 1, '<eos>': 2})
    return input_vocab, output_vocab

def collate_fn(batch):
    inputs, targets = zip(*batch)
    input_lens = [len(x) for x in inputs]
    target_lens = [len(x) for x in targets]
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded, input_lens, target_lens

# ---------------- Models ----------------
class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embed_size, padding_idx=0)
        rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_cls(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_size, embed_size, hidden_size, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(output_size, embed_size, padding_idx=0)
        rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.rnn = rnn_cls(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, token, hidden):
        x = self.embedding(token.unsqueeze(1))
        output, hidden = self.rnn(x, hidden)
        output = self.fc(output.squeeze(1))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, src_lens, tgt=None, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        hidden = self.encoder(src, src_lens)
        tgt_len = tgt.size(1)
        outputs = torch.zeros(batch_size, tgt_len, self.decoder.fc.out_features).to(src.device)
        input_token = tgt[:, 0]
        for t in range(1, tgt_len):
            output, hidden = self.decoder(input_token, hidden)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            input_token = tgt[:, t] if teacher_force else output.argmax(1)
        return outputs

# ---------------- Train + Eval ----------------
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for src, tgt, src_lens, _ in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, src_lens, tgt)
        loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_and_save(model, dataloader, input_vocab, output_vocab, device, csv_path=None):
    model.eval()
    inv_input_vocab = {v: k for k, v in input_vocab.items()}
    inv_output_vocab = {v: k for k, v in output_vocab.items()}
    correct = 0
    total = 0
    results = []

    with torch.no_grad():
        for src, tgt, src_lens, _ in dataloader:
            src = src.to(device)
            hidden = model.encoder(src, src_lens)
            input_token = torch.tensor([output_vocab['<sos>']] * src.size(0)).to(device)
            decoded = []
            for _ in range(20):
                output, hidden = model.decoder(input_token, hidden)
                input_token = output.argmax(1)
                decoded.append(input_token)
            decoded = torch.stack(decoded, dim=1)

            for i in range(src.size(0)):
                pred = ''.join([inv_output_vocab[t.item()] for t in decoded[i] if t.item() not in [output_vocab['<eos>'], 0]])
                truth = ''.join([inv_output_vocab[t.item()] for t in tgt[i][1:-1]])
                inp = ''.join([inv_input_vocab[t.item()] for t in src[i] if t.item() != 0])
                results.append((inp, pred, truth))
                if pred == truth:
                    correct += 1
                total += 1

    acc = correct / total * 100
    print(f"\n Test Accuracy: {acc:.2f}%")
    for inp, pred, truth in results[:10]:
        print(f"{inp:<15} | Pred: {pred:<20} | Truth: {truth}")

    if csv_path is not None:
        with open(csv_path, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Input', 'Prediction', 'GroundTruth'])
            writer.writerows(results)
        print(f"\n Predictions saved to: {csv_path}")

    return acc, results


# ------------ Run ----------------
if __name__ == "__main__":
    config = {
        "embed_size": 128,
        "hidden_size": 256,
        "num_layers": 3,
        "cell_type": "LSTM",
        "dropout": 0.2,
        "batch_size": 32,
        "lr": 0.00082004,
        "epochs": 10,
    }


    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    test_pairs = load_pairs("/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv")
    input_vocab, output_vocab = build_vocab(train_pairs)
    train_dataset = TransliterationDataset(train_pairs, input_vocab, output_vocab)
    test_dataset = TransliterationDataset(test_pairs, input_vocab, output_vocab)

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

    encoder = Encoder(len(input_vocab), config["embed_size"], config["hidden_size"],
                      config["num_layers"], config["cell_type"], config["dropout"])
    decoder = Decoder(len(output_vocab), config["embed_size"], config["hidden_size"],
                      config["num_layers"], config["cell_type"], config["dropout"])
    model = Seq2Seq(encoder, decoder).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_acc = 0
    for epoch in range(config["epochs"]):
        train_loss = train_model(model, train_loader, optimizer, criterion, device)
        print(f"Epoch {epoch+1} Train Loss: {train_loss:.4f}")
        acc, results = evaluate_and_save(model, test_loader, input_vocab, output_vocab, device, csv_path=None)
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), "best_model.pth")

    print("\n Loading best model for final evaluation...")
    model.load_state_dict(torch.load("best_model.pth"))

    # Save predictions CSV here
    evaluate_and_save(model, test_loader, input_vocab, output_vocab, device, csv_path="test_predictions.csv")

Epoch 1 Train Loss: 1.7050

 Test Accuracy: 22.57%
ank             | Pred: एंक                  | Truth: अंक
anka            | Pred: अंका                 | Truth: अंक
ankit           | Pred: अंकित                | Truth: अंकित
anakon          | Pred: अनकों                | Truth: अंकों
ankhon          | Pred: अंखों                | Truth: अंकों
ankon           | Pred: एंकों                | Truth: अंकों
angkor          | Pred: अंगकोर               | Truth: अंकोर
ankor           | Pred: एंकोर                | Truth: अंकोर
angaarak        | Pred: अंगाकर               | Truth: अंगारक
angarak         | Pred: अंगरक                | Truth: अंगारक
Epoch 2 Train Loss: 0.8184

 Test Accuracy: 29.05%
ank             | Pred: अंक                  | Truth: अंक
anka            | Pred: अंका                 | Truth: अंक
ankit           | Pred: अंकित                | Truth: अंकित
anakon          | Pred: अनाकों               | Truth: अंकों
ankhon          | Pred: अंखों                | Truth: अंकों
anko

___
___

#  **$$Transformer-Model$$**

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import math
import random
import wandb

In [4]:
# Dataset
class DakshinaDataset(Dataset):
    def __init__(self, file_path, inp_vocab=None, tgt_vocab=None, build_vocab=False):
        self.pairs = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) >= 2:
                    self.pairs.append((parts[0], parts[1]))  # (input_word, output_word)

        if len(self.pairs) == 0:
            raise ValueError(f"No valid data found in file: {file_path}")

        if build_vocab:
            self.inp_vocab = self.build_vocab([p[0] for p in self.pairs])
            self.tgt_vocab = self.build_vocab([p[1] for p in self.pairs])
        else:
            self.inp_vocab = inp_vocab
            self.tgt_vocab = tgt_vocab

    def build_vocab(self, texts):
        vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
        idx = 4
        for text in texts:
            for ch in text:
                if ch not in vocab:
                    vocab[ch] = idx
                    idx += 1
        return vocab

    def encode(self, text, vocab):
        return [vocab.get(ch, vocab["<unk>"]) for ch in text]

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        inp, tgt = self.pairs[idx]
        inp_ids = [self.inp_vocab["<sos>"]] + self.encode(inp, self.inp_vocab) + [self.inp_vocab["<eos>"]]
        tgt_ids = [self.tgt_vocab["<sos>"]] + self.encode(tgt, self.tgt_vocab) + [self.tgt_vocab["<eos>"]]
        return torch.tensor(inp_ids), torch.tensor(tgt_ids)


def collate_fn(batch):
    inps, tgts = zip(*batch)
    inp_lens = [len(x) for x in inps]
    tgt_lens = [len(x) for x in tgts]
    max_inp = max(inp_lens)
    max_tgt = max(tgt_lens)
    inp_pad = torch.zeros(len(batch), max_inp, dtype=torch.long)
    tgt_pad = torch.zeros(len(batch), max_tgt, dtype=torch.long)
    for i, (inp, tgt) in enumerate(zip(inps, tgts)):
        inp_pad[i, :len(inp)] = inp
        tgt_pad[i, :len(tgt)] = tgt
    return inp_pad, tgt_pad


# Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, inp_vocab_size, tgt_vocab_size, d_model=256, nhead=4, num_layers=3, dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding_inp = nn.Embedding(inp_vocab_size, d_model)
        self.embedding_tgt = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = nn.Embedding(500, d_model)
        self.pos_decoder = nn.Embedding(500, d_model)

        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead, num_encoder_layers=num_layers,
            num_decoder_layers=num_layers, dim_feedforward=dim_feedforward,
            dropout=dropout, batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        src_pos = torch.arange(0, src.size(1), device=src.device).unsqueeze(0)
        tgt_pos = torch.arange(0, tgt.size(1), device=src.device).unsqueeze(0)

        src_emb = self.embedding_inp(src) * math.sqrt(self.d_model) + self.pos_encoder(src_pos)
        tgt_emb = self.embedding_tgt(tgt) * math.sqrt(self.d_model) + self.pos_decoder(tgt_pos)

        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device)

        output = self.transformer(src_emb, tgt_emb, tgt_mask=tgt_mask)
        return self.fc_out(output)


# -----------------------------
# Training & Evaluation
# -----------------------------
def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        tgt_inp = tgt[:, :-1]
        tgt_out = tgt[:, 1:]

        optimizer.zero_grad()
        output = model(src, tgt_inp)
        loss = criterion(output.reshape(-1, output.size(-1)), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


def evaluate(model, loader, criterion, inp_vocab, tgt_vocab, device, print_samples=False):
    model.eval()
    total, correct = 0, 0
    inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
    inv_inp_vocab = {v: k for k, v in inp_vocab.items()}

    samples = []
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_inp = tgt[:, :-1]
            tgt_out = tgt[:, 1:]
            output = model(src, tgt_inp)
            pred_tokens = output.argmax(-1)

            total += tgt_out.numel()
            correct += (pred_tokens == tgt_out).sum().item()

            if print_samples and len(samples) < 10:
                for i in range(min(5, src.size(0))):
                    inp_text = "".join(inv_inp_vocab.get(x.item(), "") for x in src[i] if x.item() > 3)
                    pred_text = "".join(inv_tgt_vocab.get(x.item(), "") for x in pred_tokens[i] if x.item() > 3)
                    truth_text = "".join(inv_tgt_vocab.get(x.item(), "") for x in tgt[i] if x.item() > 3)
                    samples.append((inp_text, pred_text, truth_text))

    acc = correct / total
    return acc, samples


# Main Training Script
def main():
    wandb.init(project="dakshina-transformer", config={
        "epochs": 25,
        "batch_size": 64,
        "lr": 0.001,
        "d_model": 256,
        "nhead": 4,
        "num_layers": 3,
        "dropout": 0.1
    })
    config = wandb.config

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_file = "/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    dev_file = "/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"

    train_dataset = DakshinaDataset(train_file, build_vocab=True)
    dev_dataset = DakshinaDataset(dev_file, inp_vocab=train_dataset.inp_vocab, tgt_vocab=train_dataset.tgt_vocab)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, collate_fn=collate_fn)

    model = TransformerModel(len(train_dataset.inp_vocab), len(train_dataset.tgt_vocab),
                             d_model=config.d_model, nhead=config.nhead,
                             num_layers=config.num_layers, dropout=config.dropout).to(device)

    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_acc = 0.0
    for epoch in range(1, config.epochs + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
        dev_acc, samples = evaluate(model, dev_loader, criterion,
                                    train_dataset.inp_vocab, train_dataset.tgt_vocab, device,
                                    print_samples=True)

        print(f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Dev Accuracy: {dev_acc:.4f}")
        for inp, pred, truth in samples:
            print(f"{inp:15} | Pred: {pred:15} | Truth: {truth}")

        wandb.log({"epoch": epoch, "train_loss": train_loss, "dev_accuracy": dev_acc})
        

        if dev_acc > best_acc:
            best_acc = dev_acc
            torch.save(model.state_dict(), "best_transformer.pt")
            print("Best model saved.")
            
        print()    

    print("Training finished. Best Dev Accuracy:", best_acc)


if __name__ == "__main__":
    main()

Epoch 1, Train Loss: 1.3944, Dev Accuracy: 0.4414
अंकन            | Pred: ankan           | Truth: ankan
अंगकोर          | Pred: aggaorooooo     | Truth: angkor
अंगिरा          | Pred: angara          | Truth: angira
अंगीठी          | Pred: angathiiiiii    | Truth: angithi
अंग्रेज         | Pred: angrej          | Truth: angrej
अधिकत           | Pred: adhikttiiiii    | Truth: adhikat
अधिकांशत        | Pred: adhiksnnthtt    | Truth: adhikaanshat
अधिकांशत        | Pred: adhiksnthtta    | Truth: adhikanshat
अधिकाश          | Pred: adhisssshiii    | Truth: adhikaash
अधिकाश          | Pred: adhissshihiii   | Truth: adhikash
Best model saved.

Epoch 2, Train Loss: 0.7507, Dev Accuracy: 0.4860
अंकन            | Pred: anaan           | Truth: ankan
अंगकोर          | Pred: angkora         | Truth: angkor
अंगिरा          | Pred: angiraaaaaaa    | Truth: angira
अंगीठी          | Pred: angith          | Truth: angithi
अंग्रेज         | Pred: angrej          | Truth: angrej
अधिकत           | Pred: 

## For Test data

In [8]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import math
import csv
from collections import namedtuple

#### Single Run

In [9]:
# Load Dataset + Collate Fn
class DakshinaDataset(torch.utils.data.Dataset):
    def __init__(self, file_path, inp_vocab=None, tgt_vocab=None, build_vocab=False):
        self.pairs = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) >= 2:
                    self.pairs.append((parts[0], parts[1]))

        if build_vocab:
            self.inp_vocab = self.build_vocab([p[0] for p in self.pairs])
            self.tgt_vocab = self.build_vocab([p[1] for p in self.pairs])
        else:
            self.inp_vocab = inp_vocab
            self.tgt_vocab = tgt_vocab

    def build_vocab(self, texts):
        vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
        idx = 4
        for text in texts:
            for ch in text:
                if ch not in vocab:
                    vocab[ch] = idx
                    idx += 1
        return vocab

    def encode(self, text, vocab):
        return [vocab.get(ch, vocab["<unk>"]) for ch in text]

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        inp, tgt = self.pairs[idx]
        inp_ids = [self.inp_vocab["<sos>"]] + self.encode(inp, self.inp_vocab) + [self.inp_vocab["<eos>"]]
        tgt_ids = [self.tgt_vocab["<sos>"]] + self.encode(tgt, self.tgt_vocab) + [self.tgt_vocab["<eos>"]]
        return torch.tensor(inp_ids), torch.tensor(tgt_ids)


def collate_fn(batch):
    inps, tgts = zip(*batch)
    max_inp = max(len(x) for x in inps)
    max_tgt = max(len(x) for x in tgts)
    inp_pad = torch.zeros(len(batch), max_inp, dtype=torch.long)
    tgt_pad = torch.zeros(len(batch), max_tgt, dtype=torch.long)
    for i, (inp, tgt) in enumerate(zip(inps, tgts)):
        inp_pad[i, :len(inp)] = inp
        tgt_pad[i, :len(tgt)] = tgt
    return inp_pad, tgt_pad


# Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, inp_vocab_size, tgt_vocab_size, d_model=256, nhead=4, num_layers=3, dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding_inp = nn.Embedding(inp_vocab_size, d_model)
        self.embedding_tgt = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = nn.Embedding(500, d_model)
        self.pos_decoder = nn.Embedding(500, d_model)

        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead, num_encoder_layers=num_layers,
            num_decoder_layers=num_layers, dim_feedforward=dim_feedforward,
            dropout=dropout, batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        src_pos = torch.arange(0, src.size(1), device=src.device).unsqueeze(0)
        tgt_pos = torch.arange(0, tgt.size(1), device=src.device).unsqueeze(0)

        src_emb = self.embedding_inp(src) * math.sqrt(self.d_model) + self.pos_encoder(src_pos)
        tgt_emb = self.embedding_tgt(tgt) * math.sqrt(self.d_model) + self.pos_decoder(tgt_pos)

        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device)

        output = self.transformer(src_emb, tgt_emb, tgt_mask=tgt_mask)
        return self.fc_out(output)


# Test Evaluation
def evaluate_test(model, loader, inp_vocab, tgt_vocab, device, save_csv=True):
    model.eval()
    total, correct = 0, 0
    inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
    inv_inp_vocab = {v: k for k, v in inp_vocab.items()}

    samples = []
    preds_list = []

    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_inp = tgt[:, :-1]
            tgt_out = tgt[:, 1:]
            output = model(src, tgt_inp)
            pred_tokens = output.argmax(-1)

            total += tgt_out.numel()
            correct += (pred_tokens == tgt_out).sum().item()

            for i in range(src.size(0)):
                inp_text = "".join(inv_inp_vocab.get(x.item(), "") for x in src[i] if x.item() > 3)
                pred_text = "".join(inv_tgt_vocab.get(x.item(), "") for x in pred_tokens[i] if x.item() > 3)
                truth_text = "".join(inv_tgt_vocab.get(x.item(), "") for x in tgt[i] if x.item() > 3)
                preds_list.append([inp_text, pred_text, truth_text])
                if len(samples) < 10:
                    samples.append((inp_text, pred_text, truth_text))

    acc = correct / total
    print(f"\nTest Accuracy: {acc*100:.2f}%")
    for inp, pred, truth in samples:
        print(f"{inp:15} | Pred: {pred:20} | Truth: {truth}")

    if save_csv:
        df = pd.DataFrame(preds_list, columns=["Input", "Prediction", "Truth"])
        df.to_csv("test_predictions.csv", index=False)
        print("\nPredictions saved to: test_predictions.csv")

    return acc


# Run Test
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_file = "/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    test_file = "/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"

    # Load vocab from train set
    train_dataset = DakshinaDataset(train_file, build_vocab=True)
    test_dataset = DakshinaDataset(test_file, inp_vocab=train_dataset.inp_vocab, tgt_vocab=train_dataset.tgt_vocab)

    test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=collate_fn)

    # Load model
    model = TransformerModel(len(train_dataset.inp_vocab), len(train_dataset.tgt_vocab)).to(device)
    model.load_state_dict(torch.load("best_transformer.pt", map_location=device))
    print("\nLoaded best model for final evaluation...")

    # Evaluate
    acc = evaluate_test(model, test_loader, train_dataset.inp_vocab, train_dataset.tgt_vocab, device)


if __name__ == "__main__":
    main()


Loaded best model for final evaluation...

Test Accuracy: 53.34%
अंक             | Pred: ank                  | Truth: ank
अंक             | Pred: ank                  | Truth: anka
अंकित           | Pred: ankit                | Truth: ankit
अंकों           | Pred: ankkon               | Truth: anakon
अंकों           | Pred: ankoonoo             | Truth: ankhon
अंकों           | Pred: ankonoo              | Truth: ankon
अंकोर           | Pred: ankoor               | Truth: angkor
अंकोर           | Pred: ankor                | Truth: ankor
अंगारक          | Pred: angarrak             | Truth: angaarak
अंगारक          | Pred: angarak              | Truth: angarak

Predictions saved to: test_predictions.csv


#### 10-Epoch

In [10]:
# Dataset + Collate Fn
class DakshinaDataset(torch.utils.data.Dataset):
    def __init__(self, file_path, inp_vocab=None, tgt_vocab=None, build_vocab=False):
        self.pairs = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) >= 2:
                    self.pairs.append((parts[0], parts[1]))

        if build_vocab:
            self.inp_vocab = self.build_vocab([p[0] for p in self.pairs])
            self.tgt_vocab = self.build_vocab([p[1] for p in self.pairs])
        else:
            self.inp_vocab = inp_vocab
            self.tgt_vocab = tgt_vocab

    def build_vocab(self, texts):
        vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
        idx = 4
        for text in texts:
            for ch in text:
                if ch not in vocab:
                    vocab[ch] = idx
                    idx += 1
        return vocab

    def encode(self, text, vocab):
        return [vocab.get(ch, vocab["<unk>"]) for ch in text]

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        inp, tgt = self.pairs[idx]
        inp_ids = [self.inp_vocab["<sos>"]] + self.encode(inp, self.inp_vocab) + [self.inp_vocab["<eos>"]]
        tgt_ids = [self.tgt_vocab["<sos>"]] + self.encode(tgt, self.tgt_vocab) + [self.tgt_vocab["<eos>"]]
        return torch.tensor(inp_ids), torch.tensor(tgt_ids)


def collate_fn(batch):
    inps, tgts = zip(*batch)
    max_inp = max(len(x) for x in inps)
    max_tgt = max(len(x) for x in tgts)
    inp_pad = torch.zeros(len(batch), max_inp, dtype=torch.long)
    tgt_pad = torch.zeros(len(batch), max_tgt, dtype=torch.long)
    for i, (inp, tgt) in enumerate(zip(inps, tgts)):
        inp_pad[i, :len(inp)] = inp
        tgt_pad[i, :len(tgt)] = tgt
    return inp_pad, tgt_pad


# Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, inp_vocab_size, tgt_vocab_size, d_model=256, nhead=4, num_layers=3, dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding_inp = nn.Embedding(inp_vocab_size, d_model)
        self.embedding_tgt = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = nn.Embedding(500, d_model)
        self.pos_decoder = nn.Embedding(500, d_model)

        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead, num_encoder_layers=num_layers,
            num_decoder_layers=num_layers, dim_feedforward=dim_feedforward,
            dropout=dropout, batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        src_pos = torch.arange(0, src.size(1), device=src.device).unsqueeze(0)
        tgt_pos = torch.arange(0, tgt.size(1), device=src.device).unsqueeze(0)

        src_emb = self.embedding_inp(src) * math.sqrt(self.d_model) + self.pos_encoder(src_pos)
        tgt_emb = self.embedding_tgt(tgt) * math.sqrt(self.d_model) + self.pos_decoder(tgt_pos)

        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device)

        output = self.transformer(src_emb, tgt_emb, tgt_mask=tgt_mask)
        return self.fc_out(output)


# Evaluation
def evaluate_test(model, loader, inp_vocab, tgt_vocab, device, show_samples=True, save_csv=False):
    model.eval()
    total, correct = 0, 0
    inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
    inv_inp_vocab = {v: k for k, v in inp_vocab.items()}

    samples, preds_list = [], []

    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_inp = tgt[:, :-1]
            tgt_out = tgt[:, 1:]
            output = model(src, tgt_inp)
            pred_tokens = output.argmax(-1)

            total += tgt_out.numel()
            correct += (pred_tokens == tgt_out).sum().item()

            for i in range(src.size(0)):
                inp_text = "".join(inv_inp_vocab.get(x.item(), "") for x in src[i] if x.item() > 3)
                pred_text = "".join(inv_tgt_vocab.get(x.item(), "") for x in pred_tokens[i] if x.item() > 3)
                truth_text = "".join(inv_tgt_vocab.get(x.item(), "") for x in tgt[i] if x.item() > 3)
                preds_list.append([inp_text, pred_text, truth_text])
                if len(samples) < 10:
                    samples.append((inp_text, pred_text, truth_text))

    acc = correct / total
    if show_samples:
        print(f"\n Test Accuracy: {acc*100:.2f}%")
        for inp, pred, truth in samples:
            print(f"{inp:15} | Pred: {pred:20} | Truth: {truth}")

    if save_csv:
        df = pd.DataFrame(preds_list, columns=["Input", "Prediction", "Truth"])
        df.to_csv("test_predictions.csv", index=False)
        print("\n Predictions saved to: test_predictions.csv")

    return acc



# Training Loop
def train_model(model, train_loader, test_loader, inp_vocab, tgt_vocab, device, epochs=10):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_acc = 0.0
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for src, tgt in train_loader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_inp = tgt[:, :-1]
            tgt_out = tgt[:, 1:]

            optimizer.zero_grad()
            output = model(src, tgt_inp)
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_out.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"\nEpoch {epoch} Train Loss: {avg_loss:.4f}")

        acc = evaluate_test(model, test_loader, inp_vocab, tgt_vocab, device)

        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), "best_transformer.pt")

    print("\n Loading best model for final evaluation...")
    model.load_state_dict(torch.load("best_transformer.pt", map_location=device))
    final_acc = evaluate_test(model, test_loader, inp_vocab, tgt_vocab, device, save_csv=True)
    return final_acc



# Main
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_file = "/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    test_file = "/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"

    # Load datasets
    train_dataset = DakshinaDataset(train_file, build_vocab=True)
    test_dataset = DakshinaDataset(test_file, inp_vocab=train_dataset.inp_vocab, tgt_vocab=train_dataset.tgt_vocab)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=collate_fn)

    # Init model
    model = TransformerModel(len(train_dataset.inp_vocab), len(train_dataset.tgt_vocab)).to(device)

    # Train + Eval
    train_model(model, train_loader, test_loader, train_dataset.inp_vocab, train_dataset.tgt_vocab, device, epochs=10)


if __name__ == "__main__":
    main()


Epoch 1 Train Loss: 1.3305

 Test Accuracy: 43.33%
अंक             | Pred: anka                 | Truth: ank
अंक             | Pred: ankan                | Truth: anka
अंकित           | Pred: ankitaattttt         | Truth: ankit
अंकों           | Pred: akkkonoooonn         | Truth: anakon
अंकों           | Pred: akkoononnnnn         | Truth: ankhon
अंकों           | Pred: akkonooooonn         | Truth: ankon
अंकोर           | Pred: akkooroorrrrr        | Truth: angkor
अंकोर           | Pred: akkoroorrrrrr        | Truth: ankor
अंगारक          | Pred: angarrakaaaaa        | Truth: angaarak
अंगारक          | Pred: angarakaaaaaa        | Truth: angarak

Epoch 2 Train Loss: 0.7339

 Test Accuracy: 50.55%
अंक             | Pred: ank                  | Truth: ank
अंक             | Pred: ank                  | Truth: anka
अंकित           | Pred: ankit                | Truth: ankit
अंकों           | Pred: ankkonnn             | Truth: anakon
अंकों           | Pred: ankoon               | Truth: