# model.py

In [13]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from queue import PriorityQueue

class SeqEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim,
                 num_layers, rnn_type, dropout):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        RNN = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[rnn_type]
        self.rnn  = RNN(embed_dim, hidden_dim, num_layers,
                        batch_first=True,
                        dropout=dropout if num_layers>1 else 0)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # x: [B, S]
        e, hid = self.drop(self.embed(x)), None
        out, hid = self.rnn(e)
        return out, hid

class SeqDecoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim,
                 num_layers, rnn_type, dropout):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        RNN = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[rnn_type]
        self.rnn  = RNN(embed_dim, hidden_dim, num_layers,
                        batch_first=True,
                        dropout=dropout if num_layers>1 else 0)
        self.out  = nn.Linear(hidden_dim, vocab_size)
        self.drop = nn.Dropout(dropout)

    def forward(self, token, hid):
        # token: [B]
        t = token.unsqueeze(1)               # [B,1]
        e = self.drop(self.embed(t))         # [B,1,E]
        o, h = self.rnn(e, hid)              # o:[B,1,H]
        return self.out(o.squeeze(1)), h     # logits, new hidden

class Seq2SeqModel(nn.Module):
    def __init__(self, encoder, decoder, pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx  = pad_idx
        self.device   = device

    def _align_hidden(self, enc_hidden):
        """
        Align encoder hidden (and cell for LSTM) to decoder.num_layers.
        """
        dec_layers = self.decoder.rnn.num_layers

        # LSTM: enc_hidden is (h, c)
        if isinstance(enc_hidden, tuple):
            h, c = enc_hidden
            enc_layers, B, H = h.size()
            h0 = torch.zeros(dec_layers, B, H, device=self.device)
            c0 = torch.zeros(dec_layers, B, H, device=self.device)
            n = min(enc_layers, dec_layers)
            h0[-n:] = h[-n:]
            c0[-n:] = c[-n:]
            return (h0, c0)

        # RNN/GRU: enc_hidden is tensor
        else:
            h = enc_hidden
            enc_layers, B, H = h.size()
            h0 = torch.zeros(dec_layers, B, H, device=self.device)
            n = min(enc_layers, dec_layers)
            h0[-n:] = h[-n:]
            return h0

    def forward(self, src, tgt, teacher_prob=0.5):
        """
        src: [B, S_src], tgt: [B, S_tgt]
        returns: [B, S_tgt, V]
        """
        B, T = tgt.size()
        V    = self.decoder.out.out_features
        out  = torch.zeros(B, T, V, device=self.device)

        # 1) encode
        _, enc_hid = self.encoder(src)
        # 2) align hidden to decoder layers
        dec_hid = self._align_hidden(enc_hid)
        # 3) decode step by step
        token = tgt[:,0]  # <sos>
        for t in range(1, T):
            logits, dec_hid = self.decoder(token, dec_hid)
            out[:,t] = logits
            token = tgt[:,t] if random.random() < teacher_prob else logits.argmax(1)
        return out

    def beam_search(self, src, sos_idx, eos_idx, beam_k=3, max_len=32):
        """
        Greedy beam search for a single sequence src: [S_src]
        returns: list of token indices
        """
        self.eval()
        with torch.no_grad():
            _, enc_hid = self.encoder(src.unsqueeze(0))
            dec_hid    = self._align_hidden(enc_hid)

            class Node:
                def __init__(self, hid, prev, tok, logp, length):
                    self.hid, self.prev, self.tok = hid, prev, tok
                    self.logp, self.len = logp, length
                def score(self): return self.logp / self.len

            root = Node(dec_hid, None,
                        torch.tensor([sos_idx], device=self.device),
                        0.0, 1)

            pq, completed = PriorityQueue(), []
            pq.put((-root.score(), root))

            while not pq.empty():
                _, node = pq.get()
                if node.tok.item() == eos_idx and node.prev:
                    completed.append((node.score(), node))
                    if len(completed) >= beam_k: break

                logits, new_hid = self.decoder(node.tok, node.hid)
                logps  = F.log_softmax(logits, dim=1).squeeze(0)
                topv, topi = logps.topk(beam_k)
                for i in range(beam_k):
                    tok_i    = topi[i].unsqueeze(0)
                    new_node = Node(new_hid, node, tok_i,
                                    node.logp + topv[i].item(),
                                    node.len+1)
                    pq.put((-new_node.score(), new_node))

            best = sorted(completed, key=lambda x: x[0], reverse=True)[0][1]
            seq  = []
            while best.prev:
                seq.append(best.tok.item())
                best = best.prev
            return seq[::-1]


In [4]:
import wandb
wandb.login(key="f0880f1a8675dc5a9ff218689c5340669690b6e0")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33med24s401[0m ([33med24s401-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### train.py

In [14]:

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import wandb
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

DATA_FOLDER = "/kaggle/input/translit"
LANG        = "hi"
device      = torch.device("cuda" if torch.cuda.is_available() else "cpu")

SWEEP_CONFIG = {
    "method": "bayes",
    "metric": {"name": "val_acc", "goal": "maximize"},
    "parameters": {
        "embed_dim":      {"values": [16, 32, 64, 256]},
        "hidden_dim":     {"values": [16, 32, 64, 256]},
        "encoder_layers": {"values": [1, 2, 3]},
        "decoder_layers": {"values": [1, 2, 3]},
        "rnn_type":       {"values": ["RNN", "GRU", "LSTM"]},
        "dropout":        {"values": [0.2, 0.3]},
        "beam_size":      {"values": [1, 3, 5]},
        "learning_rate":  {"values": [1e-2, 1e-3, 5e-4]},
        "batch_size":     {"values": [32, 64]},
        "epochs":         {"value": 10},
        "max_len":        {"value": 32},
        "language":       {"value": LANG},
    }
}


# ────────────────────────────────────────────────────────────────────────────────
# 4) HELPERS
# ────────────────────────────────────────────────────────────────────────────────

def compute_accuracy(logits, targets, pad_idx):
    preds = logits.argmax(-1)
    mask  = (targets != pad_idx)
    return ((preds == targets) & mask).sum().float() / mask.sum().float()

def collate_batch(batch, pad_src, pad_tgt):
    srcs, tgts = zip(*batch)
    srcs_p = pad_sequence(srcs, batch_first=True, padding_value=pad_src)
    tgts_p = pad_sequence(tgts, batch_first=True, padding_value=pad_tgt)
    return srcs_p, tgts_p


# ────────────────────────────────────────────────────────────────────────────────
# 5) TRAIN & VALID LOOPS
# ────────────────────────────────────────────────────────────────────────────────

def train_epoch(model, loader, optimizer, loss_fn, pad_idx):
    model.train()
    total_loss, total_acc = 0.0, 0.0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        out = model(src, tgt, teacher_prob=0.5)
        B, T, V = out.shape
        loss = loss_fn(
            out[:,1:,:].reshape(-1, V),
            tgt[:,1:].reshape(-1)
        )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_acc  += compute_accuracy(out[:,1:,:], tgt[:,1:], pad_idx).item()
    return total_loss/len(loader), total_acc/len(loader)

@torch.no_grad()
def validate_epoch(model, loader, loss_fn, pad_idx):
    model.eval()
    total_loss, total_acc = 0.0, 0.0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        out = model(src, tgt, teacher_prob=0.0)
        B, T, V = out.shape
        total_loss += loss_fn(
            out[:,1:,:].reshape(-1, V),
            tgt[:,1:].reshape(-1)
        ).item()
        total_acc  += compute_accuracy(out[:,1:,:], tgt[:,1:], pad_idx).item()
    return total_loss/len(loader), total_acc/len(loader)


# ────────────────────────────────────────────────────────────────────────────────
# 6) CORE EXPERIMENT
# ────────────────────────────────────────────────────────────────────────────────

def run_experiment():
    wandb.init()
    cfg = wandb.config

    # build vocab from train split
    train_fp = os.path.join(DATA_FOLDER, f"{cfg.language}.translit.sampled.train.tsv")
    df = pd.read_csv(train_fp, sep="\t", names=["tgt","src","_"], usecols=[0,1]).dropna()
    src_map = create_mapping(df["src"].tolist())
    tgt_map = create_mapping(df["tgt"].tolist())

    # datasets & loaders
    ds_train = TranslitDataset(
        os.path.join(DATA_FOLDER, f"{cfg.language}.translit.sampled.train.tsv"),
        src_map, tgt_map, cfg.max_len
    )
    ds_dev  = TranslitDataset(
        os.path.join(DATA_FOLDER, f"{cfg.language}.translit.sampled.dev.tsv"),
        src_map, tgt_map, cfg.max_len
    )
    ds_test = TranslitDataset(
        os.path.join(DATA_FOLDER, f"{cfg.language}.translit.sampled.test.tsv"),
        src_map, tgt_map, cfg.max_len
    )

    pad_src = src_map["<pad>"]
    pad_tgt = tgt_map["<pad>"]
    train_loader = DataLoader(
        ds_train, batch_size=cfg.batch_size, shuffle=True,
        collate_fn=lambda b: collate_batch(b, pad_src, pad_tgt)
    )
    dev_loader  = DataLoader(
        ds_dev,  batch_size=cfg.batch_size, shuffle=False,
        collate_fn=lambda b: collate_batch(b, pad_src, pad_tgt)
    )
    test_loader = DataLoader(
        ds_test, batch_size=cfg.batch_size, shuffle=False,
        collate_fn=lambda b: collate_batch(b, pad_src, pad_tgt)
    )

    # model / optimizer / loss
    enc = SeqEncoder(
        vocab_size=len(src_map),
        embed_dim=cfg.embed_dim,
        hidden_dim=cfg.hidden_dim,
        num_layers=cfg.encoder_layers,
        rnn_type=cfg.rnn_type,
        dropout=cfg.dropout
    )
    dec = SeqDecoder(
        vocab_size=len(tgt_map),
        embed_dim=cfg.embed_dim,
        hidden_dim=cfg.hidden_dim,
        num_layers=cfg.decoder_layers,
        rnn_type=cfg.rnn_type,
        dropout=cfg.dropout
    )
    model     = Seq2SeqModel(enc, dec, pad_src, device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=cfg.learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_src)

    # training loop
    for epoch in range(1, cfg.epochs + 1):
        tr_loss, tr_acc = train_epoch(model, train_loader, optimizer, criterion, pad_src)
        dv_loss, dv_acc = validate_epoch(model, dev_loader,     criterion, pad_src)
        wandb.log({
            "epoch":       epoch,
            "train_loss":  tr_loss, "train_acc":  tr_acc,
            "val_loss":    dv_loss, "val_acc":    dv_acc
        })

    # final test
    ts_loss, ts_acc = validate_epoch(model, test_loader, criterion, pad_src)
    wandb.log({"test_loss": ts_loss, "test_acc": ts_acc})

# ────────────────────────────────────────────────────────────────────────────────
# 7) LAUNCH SWEEP
# ────────────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    sweep_id = wandb.sweep(SWEEP_CONFIG, project="dakshina_translit_assignment3")
    wandb.agent(sweep_id, function=run_experiment)

Create sweep with ID: 9pjlvkha
Sweep URL: https://wandb.ai/ed24s401-indian-institute-of-technology-madras/dakshina_translit_assignment3/sweeps/9pjlvkha


[34m[1mwandb[0m: Agent Starting Run: ia9khor6 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	max_len: 32
[34m[1mwandb[0m: 	rnn_type: RNN


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_acc,▁
test_loss,▁
train_acc,▁▂▃▃▄▆▆▇██
train_loss,█▇▆▅▅▃▂▂▁▁
val_acc,▁▁▂▂▄▆▇▇▇█
val_loss,███▆▅▃▃▂▂▁

0,1
epoch,10.0
test_acc,0.34347
test_loss,2.45404
train_acc,0.37952
train_loss,2.2375
val_acc,0.33723
val_loss,2.4726


[34m[1mwandb[0m: Agent Starting Run: 5wrfoonl with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_len: 32
[34m[1mwandb[0m: 	rnn_type: GRU


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_acc,▁
test_loss,▁
train_acc,▁▆▇▇██████
train_loss,█▂▂▂▁▁▁▁▁▁
val_acc,▁▆▇▆▆█▆▆██
val_loss,█▂▂▂▃▁▃▃▂▃

0,1
epoch,10.0
test_acc,0.52615
test_loss,1.59153
train_acc,0.55302
train_loss,1.44179
val_acc,0.51681
val_loss,1.64251


[34m[1mwandb[0m: Agent Starting Run: odt75z6a with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_dim: 16
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_len: 32
[34m[1mwandb[0m: 	rnn_type: RNN


0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_acc,▁
test_loss,▁
train_acc,▁▅▇▆▇▇▇▇██
train_loss,█▃▂▃▂▂▂▂▁▂
val_acc,███▆▁████▇
val_loss,▂▄▂▇█▁▁▃▂▃

0,1
epoch,10.0
test_acc,0.20003
test_loss,3.16905
train_acc,0.21963
train_loss,2.96866
val_acc,0.19251
val_loss,3.19301


[34m[1mwandb[0m: Agent Starting Run: 8eexzyf1 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_len: 32
[34m[1mwandb[0m: 	rnn_type: RNN


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


# FINAL


In [None]:
import wandb
wandb.login(key="f0880f1a8675dc5a9ff218689c5340669690b6e0")

In [3]:
# train_dakshina.py

import os, random
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import wandb
from queue import PriorityQueue

# ────────────────────────────────────────────────────────────────────────────────
# 1) CONFIGURE YOUR DATA LOCATION & LANGUAGE
# ────────────────────────────────────────────────────────────────────────────────

# Change these if needed:
DATA_DIR = "/kaggle/input/translit"       
LANG     = "hi"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ────────────────────────────────────────────────────────────────────────────────
# 2) CHAR VOCAB & DATASET
# ────────────────────────────────────────────────────────────────────────────────

def build_char_vocab(texts, specials=['<pad>','<sos>','<eos>']):
    """Map every character in `texts` plus specials to unique ints."""
    chars = set("".join(texts))
    m = {tok:i for i, tok in enumerate(specials)}
    for ch in sorted(chars):
        if ch not in m:
            m[ch] = len(m)
    return m

class TranslitDataset(Dataset):
    """
    Expects a TSV with columns [tgt, src, _]. Drops nulls.
    Returns (src_ids, tgt_ids) where
      src: [char...] + <eos>
      tgt: <sos> + [char...] + <eos>
    """
    def __init__(self, path, src_map, tgt_map, max_len=32):
        df = pd.read_csv(path, sep="\t",
                         names=["tgt","src","_"], usecols=[0,1])\
               .dropna().astype(str)
        self.pairs   = list(zip(df["src"], df["tgt"]))
        self.src_map = src_map
        self.tgt_map = tgt_map
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, i):
        src, tgt = self.pairs[i]
        src_ids = [ self.src_map.get(c, self.src_map["<pad>"])
                   for c in src ][:self.max_len] \
                 + [self.src_map["<eos>"]]
        tgt_ids = [self.tgt_map["<sos>"]] \
                 + [ self.tgt_map.get(c, self.tgt_map["<pad>"])
                     for c in tgt ][:self.max_len] \
                 + [self.tgt_map["<eos>"]]
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

def collate_batch(batch, pad_src, pad_tgt):
    srcs, tgts = zip(*batch)
    srcs_p = pad_sequence(srcs, batch_first=True, padding_value=pad_src)
    tgts_p = pad_sequence(tgts, batch_first=True, padding_value=pad_tgt)
    return srcs_p, tgts_p

# ────────────────────────────────────────────────────────────────────────────────
# 3) SEQ2SEQ MODEL + BEAM SEARCH
# ────────────────────────────────────────────────────────────────────────────────

class SeqEncoder(nn.Module):
    def __init__(self, vocab_size, emb_size, hid_size, n_layers, cell, dropout):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_size)
        RNN = {"RNN":nn.RNN, "GRU":nn.GRU, "LSTM":nn.LSTM}[cell]
        self.rnn  = RNN(emb_size, hid_size, n_layers,
                        batch_first=True,
                        dropout=dropout if n_layers>1 else 0)
        self.drop = nn.Dropout(dropout)
    def forward(self, x):
        e, h = self.drop(self.embed(x)), None
        out, h = self.rnn(e)
        return out, h

class SeqDecoder(nn.Module):
    def __init__(self, vocab_size, emb_size, hid_size, n_layers, cell, dropout):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_size)
        RNN = {"RNN":nn.RNN, "GRU":nn.GRU, "LSTM":nn.LSTM}[cell]
        self.rnn  = RNN(emb_size, hid_size, n_layers,
                        batch_first=True,
                        dropout=dropout if n_layers>1 else 0)
        self.fc   = nn.Linear(hid_size, vocab_size)
        self.drop = nn.Dropout(dropout)
    def forward(self, token, hid):
        # token: [B]
        t = token.unsqueeze(1)           # [B,1]
        e = self.drop(self.embed(t))     # [B,1,emb]
        o, h = self.rnn(e, hid)          # [B,1,hid]
        return self.fc(o.squeeze(1)), h  # logits [B,vocab], new hidden

class Seq2Seq(nn.Module):
    def __init__(self, enc, dec, pad_idx, device):
        super().__init__()
        self.enc     = enc
        self.dec     = dec
        self.pad_idx = pad_idx
        self.device  = device

    def _align_hidden(self, hidden):
        """Copy/truncate encoder hidden to match decoder layers."""
        dec_layers = self.dec.rnn.num_layers
        if isinstance(hidden, tuple):  # LSTM
            h, c = hidden
            enc_layers, B, H = h.size()
            h0 = torch.zeros(dec_layers,B,H,device=self.device)
            c0 = torch.zeros(dec_layers,B,H,device=self.device)
            n = min(enc_layers, dec_layers)
            h0[-n:], c0[-n:] = h[-n:], c[-n:]
            return (h0, c0)
        else:  # GRU/RNN
            h = hidden
            enc_layers, B, H = h.size()
            h0 = torch.zeros(dec_layers,B,H,device=self.device)
            n = min(enc_layers, dec_layers)
            h0[-n:] = h[-n:]
            return h0

    def forward(self, src, tgt, teacher_prob=0.5):
        B, T = tgt.size()
        V    = self.dec.fc.out_features
        logits = torch.zeros(B,T,V,device=self.device)

        _, enc_hid = self.enc(src)
        dec_hid    = self._align_hidden(enc_hid)
        token      = tgt[:,0]

        for t in range(1, T):
            step_logit, dec_hid = self.dec(token, dec_hid)
            logits[:,t,:]       = step_logit
            teacher_tok         = tgt[:,t]
            mask = torch.rand(B,device=self.device) < teacher_prob
            token = torch.where(mask, teacher_tok, step_logit.argmax(1))

        return logits

# beam search
class BeamNode:
    def __init__(self, hid, prev, token, logp, length):
        self.hid, self.prev, self.token = hid, prev, token
        self.logp, self.len = logp, length
    def score(self): return self.logp/self.len

def beam_decode(model, src_seq, sos, eos, beam_k, max_len):
    model.eval()
    with torch.no_grad():
        _, enc_hid = model.enc(src_seq.unsqueeze(0))
        dec_hid    = model._align_hidden(enc_hid)

        pq = PriorityQueue()
        counter = 0
        root = BeamNode(dec_hid,None,torch.tensor([sos],device=src_seq.device),0.0,1)
        pq.put((-root.score(),counter,root))

        completed = []
        while not pq.empty():
            _,_,node = pq.get()
            if node.token.item()==eos and node.prev is not None:
                completed.append((node.score(),node))
                if len(completed)>=beam_k: break

            logits, new_hid = model.dec(node.token,node.hid)
            logps = F.log_softmax(logits,dim=1).squeeze(0)
            topv,topi = logps.topk(beam_k)
            for i in range(beam_k):
                counter+=1
                tok_i = topi[i].unsqueeze(0)
                new_node = BeamNode(new_hid,node,tok_i,
                                    node.logp+topv[i].item(),
                                    node.len+1)
                pq.put((-new_node.score(),counter,new_node))

        best = sorted(completed,key=lambda x:x[0],reverse=True)[0][1]
        seq=[]
        while best.prev:
            seq.append(best.token.item())
            best=best.prev
        return seq[::-1]

def beam_accuracy(model, loader, sos, eos, pad, beam_k, max_len):
    model.eval()
    hits, total=0,0
    with torch.no_grad():
        for src,tgt in loader:
            src,tgt=src.to(model.device),tgt.to(model.device)
            B,T=tgt.size()
            for i in range(B):
                pred_seq = beam_decode(model,src[i],sos,eos,beam_k,max_len)
                for j,p in enumerate(pred_seq):
                    if j>=T: break
                    if tgt[i,j]==pad: continue
                    total+=1
                    if p==tgt[i,j].item(): hits+=1
    return hits/total

# ────────────────────────────────────────────────────────────────────────────────
# 4) SWEEP CONFIGURATION
# ────────────────────────────────────────────────────────────────────────────────

SWEEP_CONFIG = {
    "method":"bayes",
    "metric":{"name":"val_accuracy","goal":"maximize"},
    "parameters":{
        "emb_size":     {"values":[16,32,64,256]},
        "hid_size":     {"values":[16,32,64,256]},
        "enc_layers":   {"values":[1,2,3]},
        "dec_layers":   {"values":[1,2,3]},
        "cell_type":    {"values":["RNN","GRU","LSTM"]},
        "dropout":      {"values":[0.2,0.3]},
        "beam_size":    {"values":[1,3,5]},
        "learning_rate":{"values":[1e-2,1e-3,5e-4]},
        "batch_size":   {"values":[32,64]},
        "epochs":       {"value":10},
        "max_len":      {"value":32},
        "language":     {"value":LANG},
    }
}

# ────────────────────────────────────────────────────────────────────────────────
# 5) TRAIN & VALID HELPERS
# ────────────────────────────────────────────────────────────────────────────────

def compute_accuracy(logits, targets, pad_idx):
    preds = logits.argmax(-1)
    mask  = (targets!=pad_idx)
    return ((preds==targets)&mask).sum().float()/mask.sum().float()

def train_epoch(model,loader,opt,loss_fn,pad):
    model.train(); L,A=0,0
    for src,tgt in loader:
        src,tgt=src.to(DEVICE),tgt.to(DEVICE)
        opt.zero_grad()
        out=model(src,tgt,teacher_prob=0.5)
        B,T,V=out.shape
        loss=loss_fn(out[:,1:,:].reshape(-1,V),tgt[:,1:].reshape(-1))
        loss.backward(); opt.step()
        L+=loss.item(); A+=compute_accuracy(out[:,1:,:],tgt[:,1:],pad).item()
    return L/len(loader),A/len(loader)

@torch.no_grad()
def valid_epoch(model,loader,loss_fn,pad):
    model.eval(); L,A=0,0
    for src,tgt in loader:
        src,tgt=src.to(DEVICE),tgt.to(DEVICE)
        out=model(src,tgt,teacher_prob=0.0)
        B,T,V=out.shape
        L+=loss_fn(out[:,1:,:].reshape(-1,V),tgt[:,1:].reshape(-1)).item()
        A+=compute_accuracy(out[:,1:,:],tgt[:,1:],pad).item()
    return L/len(loader),A/len(loader)

# ────────────────────────────────────────────────────────────────────────────────
# 6) SWEEP RUN
# ────────────────────────────────────────────────────────────────────────────────

def run_sweep():
    wandb.init()
    cfg = wandb.config

    # build vocabs on train split
    train_fp = os.path.join(DATA_DIR,f"{cfg.language}.translit.sampled.train.tsv")
    df       = pd.read_csv(train_fp,sep="\t",
                           names=["tgt","src","_"],usecols=[0,1]).dropna()
    src_map  = build_char_vocab(df["src"].tolist())
    tgt_map  = build_char_vocab(df["tgt"].tolist())

    pad_src, pad_tgt = src_map["<pad>"], tgt_map["<pad>"]
    sos, eos         = src_map["<sos>"], tgt_map["<eos>"]

    # dataloaders
    train_ds = TranslitDataset(train_fp, src_map,tgt_map,cfg.max_len)
    dev_ds   = TranslitDataset(os.path.join(DATA_DIR,f"{cfg.language}.translit.sampled.dev.tsv"),
                               src_map,tgt_map,cfg.max_len)

    train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True,
                              collate_fn=lambda b: collate_batch(b,pad_src,pad_tgt))
    dev_loader   = DataLoader(dev_ds,   batch_size=cfg.batch_size, shuffle=False,
                              collate_fn=lambda b: collate_batch(b,pad_src,pad_tgt))

    # model + optim + loss
    enc   = SeqEncoder(len(src_map),cfg.emb_size,cfg.hid_size,
                       cfg.enc_layers,cfg.cell_type,cfg.dropout)
    dec   = SeqDecoder(len(tgt_map),cfg.emb_size,cfg.hid_size,
                       cfg.dec_layers,cfg.cell_type,cfg.dropout)
    model = Seq2Seq(enc,dec,pad_src,DEVICE).to(DEVICE)
    opt   = optim.Adam(model.parameters(),lr=cfg.learning_rate)
    crit  = nn.CrossEntropyLoss(ignore_index=pad_src)

    # training loop
    for epoch in range(1,cfg.epochs+1):
        tr_l,tr_a = train_epoch(model,train_loader,opt,crit,pad_src)
        dv_l,dv_a = valid_epoch(model,  dev_loader,  crit,pad_src)
        if cfg.beam_size>1:
            # override dev-accuracy with beam-search accuracy
            dv_a = beam_accuracy(model,dev_loader,sos,eos,pad_src,
                                  cfg.beam_size,cfg.max_len)
        wandb.log({
            "epoch": epoch,
            "train_loss": tr_l, "train_acc": tr_a,
            "val_loss":   dv_l, "val_accuracy": dv_a
        })

    # save best
    torch.save(model.state_dict(),"best_model.pt")
    wandb.save("best_model.pt")

if __name__=="__main__":
    sweep_id = wandb.sweep(SWEEP_CONFIG, project="dakshina_translit_a3")
    wandb.agent(sweep_id, function=run_sweep, count=50)


Create sweep with ID: 1pnxa203
Sweep URL: https://wandb.ai/ed24s401-indian-institute-of-technology-madras/dakshina_translit_a3/sweeps/1pnxa203


[34m[1mwandb[0m: Agent Starting Run: 7yoydgtw with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_size: 16
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hid_size: 64
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_len: 32


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▅▆▇▇▇███
train_loss,█▅▄▃▂▂▂▁▁▁
val_accuracy,▁▅▆▄██▄▇▆▇
val_loss,█▅▄▃▂▂▁▁▁▁

0,1
epoch,10.0
train_acc,0.64154
train_loss,1.20805
val_accuracy,0.17947
val_loss,1.36789


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 5riheo8m with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_size: 16
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hid_size: 256
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_len: 32


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,█▅▄▄▅▁▂▁▂▃
val_loss,█▃▂▂▁▁▁▁▁▂

0,1
epoch,10.0
train_acc,0.88607
train_loss,0.37513
val_accuracy,0.15242
val_loss,1.0935


[34m[1mwandb[0m: Agent Starting Run: rmfrhxt6 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_size: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hid_size: 32
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	max_len: 32


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▄▅▆▇▇▇██
train_loss,█▅▄▄▃▃▂▂▁▁
val_accuracy,▄▄▁▇▆▇▅▆█▇
val_loss,▇██▆▅▃▄▂▂▁

0,1
epoch,10.0
train_acc,0.25949
train_loss,2.76914
val_accuracy,0.14419
val_loss,2.9606


[34m[1mwandb[0m: Agent Starting Run: 6pjag4kb with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_size: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hid_size: 64
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_len: 32


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▆▆▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▄█▅▅▅▂▂▄▁▁
val_loss,█▅▃▃▂▂▁▁▁▁

0,1
epoch,10.0
train_acc,0.69646
train_loss,0.98344
val_accuracy,0.17879
val_loss,1.22099


[34m[1mwandb[0m: Agent Starting Run: k0ulia1l with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_size: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hid_size: 256
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_len: 32


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▆▇▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,█▆▁▂▄▃▁▂▁▂
val_loss,█▄▃▂▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.84912
train_loss,0.49927
val_accuracy,0.17025
val_loss,1.15613


[34m[1mwandb[0m: Agent Starting Run: jul866kj with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_size: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hid_size: 16
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_len: 32


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▄▅▆▆▇▇██
train_loss,█▅▅▄▃▃▂▂▁▁
val_accuracy,▂▁▂▃▃▃▆▇█▇
val_loss,█▆▅▅▄▃▂▂▂▁

0,1
epoch,10.0
train_acc,0.35224
train_loss,2.28422
val_accuracy,0.16953
val_loss,2.52873


[34m[1mwandb[0m: Agent Starting Run: ie36om32 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_size: 16
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hid_size: 256
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_len: 32


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,█▅▄▃▂▁▃▂▂▃
val_loss,█▃▂▁▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.87665
train_loss,0.40938
val_accuracy,0.14937
val_loss,1.03533


[34m[1mwandb[0m: Agent Starting Run: ixliuvc6 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_size: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hid_size: 32
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	max_len: 32


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▆▆▇█████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▃▃▃▁▆▇█▆▆▇
val_loss,█▇▇▄▃▃▁▂▂▁

0,1
epoch,10.0
train_acc,0.22915
train_loss,2.9226
val_accuracy,0.20258
val_loss,3.1173


[34m[1mwandb[0m: Agent Starting Run: rjzyyx8i with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_size: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hid_size: 64
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_len: 32


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇███▇▆▇
train_loss,█▃▂▂▁▁▁▁▁▁
val_accuracy,▁▄▁▅▅▁▁▁▁█
val_loss,▅█▃▂▅▂▆▇▁▁

0,1
epoch,10.0
train_acc,0.21969
train_loss,2.95425
val_accuracy,0.12257
val_loss,3.20963


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: n1r8cmy4 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_size: 16
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hid_size: 16
[34m[1mwandb[0m: 	language: hi
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_len: 32
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


### FINAL FINAL

In [4]:
import wandb
wandb.login(key="f0880f1a8675dc5a9ff218689c5340669690b6e0")

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33med24s401[0m ([33med24s401-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# train.py

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import wandb
from queue import PriorityQueue

# ---- 1. Data + Vocab ----
DATA_DIR   = '/kaggle/input/translit'
TRAIN_FILE = os.path.join(DATA_DIR, 'hi.translit.sampled.train.tsv')
DEV_FILE   = os.path.join(DATA_DIR, 'hi.translit.sampled.dev.tsv')

def build_vocab(seqs, specials=['<pad>','<sos>','<eos>']):
    chars = set(''.join(seqs))
    idx   = {tok:i for i,tok in enumerate(specials)}
    for c in sorted(chars):
        idx[c] = len(idx)
    return idx

# read train to build vocabs
_df = pd.read_csv(TRAIN_FILE, sep='\t', names=['dev','rom','_']).dropna()
SRC_VOCAB = build_vocab(_df['rom'])
TGT_VOCAB = build_vocab(_df['dev'])

class TransliterationDataset(Dataset):
    def __init__(self, path, src_vocab, tgt_vocab, max_len=32):
        df = pd.read_csv(path, sep='\t', names=['dev','rom','_']).dropna()
        self.pairs = df[['rom','dev']].values.tolist()
        self.src_vocab, self.tgt_vocab = src_vocab, tgt_vocab
        self.max_len = max_len

    def __len__(self): return len(self.pairs)

    def __getitem__(self, i):
        rom, dev = self.pairs[i]
        src_ids = [self.src_vocab[c] for c in rom][:self.max_len] + [self.src_vocab['<eos>']]
        tgt_ids = [self.tgt_vocab['<sos>']] + [self.tgt_vocab[c] for c in dev][:self.max_len] + [self.tgt_vocab['<eos>']]
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

def collate_fn(batch):
    srcs, tgts = zip(*batch)
    src_pad = pad_sequence(srcs, batch_first=True, padding_value=SRC_VOCAB['<pad>'])
    tgt_pad = pad_sequence(tgts, batch_first=True, padding_value=TGT_VOCAB['<pad>'])
    return src_pad, tgt_pad

# ---- 2. Model ----
class Encoder(nn.Module):
    def __init__(self, inp_dim, emb_dim, hid_dim, n_layers, cell, dropout):
        super().__init__()
        self.emb = nn.Embedding(inp_dim, emb_dim)
        RNN = {'RNN':nn.RNN,'LSTM':nn.LSTM,'GRU':nn.GRU}[cell]
        self.rnn = RNN(emb_dim, hid_dim, n_layers,
                       batch_first=True,
                       dropout=dropout if n_layers>1 else 0)
        self.drop = nn.Dropout(dropout)
    def forward(self, x):
        e = self.drop(self.emb(x))
        out, hidden = self.rnn(e)
        return out, hidden

class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim*2, hid_dim)
        self.v    = nn.Linear(hid_dim,1,bias=False)
    def forward(self, hidden, enc_out):
        # hidden: [B,H], enc_out:[B,S,H]
        B,S,H = enc_out.size()
        h_exp = hidden.unsqueeze(1).repeat(1,S,1)
        energy= torch.tanh(self.attn(torch.cat([h_exp, enc_out],dim=2)))
        scores= self.v(energy).squeeze(2)
        return F.softmax(scores,dim=1)

class Decoder(nn.Module):
    def __init__(self, out_dim, emb_dim, hid_dim, n_layers, cell, dropout, use_attn=False):
        super().__init__()
        RNN = {'RNN':nn.RNN,'LSTM':nn.LSTM,'GRU':nn.GRU}[cell]
        self.emb = nn.Embedding(out_dim, emb_dim)
        self.use_attn = use_attn
        if use_attn: self.attn = Attention(hid_dim)
        in_dim = emb_dim + (hid_dim if use_attn else 0)
        self.rnn = RNN(in_dim, hid_dim, n_layers, batch_first=True,
                       dropout=dropout if n_layers>1 else 0)
        self.fc  = nn.Linear(hid_dim, out_dim)
        self.drop= nn.Dropout(dropout)

    def forward(self, tok, hidden, enc_out=None):
        # tok: [B]
        emb = self.drop(self.emb(tok).unsqueeze(1))  # [B,1,E]
        if self.use_attn:
            h_last = hidden[0][-1] if isinstance(hidden,tuple) else hidden[-1]
            w = self.attn(h_last, enc_out)             # [B,S]
            ctx = torch.bmm(w.unsqueeze(1), enc_out)   # [B,1,H]
            r_in = torch.cat([emb,ctx],dim=2)
        else:
            r_in = emb
        out, h2 = self.rnn(r_in, hidden)
        pred = self.fc(out.squeeze(1))
        return pred, h2, (w if self.use_attn else None)

class Seq2Seq(nn.Module):
    def __init__(self, enc, dec, pad_idx, device):
        super().__init__()
        self.enc, self.dec = enc, dec
        self.pad_idx, self.device = pad_idx, device

    def _init_dec_hidden(self, h_enc):
        dec_layers = self.dec.rnn.num_layers
        if isinstance(h_enc, tuple):
            h,c = h_enc
            n,B,H = h.size()
            h0 = h.new_zeros(dec_layers,B,H)
            c0 = c.new_zeros(dec_layers,B,H)
            ncp = min(n,dec_layers)
            h0[-ncp:], c0[-ncp:] = h[-ncp:], c[-ncp:]
            return (h0,c0)
        else:
            h = h_enc
            n,B,H = h.size()
            h0 = h.new_zeros(dec_layers,B,H)
            ncp = min(n,dec_layers)
            h0[-ncp:] = h[-ncp:]
            return h0

    def forward(self, src, tgt, teacher_forcing=0.5):
        B,T = tgt.size()
        V   = self.dec.fc.out_features
        outputs = torch.zeros(B,T,V,device=self.device)
        enc_out, enc_h = self.enc(src)
        dec_h = self._init_dec_hidden(enc_h)
        inp   = tgt[:,0]
        for t in range(1,T):
            pred, dec_h, _ = self.dec(inp, dec_h, enc_out if self.dec.use_attn else None)
            outputs[:,t] = pred
            top1 = pred.argmax(1)
            if torch.rand(1).item() < teacher_forcing:
                inp = tgt[:,t]
            else:
                inp = top1
        return outputs


class BeamNode:
    def __init__(self, h, prev, tok, lp, length):
        self.h, self.prev, self.tok, self.lp, self.length = h, prev, tok, lp, length
    def score(self): return self.lp / self.length

def beam_decode(model, src, beam_k=3, max_len=32):
    model.eval()
    with torch.no_grad():
        enc_out, h0 = model.enc(src)
        start = torch.tensor([TGT_VOCAB['<sos>']],device=src.device)
        root  = BeamNode(h0,None,start,0.0,1)
        pq = PriorityQueue(); pq.put((-root.score(),root))
        finished=[]
        while not pq.empty():
            _,node = pq.get()
            if node.tok.item()==TGT_VOCAB['<eos>'] and node.prev:
                finished.append((node.score(),node))
                if len(finished)>=beam_k: break
            pred,h1,_ = model.dec(node.tok,node.h,enc_out if model.dec.use_attn else None)
            logps    = F.log_softmax(pred,1)
            topv,topi=logps.topk(beam_k)
            for i in range(beam_k):
                nt = topi[0][i].unsqueeze(0)
                nl = node.lp + topv[0][i].item()
                child = BeamNode(h1,node,nt,nl,node.length+1)
                pq.put((-child.score(),child))
        best = max(finished,key=lambda x:x[0])[1]
        seq=[]
        while best.prev:
            seq.append(best.tok.item())
            best=best.prev
        return seq[::-1]

# ---- 3. Training & Sweep ----
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
PAD_IDX = TGT_VOCAB['<pad>']

sweep_cfg = {
  'method':'bayes',
  'metric':{'name':'val_loss','goal':'minimize'},
  'parameters':{
    'emb_dim':   {'values':[16,32,64,256]},
    'hid_dim':   {'values':[16,32,64,256]},
    'enc_layers':{'values':[1,2,3]},
    'dec_layers':{'values':[1,2,3]},
    'cell':      {'values':['RNN','GRU','LSTM']},
    'dropout':   {'values':[0.2,0.3]},
    'beam_k':    {'values':[1,3,5]},
    'lr':        {'value':1e-3},
    'batch_size':{'value':128}
  }
}

def compute_acc(preds, tgts):
    with torch.no_grad():
        guess = preds.argmax(2)
        mask  = tgts != PAD_IDX
        corr  = (guess==tgts)&mask
        return corr.sum().float()/mask.sum().float()

def epoch_step(model, loader, opt, crit, train=True):
    model.train() if train else model.eval()
    tot_loss, tot_acc = 0.0, 0.0
    for src, tgt in loader:
        src,tgt = src.to(DEVICE), tgt.to(DEVICE)
        if train: opt.zero_grad()
        out = model(src,tgt, teacher_forcing=0.5 if train else 0.0)
        logits = out[:,1:].reshape(-1,out.size(-1))
        gold   = tgt[:,1:].reshape(-1)
        loss = crit(logits,gold)
        if train:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),1)
            opt.step()
        tot_loss += loss.item()
        tot_acc  += compute_acc(out[:,1:],tgt[:,1:])
    return tot_loss/len(loader), tot_acc/len(loader)

def sweep_run():
    wandb.init()
    cfg = wandb.config

    train_ds = TransliterationDataset(TRAIN_FILE, SRC_VOCAB, TGT_VOCAB)
    dev_ds   = TransliterationDataset(DEV_FILE,   SRC_VOCAB, TGT_VOCAB)
    train_ld = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, collate_fn=collate_fn)
    dev_ld   = DataLoader(dev_ds,   batch_size=cfg.batch_size,               collate_fn=collate_fn)

    enc = Encoder(len(SRC_VOCAB), cfg.emb_dim, cfg.hid_dim, cfg.enc_layers, cfg.cell, cfg.dropout)
    dec = Decoder(len(TGT_VOCAB), cfg.emb_dim, cfg.hid_dim, cfg.dec_layers, cfg.cell, cfg.dropout, use_attn=False)
    model = Seq2Seq(enc,dec,PAD_IDX,DEVICE).to(DEVICE)

    optimizer = optim.Adam(model.parameters(), lr=cfg.lr)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

    for epoch in range(1,11):
        tr_l, tr_a = epoch_step(model, train_ld, optimizer, criterion, train=True)
        vl_l, vl_a = epoch_step(model, dev_ld,   optimizer, criterion, train=False)
        wandb.log({
            'epoch':epoch,
            'train_loss':tr_l, 'train_acc':tr_a,
            'val_loss':vl_l,   'val_acc':vl_a
        })

    torch.save(model.state_dict(),'best_model.pt')
    wandb.save('best_model.pt')

if __name__=='__main__':
    sweep_id = wandb.sweep(sweep_cfg, project='assignment3_sweep')
    wandb.agent(sweep_id, function=sweep_run)


Create sweep with ID: mn8d5jqz
Sweep URL: https://wandb.ai/ed24s401-indian-institute-of-technology-madras/assignment3_sweep/sweeps/mn8d5jqz


[34m[1mwandb[0m: Agent Starting Run: bz9ds9ds with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 1
[34m[1mwandb[0m: 	cell: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hid_dim: 64
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▅▅▆▆▇▇▇█
train_loss,█▅▄▃▃▃▂▂▂▁
val_acc,▂▁▁▃▄▅▃▆▇█
val_loss,▇█▇▅▅▄▅▃▃▁

0,1
epoch,10.0
train_acc,0.26656
train_loss,2.74506
val_acc,0.23036
val_loss,2.96674


[34m[1mwandb[0m: Agent Starting Run: ifl0iwt4 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hid_dim: 32
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▆▆▇▇▇███
train_loss,█▃▃▂▂▂▁▁▁▁
val_acc,█▁▁▆▆█▆██▆
val_loss,▄█▅▃▃▃▄▁▂▄

0,1
epoch,10.0
train_acc,0.21821
train_loss,2.97161
val_acc,0.19203
val_loss,3.18007


[34m[1mwandb[0m: Agent Starting Run: mx1agz91 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 1
[34m[1mwandb[0m: 	cell: RNN
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▅▆▆▇▇█▇█
train_loss,█▅▄▃▃▂▂▂▁▁
val_acc,▆█▆█▆▁███▁
val_loss,▃▁▂▂▄█▁▂▃█

0,1
epoch,10.0
train_acc,0.22981
train_loss,2.88479
val_acc,0.14786
val_loss,3.3919


[34m[1mwandb[0m: Agent Starting Run: 3q9z5f2s with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 64
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▄▄▄▆▇▇██
train_loss,█▅▅▄▄▃▂▂▁▁
val_acc,▇▆▇▁▆█▇▇█▇
val_loss,▃▃▃█▃▁▂▂▁▁

0,1
epoch,10.0
train_acc,0.27337
train_loss,2.70876
val_acc,0.206
val_loss,3.03518


[34m[1mwandb[0m: Agent Starting Run: cpc5no6l with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hid_dim: 64
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▃▅▆▆▇▇██
train_loss,█▇▆▄▃▂▂▁▁▁
val_acc,▁▂▄▅▆▇▇███
val_loss,█▇▅▃▃▂▂▁▁▁

0,1
epoch,10.0
train_acc,0.64865
train_loss,1.15118
val_acc,0.60152
val_loss,1.33992


[34m[1mwandb[0m: Agent Starting Run: gpcydwts with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hid_dim: 64
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▃▅▆▆▇▇██
train_loss,█▇▅▄▃▂▂▁▁▁
val_acc,▁▂▄▅▆▇▇███
val_loss,█▇▅▃▂▂▂▁▁▁

0,1
epoch,10.0
train_acc,0.64256
train_loss,1.17448
val_acc,0.59421
val_loss,1.35456


[34m[1mwandb[0m: Agent Starting Run: bi154ljy with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 64
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▄▅▆▇▇▇██
train_loss,█▆▅▄▃▂▂▁▁▁
val_acc,▁▃▄▅▆▇▇███
val_loss,█▆▅▃▃▂▂▁▁▁

0,1
epoch,10.0
train_acc,0.63758
train_loss,1.18886
val_acc,0.59809
val_loss,1.33744


[34m[1mwandb[0m: Agent Starting Run: 3jhdofj1 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▅▆▇▇████
train_loss,█▆▃▂▂▂▁▁▁▁
val_acc,▁▅▇▇██████
val_loss,█▄▂▂▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.86337
train_loss,0.4644
val_acc,0.70995
val_loss,1.0596


[34m[1mwandb[0m: Agent Starting Run: bb823j24 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 64
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▄▅▆▇▇▇██
train_loss,█▆▅▄▃▂▂▁▁▁
val_acc,▁▃▄▆▆▇▇███
val_loss,█▆▄▃▂▂▁▁▁▁

0,1
epoch,10.0
train_acc,0.67446
train_loss,1.0778
val_acc,0.6261
val_loss,1.28499


[34m[1mwandb[0m: Agent Starting Run: 6cqu47iy with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 64
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▃▅▆▆▇▇██
train_loss,█▇▅▄▃▂▂▁▁▁
val_acc,▁▂▄▅▆▆▇▇██
val_loss,█▇▅▄▃▂▂▁▁▁

0,1
epoch,10.0
train_acc,0.6343
train_loss,1.22377
val_acc,0.58241
val_loss,1.42703


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: acjlj0o3 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▆▆▇▇▇███
train_loss,█▅▃▂▂▂▁▁▁▁
val_acc,▁▅▆▇▇█████
val_loss,█▃▂▁▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.87046
train_loss,0.43358
val_acc,0.72044
val_loss,1.04819


[34m[1mwandb[0m: Agent Starting Run: 8o30un88 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_acc,▁▅▆▇▇▇████
val_loss,█▃▂▂▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.87742
train_loss,0.41148
val_acc,0.72027
val_loss,1.07602


[34m[1mwandb[0m: Agent Starting Run: uullz6h2 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▆▆▇▇▇███
train_loss,█▅▃▂▂▂▁▁▁▁
val_acc,▁▅▆▇▇▇████
val_loss,█▃▂▂▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.83109
train_loss,0.56091
val_acc,0.69839
val_loss,1.0729


[34m[1mwandb[0m: Agent Starting Run: 9pzdtwcz with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▅▆▇▇▇███
train_loss,█▆▄▃▂▂▂▁▁▁
val_acc,▁▃▆▇▇█████
val_loss,█▅▃▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.8421
train_loss,0.52435
val_acc,0.71491
val_loss,1.0172


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: g0y44ysa with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▄▆▆▇▇███
train_loss,█▆▄▃▂▂▂▁▁▁
val_acc,▁▄▅▆▇▇████
val_loss,█▅▃▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.77335
train_loss,0.73836
val_acc,0.67697
val_loss,1.09036


[34m[1mwandb[0m: Agent Starting Run: dr07g071 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hid_dim: 64
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▃▄▅▆▆▇▇█
train_loss,█▇▆▅▄▃▂▂▁▁
val_acc,▁▂▃▄▄▆▆▇██
val_loss,█▇▆▅▄▃▂▂▁▁

0,1
epoch,10.0
train_acc,0.53711
train_loss,1.55742
val_acc,0.50164
val_loss,1.68782


[34m[1mwandb[0m: Agent Starting Run: 5oswjyrx with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▃▄▅▆▇▇██
train_loss,█▇▆▅▃▂▂▂▁▁
val_acc,▁▂▃▅▆▇▇███
val_loss,█▇▆▄▃▂▁▁▁▁

0,1
epoch,10.0
train_acc,0.79677
train_loss,0.67363
val_acc,0.68479
val_loss,1.11814


[34m[1mwandb[0m: Agent Starting Run: pabdo4i5 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▆▆▇▇▇███
train_loss,█▅▃▂▂▂▁▁▁▁
val_acc,▁▅▆▇██████
val_loss,█▃▂▂▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.84813
train_loss,0.5068
val_acc,0.70379
val_loss,1.08085


[34m[1mwandb[0m: Agent Starting Run: sdz0mook with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 1
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▆▆▇▇▇███
train_loss,█▅▃▂▂▂▁▁▁▁
val_acc,▁▅▆▇▇▇████
val_loss,█▃▂▂▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.83168
train_loss,0.55908
val_acc,0.70057
val_loss,1.05628


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3ymg6t2l with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▄▆▆▇▇███
train_loss,█▆▄▃▂▂▂▁▁▁
val_acc,▁▃▅▆▇▇████
val_loss,█▅▃▂▂▂▁▁▁▁

0,1
epoch,10.0
train_acc,0.76974
train_loss,0.75303
val_acc,0.67086
val_loss,1.12501


[34m[1mwandb[0m: Agent Starting Run: 3epwvjo7 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 1
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hid_dim: 64
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▃▄▅▆▇▇██
train_loss,█▆▅▄▃▂▂▂▁▁
val_acc,▁▂▃▄▅▆▇▇██
val_loss,█▇▅▄▃▂▂▁▁▁

0,1
epoch,10.0
train_acc,0.58819
train_loss,1.36989
val_acc,0.52892
val_loss,1.60203


[34m[1mwandb[0m: Agent Starting Run: 9n7bvcbe with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▅▆▇▇▇███
train_loss,█▆▃▃▂▂▁▁▁▁
val_acc,▁▄▆▇▇▇████
val_loss,█▄▃▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.82603
train_loss,0.57285
val_acc,0.71099
val_loss,1.03573


[34m[1mwandb[0m: Agent Starting Run: pf1do4fi with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▆▆▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁
val_acc,▁▅▆▇▇▇████
val_loss,█▄▃▂▂▂▁▁▁▁

0,1
epoch,10.0
train_acc,0.82887
train_loss,0.57103
val_acc,0.69482
val_loss,1.10762


[34m[1mwandb[0m: Agent Starting Run: yzffvv29 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▆▇▇▇▇███
train_loss,█▄▃▂▂▂▁▁▁▁
val_acc,▁▅▆▇▇▇████
val_loss,█▄▃▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.83658
train_loss,0.55017
val_acc,0.69225
val_loss,1.11989


[34m[1mwandb[0m: Agent Starting Run: 382l8y4z with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▅▆▇▇▇███
train_loss,█▆▄▃▂▂▁▁▁▁
val_acc,▁▄▆▇▇▇████
val_loss,█▅▂▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.82548
train_loss,0.57361
val_acc,0.70775
val_loss,1.03182


[34m[1mwandb[0m: Agent Starting Run: 5r9vg03d with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▄▆▆▇▇███
train_loss,█▆▄▃▂▂▂▁▁▁
val_acc,▁▄▅▆▇▇████
val_loss,█▅▃▂▂▂▁▁▁▁

0,1
epoch,10.0
train_acc,0.76968
train_loss,0.7516
val_acc,0.67049
val_loss,1.13328


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: pzh8wpw2 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▄▆▆▇▇███
train_loss,█▆▄▃▂▂▂▁▁▁
val_acc,▁▄▆▆▇▇████
val_loss,█▅▃▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.79769
train_loss,0.66408
val_acc,0.69711
val_loss,1.04117


[34m[1mwandb[0m: Agent Starting Run: ij9khtul with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▃▅▆▇▇▇██
train_loss,█▇▅▄▃▂▂▁▁▁
val_acc,▁▃▄▆▇▇████
val_loss,█▆▄▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.81476
train_loss,0.61275
val_acc,0.7068
val_loss,1.09401


[34m[1mwandb[0m: Agent Starting Run: 0ur2nnmq with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 1
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▆▇▇▇████
train_loss,█▅▃▂▂▂▁▁▁▁
val_acc,▁▅▆▇▇█████
val_loss,█▃▂▂▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.86123
train_loss,0.46228
val_acc,0.71973
val_loss,1.03706


[34m[1mwandb[0m: Agent Starting Run: e41lo9go with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 1
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▄▅▆▇▇███
train_loss,█▆▅▃▂▂▂▁▁▁
val_acc,▁▃▅▆▇▇████
val_loss,█▅▃▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.81379
train_loss,0.61449
val_acc,0.71247
val_loss,1.04207


[34m[1mwandb[0m: Agent Starting Run: uvpa80zo with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▅▆▇▇▇███
train_loss,█▆▄▃▂▂▁▁▁▁
val_acc,▁▃▅▇▇▇████
val_loss,█▅▃▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.83065
train_loss,0.5598
val_acc,0.71487
val_loss,1.03767


[34m[1mwandb[0m: Agent Starting Run: rtj6cui0 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 5
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▄▅▆▇▇███
train_loss,█▇▅▃▂▂▂▁▁▁
val_acc,▁▃▅▆▇▇████
val_loss,█▆▄▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.80747
train_loss,0.63125
val_acc,0.69418
val_loss,1.06996


[34m[1mwandb[0m: Agent Starting Run: 2qjrab42 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 16
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▃▅▆▇▇███
train_loss,█▇▆▄▃▂▂▁▁▁
val_acc,▁▂▄▆▇▇████
val_loss,█▇▅▃▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.81063
train_loss,0.61949
val_acc,0.70251
val_loss,1.03927


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: mbgqhb3w with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▅▆▇▇▇███
train_loss,█▆▄▃▂▂▁▁▁▁
val_acc,▁▄▆▇▇▇████
val_loss,█▅▃▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.84842
train_loss,0.50289
val_acc,0.71772
val_loss,1.04972


[34m[1mwandb[0m: Agent Starting Run: ijebv8r3 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 1
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▅▆▇▇▇███
train_loss,█▆▃▂▂▂▁▁▁▁
val_acc,▁▄▆▇▇▇▇███
val_loss,█▄▂▂▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.81167
train_loss,0.61674
val_acc,0.69803
val_loss,1.06147


[34m[1mwandb[0m: Agent Starting Run: wqfoi28j with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 3
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▆▆▇▇▇███
train_loss,█▅▃▂▂▂▁▁▁▁
val_acc,▁▅▇▇▇█████
val_loss,█▃▂▁▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.86759
train_loss,0.44329
val_acc,0.72207
val_loss,1.05899


[34m[1mwandb[0m: Agent Starting Run: 9j11461w with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_k: 1
[34m[1mwandb[0m: 	cell: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hid_dim: 256
[34m[1mwandb[0m: 	lr: 0.001
