In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import wandb
import pandas as pd
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, rnn_type='LSTM',
                 dropout=0.2, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.is_bidirectional = bidirectional
        self.rnn_type = rnn_type
        self.num_directions = 2 if bidirectional else 1
        self.hidden_dim = hidden_dim

        rnn_cls = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[rnn_type]
        self.rnn = rnn_cls(
            input_size=embed_dim,
            hidden_size=hidden_dim // self.num_directions,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
            bidirectional=bidirectional
        )

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, hidden = self.rnn(embedded)
        return hidden


class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, rnn_type='LSTM',
                 dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn_type = rnn_type

        rnn_cls = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[rnn_type]
        self.rnn = rnn_cls(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden_state):
        embedded = self.embedding(input_token.unsqueeze(1))  # (B, 1, E)
        rnn_output, hidden = self.rnn(embedded, hidden_state)
        logits = self.fc_out(rnn_output.squeeze(1))  # (B, V)
        return logits, hidden


class TransliterationModel(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, embed_dim, hidden_dim,
                 enc_layers, dec_layers, rnn_type='LSTM', dropout=0.2, bidirectional=False):
        super().__init__()
        self.encoder = Encoder(input_vocab_size, embed_dim, hidden_dim,
                                    enc_layers, rnn_type, dropout, bidirectional)
        self.decoder = Decoder(output_vocab_size, embed_dim, hidden_dim,
                                     dec_layers, rnn_type, dropout)
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size, tgt_len = tgt.shape
        vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(batch_size, tgt_len, vocab_size, device=src.device)

        enc_hidden = self.encoder(src)

        def merge_bidir_states(state):
            return torch.cat([state[::2], state[1::2]], dim=2)

        def pad_layers(state, target_layers):
            if state.size(0) == target_layers:
                return state
            pad = torch.zeros(target_layers - state.size(0), *state.shape[1:], device=state.device)
            return torch.cat([state, pad], dim=0)

        if self.rnn_type == 'LSTM':
            h, c = enc_hidden
            if self.bidirectional:
                h, c = merge_bidir_states(h), merge_bidir_states(c)
            h, c = pad_layers(h, self.dec_layers), pad_layers(c, self.dec_layers)
            dec_hidden = (h, c)
        else:
            h = enc_hidden
            if self.bidirectional:
                h = merge_bidir_states(h)
            h = pad_layers(h, self.dec_layers)
            dec_hidden = h

        dec_input = tgt[:, 0]  # Start token
        for t in range(1, tgt_len):
            output, dec_hidden = self.decoder(dec_input, dec_hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            teacher_force = random.random() < teacher_forcing_ratio
            dec_input = tgt[:, t] if teacher_force else top1

        return outputs

def read_pairs(file_path):
    with open(file_path, encoding='utf-8') as f:
        return [(line.split('\t')[1], line.split('\t')[0]) for line in f.read().strip().split('\n') if '\t' in line]

def build_vocab_and_prepare_batch(seqs, device):
    special_tokens = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    
    # Build character sets
    unique_chars_latin = sorted(set(ch for seq in seqs for ch in seq[0]))
    unique_chars_dev = sorted(set(ch for seq in seqs for ch in seq[1]))

    # Build vocabularies
    src_vocab = {ch: idx + len(special_tokens) for idx, ch in enumerate(unique_chars_latin)}
    tgt_vocab = {ch: idx + len(special_tokens) for idx, ch in enumerate(unique_chars_dev)}
    src_vocab.update(special_tokens)
    tgt_vocab.update(special_tokens)

    idx2src = {idx: ch for ch, idx in src_vocab.items()}
    idx2tgt = {idx: ch for ch, idx in tgt_vocab.items()}

    def encode_text(seq, vocab):
        return [vocab.get(ch, vocab['<unk>']) for ch in seq]

    def create_batch(pairs):
        src = [torch.tensor(encode_text(x, src_vocab) + [src_vocab['<eos>']]) for x, _ in pairs]
        tgt = [torch.tensor([tgt_vocab['<sos>']] + encode_text(y, tgt_vocab) + [tgt_vocab['<eos>']]) for _, y in pairs]
        src = pad_sequence(src, batch_first=True, padding_value=src_vocab['<pad>'])
        tgt = pad_sequence(tgt, batch_first=True, padding_value=tgt_vocab['<pad>'])
        return src.to(device), tgt.to(device)

    return src_vocab, idx2src, tgt_vocab, idx2tgt, create_batch, unique_chars_latin, unique_chars_dev

def compute_word_level_accuracy(preds, targets, vocab):
    sos, eos, pad = vocab['<sos>'], vocab['<eos>'], vocab['<pad>']
    preds = preds.tolist()
    targets = targets.tolist()
    correct = 0
    for p, t in zip(preds, targets):
        p = [x for x in p if x != pad and x != eos]
        t = [x for x in t if x != pad and x != eos]
        if p == t:
            correct += 1
    return correct / len(preds) * 100

def run_training():
    # Initialize wandb config
    wandb.init()
    cfg = wandb.config
    wandb.run.name = (
    f"es_{cfg.embedding_size}_hs_{cfg.hidden_size}_"
    f"enc_{cfg.enc_layers}_dec_{cfg.dec_layers}_"
    f"rnn_{cfg.rnn_type}_dropout_{cfg.dropout_rate}_"
    f"bidirectional_{cfg.is_bidirectional}_"
    f"lr_{cfg.learning_rate}_bs_{cfg.batch_size}_"
    f"epochs_{cfg.epochs}_tfp_{cfg.teacher_forcing_prob}_"
    f"beam_size_{cfg.beam_size}"
    )


    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load and prepare data
    train_path = "dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    dev_path = "dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
    train_set = read_pairs(train_path)
    dev_set = read_pairs(dev_path)

    src_vocab, idx2src, tgt_vocab, idx2tgt, create_batch, _, _ = build_vocab_and_prepare_batch(train_set, device)

    # Initialize model, optimizer, criterion
    model = TransliterationModel(
        len(src_vocab), len(tgt_vocab), cfg.embedding_size, cfg.hidden_size,
        cfg.enc_layers, cfg.dec_layers, cfg.rnn_type, cfg.dropout_rate,
        cfg.is_bidirectional
    ).to(device)
    optimizer = optim.Adam(model.parameters(), lr=cfg.learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab['<pad>'])

    # Training loop
    for epoch in range(cfg.epochs):
        model.train()
        total_loss, total_acc = 0, 0
        random.shuffle(train_set)

        for i in range(0, len(train_set), cfg.batch_size):
            batch = train_set[i:i+cfg.batch_size]
            src, tgt = create_batch(batch)

            optimizer.zero_grad()
            outputs = model(src, tgt, cfg.teacher_forcing_prob)

            loss = criterion(outputs[:, 1:].reshape(-1, outputs.size(-1)), tgt[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()

            preds = outputs.argmax(-1)
            acc = compute_word_level_accuracy(preds[:, 1:], tgt[:, 1:], tgt_vocab)

            total_loss += loss.item()
            total_acc += acc

        avg_train_loss = total_loss / (len(train_set) // cfg.batch_size)
        avg_train_acc = total_acc / (len(train_set) // cfg.batch_size)

        # Validation
        model.eval()
        dev_loss, dev_acc = 0, 0
        with torch.no_grad():
            for i in range(0, len(dev_set), cfg.batch_size):
                batch = dev_set[i:i+cfg.batch_size]
                src, tgt = create_batch(batch)
                outputs = model(src, tgt, 0)
                loss = criterion(outputs[:, 1:].reshape(-1, outputs.size(-1)), tgt[:, 1:].reshape(-1))

                preds = outputs.argmax(-1)
                acc = compute_word_level_accuracy(preds[:, 1:], tgt[:, 1:], tgt_vocab)

                dev_loss += loss.item()
                dev_acc += acc

        avg_dev_loss = dev_loss / (len(dev_set) // cfg.batch_size)
        avg_dev_acc = dev_acc / (len(dev_set) // cfg.batch_size)

        # Logging
        wandb.log({
            "Epoch": epoch + 1,
            "Train Loss": avg_train_loss,
            "Train Accuracy": avg_train_acc,
            "Validation Loss": avg_dev_loss,
            "Validation Accuracy": avg_dev_acc,
        })

        print(f"Epoch {epoch+1}/{cfg.epochs} | Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.2f}% | Val Loss: {avg_dev_loss:.4f}, Val Acc: {avg_dev_acc:.2f}%")

    wandb.finish()
    return model


In [None]:
# sweep_config = {
#     'method': 'bayes',
#     'metric': {'name': 'Validation Accuracy', 'goal': 'maximize'},
#     'parameters': {
#         'embedding_size': {'values': [128, 256]},
#         'hidden_size': {'values': [128, 256]},
#         'enc_layers': {'values': [2, 3]},
#         'dec_layers': {'values': [2, 3]},
#         'rnn_type': {'values': ['GRU', 'LSTM','RNN']},
#         'dropout_rate': {'values': [0.2, 0.3]},
#         'batch_size': {'values': [32, 64]},
#         'epochs': {
#             'values': [5, 10]},
#         'is_bidirectional': {'values': [False, True]},
#         'learning_rate': {'values': [0.001, 0.0001]},
#         'optimizer': {'values': ['adam', 'nadam']},
#         'teacher_forcing_prob': {'values': [0.2, 0.5, 0.7]},
#         'beam_size': {'values': [1,3,5]},
#     }
# }

# sweep_id = wandb.sweep(sweep_config, project="dakshina_transliteration")
# wandb.agent(sweep_id, function=run_training, count=50)


In [2]:
def model_eval(cfg):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    wandb.finish()
    wandb.init(
        project="transliteration_evaluation",
        name = 'best_model_test_eval',
        resume="never",
        reinit=True,
        config=cfg
    )
    # Load and prepare data
    model_path = "best_vanilla_model.pt"
    train_path = "dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    train_set = read_pairs(train_path)
    test_path = "dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"
    test_set = read_pairs(test_path)

    src_vocab, idx2src, tgt_vocab, idx2tgt, create_batch, _, _ = build_vocab_and_prepare_batch(train_set, device)

    # Initialize model, optimizer, criterion
    model = TransliterationModel(
        len(src_vocab), len(tgt_vocab), cfg['embedding_size'], cfg['hidden_size'],
        cfg['enc_layers'], cfg['dec_layers'], cfg['rnn_type'], cfg['dropout_rate'],
        cfg['is_bidirectional']
    ).to(device)
    if not os.path.exists(model_path):
        print("❌ No saved model found, starting training.")
        optimizer = optim.Adam(model.parameters(), lr=cfg['learning_rate'])
        criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab['<pad>'])
        best_acc = 0.0
        # Training loop
        for epoch in range(cfg['epochs']):
            model.train()
            total_loss, total_acc = 0, 0
            random.shuffle(train_set)

            for i in range(0, len(train_set), cfg['batch_size']):
                batch = train_set[i:i+cfg['batch_size']]
                src, tgt = create_batch(batch)

                optimizer.zero_grad()
                outputs = model(src, tgt, cfg['teacher_forcing_prob'])

                loss = criterion(outputs[:, 1:].reshape(-1, outputs.size(-1)), tgt[:, 1:].reshape(-1))
                loss.backward()
                optimizer.step()

                preds = outputs.argmax(-1)
                acc = compute_word_level_accuracy(preds[:, 1:], tgt[:, 1:], tgt_vocab)

                total_loss += loss.item()
                total_acc += acc

            avg_train_loss = total_loss / (len(train_set) // cfg['batch_size'])
            avg_train_acc = total_acc / (len(train_set) // cfg['batch_size'])

            print(f"Epoch {epoch+1}/{cfg['epochs']} | Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.2f}%")
            wandb.log({"Train Loss": avg_train_loss, "Train Accuracy": avg_train_acc})

            # Save the best model
            if avg_train_acc > best_acc:
                best_acc = avg_train_acc
                torch.save(model.state_dict(), model_path)
                print(f"💾 Saved new best model at epoch {epoch + 1} with accuracy {best_acc:.2f}%")
        print(f"Best model saved with accuracy: {best_acc:.2f}%")

    #test the model
    if os.path.exists(model_path):
        model.load_state_dict(torch.load(model_path))
        print("✅ Loaded saved model from disk.")
    model.eval()
    predictions = []

    with torch.no_grad():
        for i in range(0, len(test_set), cfg['batch_size']):
            batch = test_set[i:i + cfg['batch_size']]
            src, tgt = create_batch(batch)
            outputs = model(src, tgt, 0)
            preds = outputs.argmax(-1)

            for j in range(src.size(0)):
                input_seq = ''.join([idx2src.get(idx.item(), '') for idx in src[j] if idx.item() not in [src_vocab['<pad>'], src_vocab['<eos>']]])
                target_seq = ''.join([idx2tgt.get(idx.item(), '') for idx in tgt[j][1:] if idx.item() not in [tgt_vocab['<pad>'], tgt_vocab['<eos>']]])
                pred_seq = ''.join([idx2tgt.get(idx.item(), '') for idx in preds[j][1:] if idx.item() not in [tgt_vocab['<pad>'], tgt_vocab['<eos>']]])
                is_correct = target_seq == pred_seq
                predictions.append({'Input': input_seq, 'Target': target_seq, 'Predicted': pred_seq , 'Is_Correct': "True✅" if is_correct else "False❌"})
    predictions = pd.DataFrame(predictions)
    overall_acc = (predictions.Is_Correct == "True✅").mean()
    wandb.log({"Test Accuracy": overall_acc})
    table = wandb.Table(dataframe=predictions)
    wandb.log({"predictions_table_vanilla": table})
    # finish run
    wandb.finish()
    predictions.to_csv('predictions_vanilla.csv', index=False)
    print(f"Saved {len(predictions)} rows, eval accuracy = {overall_acc:.2f}")


In [None]:
parameters = {
        'embedding_size':256,
        'hidden_size': 256,
        'enc_layers': 3,
        'dec_layers': 3,
        'rnn_type': 'GRU',
        'dropout_rate': 0.3,
        'batch_size': 64,
        'epochs':10,
        'is_bidirectional':False,
        'learning_rate': 0.001,
        'optimizer': 'nadam',
        'teacher_forcing_prob':0.7,
        'beam_size': 5,
    }
# model_eval(parameters)

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: harshtrivs (harshtrivs-indian-institute-of-technology-madras) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


❌ No saved model found, starting training.
Epoch 1/10 | Train Loss: 1.6193, Train Acc: 7.72%
💾 Saved new best model at epoch 1 with accuracy 7.72%
Epoch 2/10 | Train Loss: 0.7582, Train Acc: 19.03%
💾 Saved new best model at epoch 2 with accuracy 19.03%
Epoch 3/10 | Train Loss: 0.6072, Train Acc: 25.61%
💾 Saved new best model at epoch 3 with accuracy 25.61%
Epoch 4/10 | Train Loss: 0.5297, Train Acc: 28.85%
💾 Saved new best model at epoch 4 with accuracy 28.85%
Epoch 5/10 | Train Loss: 0.4836, Train Acc: 31.31%
💾 Saved new best model at epoch 5 with accuracy 31.31%
Epoch 6/10 | Train Loss: 0.4363, Train Acc: 34.26%
💾 Saved new best model at epoch 6 with accuracy 34.26%
Epoch 7/10 | Train Loss: 0.4121, Train Acc: 34.50%
💾 Saved new best model at epoch 7 with accuracy 34.50%
Epoch 8/10 | Train Loss: 0.3921, Train Acc: 38.35%
💾 Saved new best model at epoch 8 with accuracy 38.35%
Epoch 9/10 | Train Loss: 0.3655, Train Acc: 38.86%
💾 Saved new best model at epoch 9 with accuracy 38.86%
Epoch

0,1
Test Accuracy,▁
Train Accuracy,▁▃▅▆▆▇▇███
Train Loss,█▃▂▂▂▁▁▁▁▁

0,1
Test Accuracy,0.37894
Train Accuracy,40.13608
Train Loss,0.34854


Saved 4502 rows, eval accuracy = 0.38
