In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
import wandb

In [4]:
wandb.login(key='13b86763ab8ddf529c91c7dce385c6cb04b5253e')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mma23m015[0m ([33miitm-ma23m015[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
train_path = "/kaggle/input/tamil-translit/ta.translit.sampled.train.tsv"
dev_path = "/kaggle/input/tamil-translit/ta.translit.sampled.dev.tsv"
test_path = "/kaggle/input/tamil-translit/ta.translit.sampled.test.tsv"

train_df = pd.read_csv(train_path, sep="\t", header=None, names=["target","source","freq"])
dev_df = pd.read_csv(dev_path, sep="\t", header=None, names=["target","source","freq"])
test_df = pd.read_csv(test_path, sep="\t", header=None, names=["target","source","freq"])

train_df = train_df.dropna(subset=['source','target'])
dev_df = dev_df.dropna(subset=['source','target'])
test_df = test_df.dropna(subset=['source','target'])

train_pairs = [(str(s), str(t)) for s,t in zip(train_df.source, train_df.target)]
dev_pairs = [(str(s), str(t)) for s,t in zip(dev_df.source, dev_df.target)]
test_pairs = [(str(s), str(t)) for s,t in zip(test_df.source, test_df.target)]

In [6]:

class CharVocab:
    def __init__(self, sequences):
        self.char2idx = {'<pad>':0, '<sos>':1, '<eos>':2, '<unk>':3}
        self.idx2char = ['<pad>', '<sos>', '<eos>', '<unk>']
        chars = set(''.join(sequences))
        for ch in sorted(chars):
            self.char2idx[ch] = len(self.idx2char)
            self.idx2char.append(ch)
    def encode(self, text):
        return [self.char2idx.get(c, self.char2idx['<unk>']) for c in text]
    def decode(self, indices):
        result = []
        for idx in indices:
            if idx == self.char2idx['<eos>']:
                break
            if idx not in (self.char2idx['<pad>'], self.char2idx['<sos>']):
                result.append(self.idx2char[idx])
        return ''.join(result)
    def __len__(self):
        return len(self.idx2char)

# Create vocabularies
src_vocab = CharVocab([s for s,_ in train_pairs])
tgt_vocab = CharVocab([t for _,t in train_pairs])


class TransliterationDataset(Dataset):
    def __init__(self, pairs, src_vocab, tgt_vocab):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        src_enc = torch.tensor(self.src_vocab.encode(src), dtype=torch.long)
        tgt_enc = torch.tensor([self.tgt_vocab.char2idx['<sos>']] + self.tgt_vocab.encode(tgt) + [self.tgt_vocab.char2idx['<eos>']], dtype=torch.long)
        return src_enc, tgt_enc

def collate_fn(batch):
    src_seqs, tgt_seqs = zip(*batch)
    src_padded = pad_sequence(src_seqs, batch_first=True, padding_value=src_vocab.char2idx['<pad>'])
    tgt_padded = pad_sequence(tgt_seqs, batch_first=True, padding_value=tgt_vocab.char2idx['<pad>'])
    return src_padded, tgt_padded

# Create datasets
train_ds = TransliterationDataset(train_pairs, src_vocab, tgt_vocab)
dev_ds = TransliterationDataset(dev_pairs, src_vocab, tgt_vocab)
test_ds = TransliterationDataset(test_pairs, src_vocab, tgt_vocab)

In [7]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, cell_type, pad_idx, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type.upper()
        self.n_layers = n_layers
        self.hid_dim = hid_dim
        self.bidirectional = bidirectional
        self.n_directions = 2 if bidirectional else 1
        
        rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[self.cell_type]
        self.rnn = rnn_cls(
            emb_dim, 
            hid_dim, 
            n_layers, 
            batch_first=True, 
            dropout=dropout if n_layers > 1 else 0,
            bidirectional=bidirectional
        )
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        
        # Process hidden state based on RNN type
        if self.cell_type == 'LSTM':
            
            h_n, c_n = hidden          
            if self.bidirectional:
 
                h_n = h_n.view(self.n_layers, self.n_directions, -1, self.hid_dim)
                h_n = h_n.sum(dim=1)  # Sum the bidirectional outputs
                
                c_n = c_n.view(self.n_layers, self.n_directions, -1, self.hid_dim)
                c_n = c_n.sum(dim=1)  # Sum the bidirectional outputs
                
            return (h_n, c_n)
        else:
            if self.bidirectional:
                hidden = hidden.view(self.n_layers, self.n_directions, -1, self.hid_dim)
                hidden = hidden.sum(dim=1)  
                
            return hidden

In [None]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, cell_type, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type.upper()
        self.n_layers = n_layers
        self.hid_dim = hid_dim
        
        rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[self.cell_type]
        self.rnn = rnn_cls(
            emb_dim, 
            hid_dim, 
            n_layers, 
            batch_first=True, 
            dropout=dropout if n_layers > 1 else 0
        )
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
    def forward(self, input, hidden):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden


In [9]:
class Seq2Seq(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, emb_dim, hid_dim, enc_layers, dec_layers, dropout, cell_type, device, bidirectional=False):
        super().__init__()
        self.device = device
        self.encoder = Encoder(
            src_vocab_size, 
            emb_dim, 
            hid_dim, 
            enc_layers, 
            dropout, 
            cell_type, 
            pad_idx=src_vocab.char2idx['<pad>'],
            bidirectional=bidirectional
        )
        self.decoder = Decoder(
            tgt_vocab_size, 
            emb_dim, 
            hid_dim, 
            dec_layers, 
            dropout, 
            cell_type, 
            pad_idx=tgt_vocab.char2idx['<pad>']
        )
        self.tgt_vocab_size = tgt_vocab_size
        self.cell_type = cell_type.upper()
    
    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        max_len = tgt.size(1)
        outputs = torch.zeros(batch_size, max_len, self.tgt_vocab_size).to(self.device)

        encoder_hidden = self.encoder(src)
        decoder_hidden = encoder_hidden
        
        input = tgt[:, 0]

        for t in range(1, max_len):
            output, decoder_hidden = self.decoder(input, decoder_hidden)
            outputs[:, t] = output
            teacher_force = np.random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[:, t] if teacher_force else top1
        return outputs

    def beam_search(self, src, beam_width=3, max_len=50):
        batch_size = src.size(0)
        hidden = self.encoder(src)
        decoder_hidden = hidden
        input = torch.tensor([tgt_vocab.char2idx['<sos>']] * batch_size).to(self.device)

        decoded_words = [[] for _ in range(batch_size)]
        for _ in range(max_len):
            output, decoder_hidden = self.decoder(input, decoder_hidden)
            top1 = output.argmax(1)
            input = top1
            for i in range(batch_size):
                decoded_words[i].append(top1[i].item())
        decoded_sentences = [tgt_vocab.decode(seq) for seq in decoded_words]
        return decoded_sentences

In [None]:

def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)
        output_dim = output.shape[-1]
        output = output[:,1:,:].reshape(-1, output_dim)
        tgt = tgt[:,1:].reshape(-1)
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

def word_accuracy(model, loader, src_vocab, tgt_vocab, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            batch_size = src.size(0)
            outputs = model(src, tgt, teacher_forcing_ratio=0)
            # Get predicted indices: max over vocab dimension
            pred_indices = outputs.argmax(dim=2)  # batch x seq_len
            
            for i in range(batch_size):
                pred_word = tgt_vocab.decode(pred_indices[i].cpu().numpy())
                true_word = tgt_vocab.decode(tgt[i].cpu().numpy())
                if pred_word == true_word:
                    correct += 1
                total += 1
    return correct / total

def evaluate(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            output = output[:,1:,:].reshape(-1, output_dim)
            tgt = tgt[:,1:].reshape(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)




In [24]:

sweep_config = {
    'method': 'bayes',
    'name': 'sweep - no attention',
    'metric': {
        'goal': 'maximize',
        'name': 'validation_accuracy'
    },
    'parameters': {
        'input_embedding_size': {
            'values': [64, 128]  # 16,32,64,
        },
        'enc_layers': {
            'values': [1, 2, 3]
        },
        'dec_layers': {
            'values': [1, 2, 3]
        },
        'emb_hidden_size': {
            'values': [64, 128, 256]
        },
        'hidden_size': {
            'values': [64, 128, 256]
        },
        'cell_type': {
            'values': ['LSTM', 'RNN', 'GRU']
        },
        'bidirectional': {
            'values': [True,False]
        },
        'dropout': {
            'values': [0.1, 0.2, 0.3]
        },
        'beam_size': {
            'values': [1, 3, 5]
        }
    }
}

sweep_id = wandb.sweep(sweep=sweep_config, project='MA23M015_DL_Assignment3')

Create sweep with ID: hrvozery
Sweep URL: https://wandb.ai/iitm-ma23m015/MA23M015_DL_Assignment3/sweeps/hrvozery


In [None]:
def main():
    with wandb.init() as run:
        config = wandb.config
        
        BATCH_SIZE = 64
        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
        dev_loader = DataLoader(dev_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
        test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
        
        wandb.run.name = f'cell-{config.cell_type}_emb_hid_sz-{config.emb_hidden_size}_hid_size-{config.hidden_size}_inp_embed-{config.input_embedding_size}_enc-{config.enc_layers}_dec-{config.dec_layers}_dropout-{config.dropout}'
        
        DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        model = Seq2Seq(
            src_vocab_size=len(src_vocab),
            tgt_vocab_size=len(tgt_vocab),
            emb_dim=config.emb_hidden_size,
            hid_dim=config.hidden_size,
            enc_layers=config.enc_layers,
            dec_layers=config.enc_layers,  # note fix here, before you used enc_layers twice
            dropout=config.dropout,
            cell_type=config.cell_type,
            device=DEVICE,
            bidirectional=config.bidirectional
        ).to(DEVICE)
        
        optimizer = torch.optim.Adam(model.parameters())
        criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.char2idx['<pad>'])
        
        EPOCHS = 2
        for epoch in range(1, EPOCHS + 1):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, DEVICE)
            train_acc = word_accuracy(model, train_loader, src_vocab, tgt_vocab, DEVICE)
            
            val_loss = evaluate(model, dev_loader, criterion, DEVICE)
            val_acc = word_accuracy(model, dev_loader, src_vocab, tgt_vocab, DEVICE)
            
            print(f"Epoch {epoch}: Train Loss={train_loss:.4f} Train Acc={train_acc:.4f} "
                  f"Val Loss={val_loss:.4f} Val Acc={val_acc:.4f}")
            
            wandb.log({
                'Epoch': epoch,
                'train_loss': train_loss,
                'Train_accuracy': train_acc * 100,
                'validation_loss': val_loss,
                'validation_accuracy': val_acc * 100
            })
wandb.agent(sweep_id, function=main, count=1)
wandb.finish()


In [30]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np


EMB_DIM = 256           
HID_DIM = 256            
ENC_LAYERS = 3           
DEC_LAYERS = 3          
DROPOUT = 0.3            
CELL_TYPE = 'LSTM'       
BATCH_SIZE = 64
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


train_path = "/kaggle/input/tamil-translit/ta.translit.sampled.train.tsv"
dev_path = "/kaggle/input/tamil-translit/ta.translit.sampled.dev.tsv"
test_path = "/kaggle/input/tamil-translit/ta.translit.sampled.test.tsv"

train_df = pd.read_csv(train_path, sep="\t", header=None, names=["target","source","freq"])
dev_df = pd.read_csv(dev_path, sep="\t", header=None, names=["target","source","freq"])
test_df = pd.read_csv(test_path, sep="\t", header=None, names=["target","source","freq"])

train_df = train_df.dropna(subset=['source','target'])
dev_df = dev_df.dropna(subset=['source','target'])
test_df = test_df.dropna(subset=['source','target'])

train_pairs = [(str(s), str(t)) for s,t in zip(train_df.source, train_df.target)]
dev_pairs = [(str(s), str(t)) for s,t in zip(dev_df.source, dev_df.target)]
test_pairs = [(str(s), str(t)) for s,t in zip(test_df.source, test_df.target)]


class CharVocab:
    def __init__(self, sequences):
        self.char2idx = {'<pad>':0, '<sos>':1, '<eos>':2, '<unk>':3}
        self.idx2char = ['<pad>', '<sos>', '<eos>', '<unk>']
        chars = set(''.join(sequences))
        for ch in sorted(chars):
            self.char2idx[ch] = len(self.idx2char)
            self.idx2char.append(ch)
    def encode(self, text):
        return [self.char2idx.get(c, self.char2idx['<unk>']) for c in text]
    def decode(self, indices):
        result = []
        for idx in indices:
            if idx == self.char2idx['<eos>']:
                break
            if idx not in (self.char2idx['<pad>'], self.char2idx['<sos>']):
                result.append(self.idx2char[idx])
        return ''.join(result)
    def __len__(self):
        return len(self.idx2char)

src_vocab = CharVocab([s for s,_ in train_pairs])
tgt_vocab = CharVocab([t for _,t in train_pairs])

# --- Dataset ---
class TransliterationDataset(Dataset):
    def __init__(self, pairs, src_vocab, tgt_vocab):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        src_enc = torch.tensor(self.src_vocab.encode(src), dtype=torch.long)
        tgt_enc = torch.tensor([self.tgt_vocab.char2idx['<sos>']] + self.tgt_vocab.encode(tgt) + [self.tgt_vocab.char2idx['<eos>']], dtype=torch.long)
        return src_enc, tgt_enc

def collate_fn(batch):
    src_seqs, tgt_seqs = zip(*batch)
    src_padded = pad_sequence(src_seqs, batch_first=True, padding_value=src_vocab.char2idx['<pad>'])
    tgt_padded = pad_sequence(tgt_seqs, batch_first=True, padding_value=tgt_vocab.char2idx['<pad>'])
    return src_padded, tgt_padded

train_ds = TransliterationDataset(train_pairs, src_vocab, tgt_vocab)
dev_ds = TransliterationDataset(dev_pairs, src_vocab, tgt_vocab)
test_ds = TransliterationDataset(test_pairs, src_vocab, tgt_vocab)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)



class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, cell_type, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type.upper()
        rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[self.cell_type]
        self.rnn = rnn_cls(emb_dim, hid_dim, n_layers, batch_first=True, dropout=dropout if n_layers > 1 else 0)
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, cell_type, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type.upper()
        rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[self.cell_type]
        self.rnn = rnn_cls(emb_dim, hid_dim, n_layers, batch_first=True, dropout=dropout if n_layers > 1 else 0)
        self.fc_out = nn.Linear(hid_dim, output_dim)
    def forward(self, input, hidden):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, emb_dim, hid_dim, enc_layers, dec_layers, dropout, cell_type, device):
        super().__init__()
        self.device = device
        self.encoder = Encoder(src_vocab_size, emb_dim, hid_dim, enc_layers, dropout, cell_type, pad_idx=src_vocab.char2idx['<pad>'])
        self.decoder = Decoder(tgt_vocab_size, emb_dim, hid_dim, dec_layers, dropout, cell_type, pad_idx=tgt_vocab.char2idx['<pad>'])
        self.tgt_vocab_size = tgt_vocab_size
        self.cell_type = cell_type.upper()
    
    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        max_len = tgt.size(1)
        outputs = torch.zeros(batch_size, max_len, self.tgt_vocab_size).to(self.device)

        hidden = self.encoder(src)
        decoder_hidden = hidden
        
        input = tgt[:, 0]

        for t in range(1, max_len):
            output, decoder_hidden = self.decoder(input, decoder_hidden)
            outputs[:, t] = output
            teacher_force = np.random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[:, t] if teacher_force else top1
        return outputs

    def beam_search(self, src, beam_width=3, max_len=50):
        # Simple greedy decoder as a placeholder for beam search
        batch_size = src.size(0)
        hidden = self.encoder(src)
        decoder_hidden = hidden
        input = torch.tensor([tgt_vocab.char2idx['<sos>']] * batch_size).to(self.device)

        decoded_words = [[] for _ in range(batch_size)]
        for _ in range(max_len):
            output, decoder_hidden = self.decoder(input, decoder_hidden)
            top1 = output.argmax(1)
            input = top1
            for i in range(batch_size):
                decoded_words[i].append(top1[i].item())
        decoded_sentences = [tgt_vocab.decode(seq) for seq in decoded_words]
        return decoded_sentences


def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)
        output_dim = output.shape[-1]
        output = output[:,1:,:].reshape(-1, output_dim)
        tgt = tgt[:,1:].reshape(-1)
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

def word_accuracy(model, loader, src_vocab, tgt_vocab, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            batch_size = src.size(0)
            outputs = model(src, tgt, teacher_forcing_ratio=0)
            # Get predicted indices: max over vocab dimension
            pred_indices = outputs.argmax(dim=2)  # batch x seq_len
            
            for i in range(batch_size):
                pred_word = tgt_vocab.decode(pred_indices[i].cpu().numpy())
                true_word = tgt_vocab.decode(tgt[i].cpu().numpy())
                if pred_word == true_word:
                    correct += 1
                total += 1
    return correct / total


def evaluate(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            output = output[:,1:,:].reshape(-1, output_dim)
            tgt = tgt[:,1:].reshape(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)


model = Seq2Seq(len(src_vocab), len(tgt_vocab), EMB_DIM, HID_DIM, ENC_LAYERS, DEC_LAYERS, DROPOUT, CELL_TYPE, DEVICE).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.char2idx['<pad>'])


EPOCHS = 5
for epoch in range(1, EPOCHS+1):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, DEVICE)
    train_acc = word_accuracy(model, train_loader, src_vocab, tgt_vocab,DEVICE)  # train accuracy
    
    val_loss = evaluate(model, dev_loader, criterion, DEVICE)
    val_acc = word_accuracy(model, dev_loader, src_vocab, tgt_vocab,DEVICE)      # val accuracy
    
    print(f"Epoch {epoch}: Train Loss={train_loss:.4f} Train Acc={train_acc:.4f} "
          f"Val Loss={val_loss:.4f} Val Acc={val_acc:.4f}")

import pandas as pd
from pathlib import Path
from tqdm import tqdm  

def collect_predictions(model, dataloader, src_vocab, tgt_vocab, device):
    model.eval()
    out = []

    with torch.no_grad():
        for src, tgt in tqdm(dataloader, desc="Predicting"):
            src, tgt = src.to(device), tgt.to(device)
            outputs = model(src, tgt, teacher_forcing_ratio=0)        # (batch, seq, vocab)
            pred_ids = outputs.argmax(dim=2).cpu().numpy()            # np array

            for i in range(src.size(0)):
                translit = src_vocab.decode(src[i].cpu().numpy())
                true_w   = tgt_vocab.decode(tgt[i].cpu().numpy())
                pred_w   = tgt_vocab.decode(pred_ids[i])
                out.append((translit, true_w, pred_w))

    return out


pred_tuples = collect_predictions(model, test_loader, src_vocab, tgt_vocab, DEVICE)


df = pd.DataFrame(pred_tuples,
                  columns=["Transliteration", "trueword", "predicted_word"])

out_file = Path("test_predictions.csv")
df.to_csv(out_file, index=False, encoding="utf-8-sig")
print(f"\n Saved full results to {out_file.resolve()}")


test_acc = (df["trueword"] == df["predicted_word"]).mean()
print(f"\nWord‑level **test accuracy**: {test_acc:.4f}")


print("\n── Last 100 predictions ──")
print(df.tail(100).to_string(index=False))


Epoch 1: Train Loss=1.8904 Train Acc=0.2765 Val Loss=1.1854 Val Acc=0.2587
Epoch 2: Train Loss=0.6511 Train Acc=0.5545 Val Loss=0.8765 Val Acc=0.4615
Epoch 3: Train Loss=0.4434 Train Acc=0.6654 Val Loss=0.7768 Val Acc=0.5093
Epoch 4: Train Loss=0.3491 Train Acc=0.7275 Val Loss=0.7613 Val Acc=0.5382
Epoch 5: Train Loss=0.2930 Train Acc=0.7724 Val Loss=0.7430 Val Acc=0.5406


Predicting: 100%|██████████| 108/108 [00:01<00:00, 70.81it/s]


 Saved full results to /kaggle/working/test_predictions.csv

Word‑level **test accuracy**: 0.5347

── Last 100 predictions ──
  Transliteration       trueword predicted_word
     vaelaikkaana      வேலைக்கான      வேலைக்கான
       vaelaikkup     வேலைக்குப்     வேலைக்குப்
       vaelaikkum     வேலைக்கும்     வேலைக்கும்
         vaelaich         வேலைச்         வேலைச்
       paarppavar         வேலைப்     பார்ப்பவர்
        paarththu         வேலைப்       பார்த்து
        parppavar         வேலைப்        பற்பவர்
      vaelaiyaaga        வேலையாக        வேலையாக
      vaelaiyaaka        வேலையாக        வேலையாக
      vealaiyaaka        வேலையாக        வேலையாக
        vaelaiyin       வேலையின்       வேலையின்
        vealaiyin       வேலையின்       வேலையின்
        vaelaiyil       வேலையில்       வேலையில்
        vealaiyil       வேலையில்       வேலையில்
     vaelaiyaiyum     வேலையையும்     வேலையையும்
     vealaiyaiyum     வேலையையும்     வேலையையும்
              vai             வை             வை
      vai


