## Importing Libraries and models

In [1]:
%%capture
!pip install wandb

In [2]:
!nvidia-smi

Thu Oct 16 13:58:31 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.5     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   58C    P0             132W / 300W |  12488MiB / 81920MiB |     17%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:31:00.0 Off |  

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import wandb
import random
import pandas as pd
import torch
import time
import numpy as np
import torch.nn as nn
from torch import optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

[1760645402.556510] [bfd74565809f:110106:f]        vfs_fuse.c:281  UCX  ERROR inotify_add_watch(/tmp) failed: No space left on device


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()
print(device)

cuda


## Load Dataset

In [3]:
class Language:
    def __init__(self, name):
        self.name = name
        self.char2index = {'#': 0, '$': 1, '^': 2}   # '^': start of sequence, '$' : unknown char, '#' : padding
        self.index2char = {0: '#', 1: '$', 2: '^'}
        self.vocab_size = 3  # Count

    def addWord(self, word):
        for char in word:
            self.addChar(char)

    def addChar(self, char):
        if char not in self.char2index:
            self.char2index[char] = self.vocab_size
            self.index2char[self.vocab_size] = char
            self.vocab_size += 1

    def encode(self, s):
        return [self.char2index[ch] for ch in s]

    def decode(self, l):
        return ''.join([self.index2char[i] for i in l])

    def vocab(self):
        return self.char2index.keys()


In [4]:
# returns maximum length of input and output words
def maxLength(data):
    ip_mlen, op_mlen = 0, 0

    for i in range(len(data)):
        input = data[0][i]
        output = data[1][i]
        if(len(input)>ip_mlen):
            ip_mlen=len(input)

        if(len(output)>op_mlen):
            op_mlen=len(output)

    return ip_mlen, op_mlen

In [5]:
import pandas as pd

def getMaxLengthValues(lang):
    base_path = f"../../aks_dataset/{lang}"
    
    # Load datasets
    train_df = pd.read_csv(f"{base_path}/train.csv", header=None)
    val_df = pd.read_csv(f"{base_path}/valid.csv", header=None)
    test_df = pd.read_csv(f"{base_path}/test.csv", header=None)

    # Initialize language vocabularies
    input_lang = Language('eng')
    output_lang = Language(lang)
    
    # Build vocabulary only from train data
    for _, row in train_df.iterrows():
        input_lang.addWord(str(row[0]))
        output_lang.addWord(str(row[1]))
    
    # Compute max input/output lengths for each split
    m1, m01 = maxLength(train_df)
    m2, m02 = maxLength(test_df)
    m3, m03 = maxLength(val_df)

    # Return the largest values across all splits
    return max(m1, m2, m3), max(m01, m02, m03)

# Example usage
input_max_len, output_max_len = getMaxLengthValues('hin')
print(input_max_len, output_max_len)

29 26


In [6]:
input_shape = 0
def preprocess(data, input_lang, output_lang, input_max_len, output_max_len, s=''):

    unknown = input_lang.char2index['$']

    n = len(data)
    input = torch.zeros((n, input_max_len + 1), device = device)
    output = torch.zeros((n, output_max_len + 2), device = device)

    for i in range(n):

        inp = data[0][i].ljust(input_max_len + 1, '#')
        op = '^' + data[1][i]       # add start symbol to output
        op = op.ljust(output_max_len + 2, '#')

        for index, char in enumerate(inp):
            if char in input_lang.char2index:
                input[i][index] = input_lang.char2index[char]
            else:
                input[i][index] = unknown

        for index, char in enumerate(op):
            if char in output_lang.char2index:
                output[i][index] = output_lang.char2index[char]
            else:
                output[i][index] = unknown

    print(s, ' dataset')
    print(input.shape)
    print(output.shape)

    return TensorDataset(input.to(torch.int32), output.to(torch.int32))

In [7]:
def load_prepare_data(lang):
    train_df = pd.read_csv(f"../../aks_dataset/{lang}/train.csv", header = None)
    val_df = pd.read_csv(f"../../aks_dataset/{lang}/valid.csv", header = None)
    test_df = pd.read_csv(f"../../aks_dataset/{lang}/test.csv", header = None)

    input_lang = Language('eng')
    output_lang = Language(lang)

    # create vocablury
    for i in range(len(train_df)):
        input_lang.addWord(train_df[0][i]) # 'eng'
        output_lang.addWord(train_df[1][i]) # 'hin'

    # encode the datasets
    train_data = preprocess(train_df, input_lang, output_lang,input_max_len, output_max_len, 'train')
    val_data = preprocess(val_df, input_lang, output_lang,input_max_len, output_max_len, 'validation')
    test_data = preprocess(test_df, input_lang, output_lang,input_max_len, output_max_len, 'test')

    return train_data, val_data, test_data, input_lang, output_lang


train_data, val_data, test_data, input_lang, output_lang = load_prepare_data('hin')

train  dataset
torch.Size([100000, 30])
torch.Size([100000, 28])
validation  dataset
torch.Size([6357, 30])
torch.Size([6357, 28])
test  dataset
torch.Size([10112, 30])
torch.Size([10112, 28])


In [8]:
print(input_lang.decode(train_data[23][0].tolist()))
output_lang.decode(train_data[23][1].tolist())

urjapurna#####################


'^उर्जापूर्ण#################'

In [9]:
train_data[23][1]

tensor([ 2, 42, 15, 18,  8, 12,  3, 29, 15, 18, 43,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0], device='cuda:0',
       dtype=torch.int32)

In [13]:
with open('api_key.txt') as input_file:
    key = input_file.read()
key = key.split('\n')[0].strip()

In [14]:
wandb.login(key =key)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msshejole132[0m ([33msshejole132-iit-bombay[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# seq2seq tranformer model

### hyperparameter settings

In [15]:
n_embd = 64
batch_size = 256
learning_rate = 1e-3
n_head = 4 # other options factors of 32 like 2, 8
n_layers = 6
dropout = 0.2
epochs = 50

# encoder specific detail
input_vocab_size = input_lang.vocab_size
encoder_block_size = len(train_data[0][0])

# decoder specific detail
output_vocab_size = output_lang.vocab_size
decoder_block_size = len(train_data[0][1])

### Encoder model

In [10]:
class Head(nn.Module):
    """ one self-attention head """

    def __init__(self, n_embd, d_k, dropout, mask=0): # d_k is dimention of key , nomaly d_k = n_embd / 4
        super().__init__()
        self.mask = mask
        self.key = nn.Linear(n_embd, d_k, bias=False, device=device)
        self.query = nn.Linear(n_embd, d_k, bias=False, device=device)
        self.value = nn.Linear(n_embd, d_k, bias=False, device=device)
        if mask:
            self.register_buffer('tril', torch.tril(torch.ones(encoder_block_size, encoder_block_size, device=device)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output = None):
        B,T,C = x.shape

        if encoder_output is not None:
            k = self.key(encoder_output)
            Be, Te, Ce = encoder_output.shape
        else:
            k = self.key(x) # (B,T,d_k)

        q = self.query(x) # (B,T,d_k)
        # compute attention scores
        wei = q @ k.transpose(-2, -1) * C**-0.5 # (B,T,T)

        if self.mask:
            if encoder_output is not None:
                wei = wei.masked_fill(self.tril[:T, :Te] == 0, float('-inf')) # (B,T,T)
            else:
                wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T)

        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        # perform weighted aggregation of values
        if encoder_output is not None:
            v = self.value(encoder_output)
        else:
            v = self.value(x)
        out = wei @ v # (B,T,C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple self attention heads in parallel """

    def __init__(self, n_embd, num_head, d_k, dropout, mask=0):
        super().__init__()
        self.heads = nn.ModuleList([Head(n_embd, d_k, dropout, mask) for _ in range(num_head)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output=None):
        out = torch.cat([h(x, encoder_output) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    """ multiple self attention heads in parallel """

    def __init__(self, n_embd, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class encoderBlock(nn.Module):
    """ Tranformer encoder block : communication followed by computation """

    def __init__(self, n_embd, n_head, dropout):
        super().__init__()
        d_k = n_embd // n_head
        self.sa = MultiHeadAttention(n_embd, n_head, d_k, dropout)
        self.ffwd = FeedForward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x, encoder_output=None):
        x = x + self.sa(self.ln1(x), encoder_output)
        x = x + self.ffwd(self.ln2(x))
        return x

class Encoder(nn.Module):

    def __init__(self, n_embd, n_head, n_layers, dropout):
        super().__init__()

        self.token_embedding_table = nn.Embedding(input_vocab_size, n_embd) # n_embd: input embedding dimension
        self.position_embedding_table = nn.Embedding(encoder_block_size, n_embd)
        self.blocks = nn.Sequential(*[encoderBlock(n_embd, n_head, dropout) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm

    def forward(self, idx):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) # (B,T,n_embd)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,n_embd)
        x = tok_emb + pos_emb # (B,T,n_embd)
        x = self.blocks(x) # apply one attention layer (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        return x


### Decoder model

In [17]:
class decoderBlock(nn.Module):
    """ Tranformer decoder block : self communication then cross communication followed by computation """

    def __init__(self, n_embd, n_head, dropout):
        super().__init__()
        d_k = n_embd // n_head
        self.sa = MultiHeadAttention(n_embd, n_head, d_k, dropout, mask = 1)
        self.ca = MultiHeadAttention(n_embd, n_head, d_k, dropout, mask = 1)
        self.ffwd = FeedForward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd, device=device)
        self.ln2 = nn.LayerNorm(n_embd, device=device)
        self.ln3 = nn.LayerNorm(n_embd, device=device)

    def forward(self, x_encoder_output):
        x = x_encoder_output[0]
        encoder_output = x_encoder_output[1]
        x = x + self.sa(self.ln1(x))
        x = x + self.ca(self.ln2(x), encoder_output)
        x = x + self.ffwd(self.ln3(x))
        return (x,encoder_output)

class Decoder(nn.Module):

    def __init__(self, n_embd, n_head, n_layers, dropout):
        super().__init__()

        self.token_embedding_table = nn.Embedding(output_vocab_size, n_embd) # n_embd: input embedding dimension
        self.position_embedding_table = nn.Embedding(decoder_block_size, n_embd)
        self.blocks = nn.Sequential(*[decoderBlock(n_embd, n_head=n_head, dropout=dropout) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, output_vocab_size)

    def forward(self, idx, encoder_output, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,n_embd)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,n_embd)
        x = tok_emb + pos_emb # (B,T,n_embd)

        x =self.blocks((x, encoder_output))
        x = self.ln_f(x[0]) # (B,T,C)
        logits = self.lm_head(x) # (B,T,output_vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            temp_logits = logits.view(B*T, C)
            targets = targets.reshape(B*T)

            loss = F.cross_entropy(temp_logits, targets.long())

        # print(logits)
        # out = torch.argmax(logits)

        return logits, loss



# Training Time

## sweep config

In [18]:
# Define sweep config
sweep_configuration = {
    "method": "bayes",
    "name": "sweep",
    "metric": {"goal": "maximize", "name": "val_acc"},
    "parameters": {
        "batch_size": {"values": [64, 128, 256]},
        "epochs": {"values": [20, 40, 50, 100]},
        "lr": {"max": 0.1, "min": 0.0001},
        "n_embd": {"values": [16, 32, 64]},
        "n_head": {"values": [2, 4, 8]},
        "n_layers": {"values": [2]},
        "dropout": {"values": [0, .1, .2, .3]}
    },
}

sweep_id = wandb.sweep(sweep=sweep_configuration, project="Tranliteration-Tranformers")

Create sweep with ID: olbj21xb
Sweep URL: https://wandb.ai/sshejole132-iit-bombay/Tranliteration-Tranformers/sweeps/olbj21xb


In [19]:
# wandb.sweep_cancel(sweep_id)
# wandb.finish()
# wandb.run.cancel()

## train function

In [20]:
from tqdm import tqdm
def train():
    run = wandb.init()

    n_embd = wandb.config.n_embd
    n_head = wandb.config.n_head
    n_layers = wandb.config.n_layers
    dropout = wandb.config.dropout
    epochs = wandb.config.epochs
    batch_size = wandb.config.batch_size
    learning_rate = wandb.config.lr


    encoder = Encoder(n_embd, n_head, n_layers, dropout)
    decoder = Decoder(n_embd, n_head, n_layers, dropout)
    encoder.to(device)
    decoder.to(device)

    train_losses, train_accuracies, val_losses, val_accuracies = [], [], [], []

    # print the number of parameters in the model
    print(sum([p.numel() for p in encoder.parameters()] + [p.numel() for p in decoder.parameters()])/1e3, 'K model parameters')

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)

    # create a PyTorch optimizer
    encoder_optimizer = torch.optim.AdamW(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.AdamW(decoder.parameters(), lr=learning_rate)

# print('Step | Training Loss | Validation Loss   |   Training Accuracy %  |  Validation Accuracy %')

    least_error = float('inf')
    patience = 20  # The number of epochs without improvement to wait before stopping
    no_improvement = 0

    for i in tqdm(range(epochs)):
        running_loss = 0.0
        train_correct = 0

        encoder.train()
        decoder.train()

        for j,(train_x,train_y) in enumerate(train_loader):
            train_x = train_x.to(device)
            train_y = train_y.to(device)

            encoder_optimizer.zero_grad(set_to_none=True)
            decoder_optimizer.zero_grad(set_to_none=True)

            encoder_output = encoder(train_x)
            logits, loss = decoder(train_y[:, :-1], encoder_output, train_y[:, 1:])

            encoder_optimizer.zero_grad(set_to_none=True)
            decoder_optimizer.zero_grad(set_to_none=True)
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            running_loss += loss
            pred_decoder_output = torch.argmax(logits, dim=-1)
            # print(pred_decoder_output, " target: ", train_y[:, 1:])
            train_correct += (pred_decoder_output == train_y[:, 1:]).sum().item()


        ## validation code
        running_loss_val, val_correct = 0, 0
        encoder.eval()
        decoder.eval()
        for j,(val_x,val_y) in enumerate(val_loader):
            val_x = val_x.to(device)
            val_y = val_y.to(device)

            encoder_output = encoder(val_x)
            logits, loss = decoder(val_y[:, :-1], encoder_output, val_y[:, 1:])

            running_loss_val += loss
            pred_decoder_output = torch.argmax(logits, dim=-1)
            val_correct += torch.sum(pred_decoder_output == val_y[:, 1:])


        if running_loss_val < least_error:
            least_error = running_loss_val
            no_improvement = 0
        else:
            no_improvement += 1

        if no_improvement >= patience:
            print(f"Early stopping at epoch {i}")
            break

        wandb.log(
            {
                "train_loss": running_loss / len(train_data),
                "val_loss": (running_loss_val/len(val_data)),
                "train_acc": ((train_correct*100) / (len(train_data)* (decoder_block_size-1))),
                "val_acc": ((val_correct*100)/(len(val_data)* (decoder_block_size-1))),
            }
        )

## run sweep

In [None]:
wandb.agent(sweep_id=sweep_id, function=train)

[34m[1mwandb[0m: Agent Starting Run: 3o6074mu with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	lr: 0.06495675751598949
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_head: 4
[34m[1mwandb[0m: 	n_layers: 2


246.787 K model parameters


 72%|███████████████████████████████████████████████████████████▊                       | 36/50 [24:27<09:30, 40.78s/it]

Early stopping at epoch 36





0,1
train_acc,▁▂▃▄▄▅▅▄▄▄▄▄▅▆▂▄▇▇▆▅▄▅▃▃▃▅▃▃▃▃▄▄▅▅▇█
train_loss,█▅▅▄▄▃▃▄▃▅▄▄▄▂█▅▂▁▃▃▃▃▅▄▄▃▆▅▅▅▄▄▃▃▂▁
val_acc,▂▂▃▄▅▄▂▄▃▄▄▄▅▆▃▄█▅▄▂▃▁▁▃▃▅▂▃▃▂▃▃▆▆▆▆
val_loss,▇▆▅▅▄▅▆▆▅▅▅▄▄▃█▄▁▄▅▅▅▇▇▅▅▄█▆▆▆▅▆▃▃▄▂

0,1
train_acc,75.60189
train_loss,0.00719
val_acc,79.19354
val_loss,0.00601


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: f2qrfdvz with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	lr: 0.06325632562397139
[34m[1mwandb[0m: 	n_embd: 32
[34m[1mwandb[0m: 	n_head: 8
[34m[1mwandb[0m: 	n_layers: 2


66.083 K model parameters


 56%|█████████████████████████████████████████████▉                                    | 56/100 [54:35<42:53, 58.49s/it]

Early stopping at epoch 56





0,1
train_acc,▂▃▄▅▆▃▁▂▄▄▆▅▃▂▃▄▄▂▃▂▇▅▂▂▂▆█▇▄▅▅▅▂▃▄▆▆▆▅▁
train_loss,▇▅▅▄▄█▆▅▄▃▃▆▇▆▅▄▇▆▆▅▂▆▆▇▃▁▁▂▄▄▅▄▇▅▄▃▃▃▄▇
val_acc,▄▃▅▆▄▆▄▂▄▅▅▆▂▂▃▄▄▁▁▄▆▃▄▄▅▇█▇▅█▅▇▆▂▆▅▇▇█▃
val_loss,▆▆▅▅▆▆▇▆▅▅▅▇▇▆▆▅▇█▆▅▅▅▅▅▄▁▁▃▅▄▃▇▃▆▅▃▃▃▆▇

0,1
train_acc,73.65207
train_loss,0.00762
val_acc,77.01338
val_loss,0.00657


[34m[1mwandb[0m: Agent Starting Run: 7t4l11zq with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	lr: 0.06567673788630166
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_head: 2
[34m[1mwandb[0m: 	n_layers: 2


246.787 K model parameters


 54%|████████████████████████████████████████████▎                                     | 54/100 [10:11<08:40, 11.32s/it]

Early stopping at epoch 54





0,1
train_acc,▁▄▄▃▂▅▄▄▅▄▃▃▄▄▃▄▅▆▇▇▇▂▃▅▆▇█▇▆▇▆▅▅▄▄▅▅▆▆▆
train_loss,█▅▇▆▆▄▄▄▅▅▆▄▅▄▅▄▃▂▁▂▂▆▆▅▃▂▂▁▂▃▃▃▃▅▄▃▂▂▂▃
val_acc,▃▄▃▅▄▅▆▃▅▅▂▄▂▄▄▄▄▅▅▆▅▆▁▂▅▆▇█▆▆▄▅▄▅▄▅▄▅▇▆
val_loss,▆█▅▅▆▃▅▄▄█▆▅▆▅▅▅▃▃▃▃▇▆▄▃▃▂▁▂▃▂▄▄▅▄▅▃▃▂▂▃

0,1
train_acc,73.68726
train_loss,0.00389
val_acc,77.90537
val_loss,0.00325


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: sprn15hj with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	lr: 0.02257347420160668
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_head: 8
[34m[1mwandb[0m: 	n_layers: 2


246.787 K model parameters


100%|███████████████████████████████████████████████████████████████████████████████████| 50/50 [21:59<00:00, 26.40s/it]


0,1
train_acc,▁▄▆▆▇▇▇▇▇▇██████████████████████████████
train_loss,█▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▆▇▇▇▇▇▇██████████████████████████████
val_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
train_acc,95.55844
train_loss,0.00051
val_acc,96.42797
val_loss,0.00043


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: dgyvqndu with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	epochs: 40
[34m[1mwandb[0m: 	lr: 0.0286256627835069
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_head: 8
[34m[1mwandb[0m: 	n_layers: 2


246.787 K model parameters


100%|███████████████████████████████████████████████████████████████████████████████████| 40/40 [17:46<00:00, 26.65s/it]


0,1
train_acc,▁▃▄▄▄▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇████████████████
train_loss,█▆▅▅▅▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████▇█▇▇
val_loss,█▆▆▅▅▅▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂

0,1
train_acc,87.06459
train_loss,0.00166
val_acc,91.00903
val_loss,0.0011


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 4mw8e64o with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	lr: 0.011861753734323697
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_head: 8
[34m[1mwandb[0m: 	n_layers: 2


246.787 K model parameters


100%|███████████████████████████████████████████████████████████████████████████████████| 50/50 [22:43<00:00, 27.27s/it]


0,1
train_acc,▁▆▇▇▇▇▇▇████████████████████████████████
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▄▅▆▆▆▇▆▇▇▇▇▇▇▇▇███████▇████████████████
val_loss,█▅▄▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
train_acc,96.10641
train_loss,0.00044
val_acc,96.68374
val_loss,0.0004


[34m[1mwandb[0m: Agent Starting Run: cgkm9666 with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	epochs: 40
[34m[1mwandb[0m: 	lr: 0.007296185855534168
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_head: 8
[34m[1mwandb[0m: 	n_layers: 2


246.787 K model parameters


100%|███████████████████████████████████████████████████████████████████████████████████| 40/40 [18:48<00:00, 28.22s/it]


0,1
train_acc,▁▆▇▇▇▇▇▇▇▇▇▇████████████████████████████
train_loss,█▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▄▅▆▆▆▆▇▆▇▇▇▇▇▇█▇▇▇▇█▇████▇▇███████▇████
val_loss,█▅▄▃▃▂▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂

0,1
train_acc,97.40063
train_loss,0.00029
val_acc,96.85211
val_loss,0.0004


[34m[1mwandb[0m: Agent Starting Run: x7lyojxw with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	epochs: 40
[34m[1mwandb[0m: 	lr: 0.015215530714441816
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_head: 8
[34m[1mwandb[0m: 	n_layers: 2


246.787 K model parameters


100%|███████████████████████████████████████████████████████████████████████████████████| 40/40 [16:11<00:00, 24.28s/it]


0,1
train_acc,▁▆▇▇▇███████████████████████████████████
train_loss,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▆▇▇▇▇▇▇██▇█▇█████████████████████████
val_loss,█▃▃▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
train_acc,96.97322
train_loss,0.00034
val_acc,96.7822
val_loss,0.0004


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ol3n0jph with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.0023829004625305324
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_head: 8
[34m[1mwandb[0m: 	n_layers: 2


246.787 K model parameters


100%|███████████████████████████████████████████████████████████████████████████████████| 20/20 [07:45<00:00, 23.27s/it]


0,1
train_acc,▁▆▇▇▇▇▇█████████████
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▆▇▇▇▇▇▇██████████
val_loss,█▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁

0,1
train_acc,97.05419
train_loss,0.00033
val_acc,96.65926
val_loss,0.00042


[34m[1mwandb[0m: Agent Starting Run: 86triun7 with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	epochs: 40
[34m[1mwandb[0m: 	lr: 0.030047885208825384
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_head: 8
[34m[1mwandb[0m: 	n_layers: 2


246.787 K model parameters


100%|███████████████████████████████████████████████████████████████████████████████████| 40/40 [19:19<00:00, 28.99s/it]


0,1
train_acc,▁▃▃▃▃▃▃▃▄▄▄▄▅▄▄▅▅▅▅▅▅▆▆▇▇▇▇█████▇▅▄▄▅▅▅▅
train_loss,█▆▆▆▅▆▅▆▅▅▅▅▄▅▄▄▄▄▄▄▃▃▂▂▂▂▁▁▁▁▁▁▂▃▄▄▄▄▄▄
val_acc,▁▂▂▂▂▂▂▂▃▃▃▄▄▃▄▄▄▄▄▅▅▆▆▇▇▇█▇███▇▇▄▄▄▄▄▄▄
val_loss,█▇▆▆▆▇▆▆▆▆▅▅▅▅▅▅▅▄▅▄▃▃▂▂▂▂▁▁▁▁▁▂▂▅▅▅▄▄▄▅

0,1
train_acc,83.87774
train_loss,0.00216
val_acc,86.87244
val_loss,0.00176


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ay9in1mq with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	lr: 0.0006086129056137431
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_head: 8
[34m[1mwandb[0m: 	n_layers: 2


246.787 K model parameters


 66%|██████████████████████████████████████████████████████▊                            | 33/50 [14:26<07:13, 25.50s/it]

# Test Time
Since this is the best model(validation accuracy) , we will train it on both train and validation data.
We will then test the model on test data

## Best Hyperparameter from validation

In [11]:
n_embd = 64
batch_size = 256
learning_rate = 0.003699
n_head = 4
n_layers = 6
dropout = 0
epochs = 20

encoder = Encoder(n_embd, n_head, n_layers, dropout)
decoder = Decoder(n_embd, n_head, n_layers, dropout)
encoder.to(device)
decoder.to(device)
print("✅ Hyperparameters set successfully")

NameError: name 'input_vocab_size' is not defined

## Train on train_data + val_data

In [None]:

# print the number of parameters in the model
print(sum([p.numel() for p in encoder.parameters()] + [p.numel() for p in decoder.parameters()])/1e3, 'K model parameters')

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)

# create a PyTorch optimizer
encoder_optimizer = torch.optim.AdamW(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.AdamW(decoder.parameters(), lr=learning_rate)

# print('Step | Training Loss | Validation Loss   |   Training Accuracy %  |  Validation Accuracy %')

least_error = float('inf')
patience = 20  # The number of epochs without improvement to wait before stopping
no_improvement = 0

for i in range(epochs):
    running_loss = 0.0
    train_correct = 0

    encoder.train()
    decoder.train()

    for j,(train_x,train_y) in enumerate(train_loader):
        train_x = train_x.to(device)
        train_y = train_y.to(device)

        encoder_optimizer.zero_grad(set_to_none=True)
        decoder_optimizer.zero_grad(set_to_none=True)

        encoder_output = encoder(train_x)
        logits, loss = decoder(train_y[:, :-1], encoder_output, train_y[:, 1:])

        encoder_optimizer.zero_grad(set_to_none=True)
        decoder_optimizer.zero_grad(set_to_none=True)
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

        running_loss += loss
        pred_decoder_output = torch.argmax(logits, dim=-1)
        # print(pred_decoder_output, " target: ", train_y[:, 1:])
        train_correct += (pred_decoder_output == train_y[:, 1:]).sum().item()

    for j,(train_x,train_y) in enumerate(val_loader):
        train_x = train_x.to(device)
        train_y = train_y.to(device)

        encoder_optimizer.zero_grad(set_to_none=True)
        decoder_optimizer.zero_grad(set_to_none=True)

        encoder_output = encoder(train_x)
        logits, loss = decoder(train_y[:, :-1], encoder_output, train_y[:, 1:])

        encoder_optimizer.zero_grad(set_to_none=True)
        decoder_optimizer.zero_grad(set_to_none=True)
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

        running_loss += loss
        pred_decoder_output = torch.argmax(logits, dim=-1)
        # print(pred_decoder_output, " target: ", train_y[:, 1:])
        train_correct += (pred_decoder_output == train_y[:, 1:]).sum().item()


    metrics = {
            "train_loss": running_loss.cpu().detach().numpy() / (len(train_data)+len(val_data)),
            "train_acc": ((train_correct*100) / ((len(train_data)+len(val_data))* (decoder_block_size-1))),
        }
    if i % 5 == 0:
        print("Step: ",i)
        print("train_loss: ", metrics["train_loss"])
        print("train_acc: ", metrics["train_acc"])

In [28]:
PATH = 'models/transformer6-layer-encoder.pth'
torch.save(encoder, PATH)
PATH = 'models/transformer-6-layer-decoder.pth'
torch.save(decoder, PATH)

## generate output sequence

In [29]:
def generate(input):
    B, T = input.shape
    encoder_output = encoder(input)
    idx = torch.full((B, 1), 2, dtype=torch.long, device=device) # (B,1)

    # idx is (B, T) array of indices in the current context
    for _ in range(decoder_block_size-1):
        # get the predictions
        logits, loss = decoder(idx, encoder_output) # logits (B, T, vocab_size)
        # focus only on the last time step
        logits = logits[:, -1, :] # becomes (B, C)
        # apply softmax to get probabilities
        idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (B, 1)
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

## Check Test Accuracy

In [12]:
def check():
## validation code
    running_loss_val, val_correct = 0, 0
    encoder.eval()
    decoder.eval()
    test_loader = DataLoader(test_data, batch_size=64, shuffle=True)
    for _ in range(50):
        val_x,val_y = next(iter(test_loader))

        val_x = val_x.to(device)
        val_y = val_y.to(device)

        output = generate(val_x)

        encoder_output = encoder(val_x)
        logits, loss = decoder(val_y[:, :-1], encoder_output, val_y[:, 1:])

        running_loss_val += loss
        # checking val_correct for the whole sequence
        val_correct += torch.sum(torch.sum(output[:, 1:] != val_y[:, 1:], dim=-1) == 0)

    print("test accuracy(word level) : ", ((val_correct.cpu().detach().numpy()*100) / len(test_data)))

check()

NameError: name 'encoder' is not defined

# Plotting the Attention HeatMaps

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.font_manager import FontProperties
tel_font = FontProperties(fname = 'TiroDevanagariHindi-Regular.ttf')
# Assuming you have attention_weights of shape (batch_size, output_sequence_length, batch_size, input_sequence_length)
# and prediction_matrix of shape (batch_size, output_sequence_length)
# and input_matrix of shape (batch_size, input_sequence_length)

# Define the grid dimensions
rows = int(np.ceil(np.sqrt(12)))
cols = int(np.ceil(12 / rows))

# Create a figure and subplots
fig, axes = plt.subplots(rows, cols, figsize=(9, 9))

for i, ax in enumerate(axes.flatten()):
    if i < 12:
        prediction = [opLang.index2char[j.item()] for j in pred[i+1]]

        pred_word=""
        input_word=""

        for j in range(len(prediction)):
            # Ignore padding
            if(prediction[j] != '#'):
                pred_word += prediction[j]
            else :
                break
        input_seq = [ipLang.index2char[j.item()] for j in testData[i][0]]

        for j in range(len(input_seq)):
            if(input_seq[j] != '#'):
                    input_word += input_seq[j]
            else :
                break
        attn_weights = atten_weights[i, :len(pred_word), :len(input_word)].detach().cpu().numpy()
        ax.imshow(attn_weights.T, cmap='hot', interpolation='nearest')
        ax.xaxis.set_label_position('top')
        ax.set_title(f'Example {i+1}')
        ax.set_xlabel('Output predicted')
        ax.set_ylabel('Input word')
        ax.set_xticks(np.arange(len(pred_word)))
        ax.set_xticklabels(pred_word, rotation = 90, fontproperties = tel_font,fontdict={'fontsize':8})
        ax.xaxis.tick_top()

        ax.set_yticks(np.arange(len(input_word)))
        ax.set_yticklabels(input_word, rotation=90)



# Adjust the spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()
wandb.init(project='CS6910_Assignment_3')

# Convert the matplotlib figure to an image
fig.canvas.draw()
image = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8')
image = image.reshape(fig.canvas.get_width_height()[::-1] + (3,))

# Log the image in wandb
wandb.log({"attention_heatmaps": [wandb.Image(image)]})