<a href="https://colab.research.google.com/github/Mustaq7777777/DL-ASSIGNMENT3/blob/main/DL_Assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup and Imports

In [None]:
#importing all necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as func
import torch.optim as optim
import torch.utils.data as data
import numpy as np
import matplotlib.pyplot as plt
import math
import os
import pandas as pd
import random
import wandb
from tqdm.auto import tqdm

# For reproducibility
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed handed to every generator.
    """
    # Pure-Python and NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, current CUDA device, then all CUDA devices
    for seed_fn in (torch.manual_seed,
                    torch.cuda.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Give up cuDNN autotuning in exchange for repeatable kernels
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed every RNG once up front so repeated runs start from the same state
seed_everything(42)

# Device selection: use the GPU when CUDA is available, otherwise the CPU.
# `device` is read as a module-level global by the model and evaluation code.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Downloading and Extracting the Dakshina Dataset

In [None]:
# Download the Dakshina dataset archive into the current working directory
# (`yes |` answers "y" to any interactive prompt)
!yes | wget "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"

# Extract the downloaded tar file into the current working directory
!yes | tar xopf dakshina_dataset_v1.0.tar

Data Loading and Processing Functions

In [None]:
def read_tsv(file_path):
    """Parse a tab-separated lexicon file into parallel word lists.

    Each line is stripped and split on tabs; column 0 is taken as the target
    (native-script) word and column 1 as the source (English) word. Lines
    with fewer than two columns are skipped.

    Args:
        file_path: Path to a UTF-8 encoded TSV file.

    Returns:
        Tuple ``(eng_words, tel_words)`` of equally long lists.
    """
    pairs = []
    with open(file_path, encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) < 2:
                continue  # blank or malformed line
            # Store as (source, target); the file lists the target first
            pairs.append((fields[1], fields[0]))

    eng_words = [source for source, _ in pairs]
    tel_words = [target for _, target in pairs]
    return eng_words, tel_words

def load_dakshina_data(language='tel', base_path=None):
    """Load the train/dev/test transliteration lexicons and derive vocabularies.

    Args:
        language: Language code used both as the sub-directory name and as
            the file-name prefix (``<language>.translit.sampled.*.tsv``).
            NOTE(review): the published Dakshina archive appears to use
            two-letter codes (e.g. ``te``) - confirm ``'tel'`` matches the
            local directory layout.
        base_path: Directory holding the three TSV files. When ``None`` the
            Kaggle location ``/kaggle/working/dakshina_dataset_v1.0/<language>/lexicons``
            is used; if that directory does not exist, the same layout
            relative to the current working directory is tried instead.

    Returns:
        A 10-tuple ``(eng_list_train, tel_list_train, eng_list_valid,
        tel_list_valid, eng_list_test, tel_list_test, eng_vocab, tel_vocab,
        max_eng_len, max_tel_len)``. The vocabularies are sorted lists of the
        distinct characters seen in the training split only; the maximum
        lengths cover all three splits (``-1`` when every split is empty).

    Raises:
        OSError: if any of the three TSV files cannot be opened.
    """
    if base_path is None:
        archive_dir = 'dakshina_dataset_v1.0'
        # Default path structure for Dakshina on Kaggle
        base_path = os.path.join('/kaggle/working', archive_dir, language, 'lexicons')
        if not os.path.isdir(base_path):
            # Outside Kaggle the archive is extracted into the current
            # working directory, so look there before giving up.
            local_path = os.path.join(archive_dir, language, 'lexicons')
            if os.path.isdir(local_path):
                base_path = local_path

    # Paths to data files
    train_file = os.path.join(base_path, f"{language}.translit.sampled.train.tsv")
    valid_file = os.path.join(base_path, f"{language}.translit.sampled.dev.tsv")
    test_file = os.path.join(base_path, f"{language}.translit.sampled.test.tsv")

    # Load data
    eng_list_train, tel_list_train = read_tsv(train_file)
    eng_list_valid, tel_list_valid = read_tsv(valid_file)
    eng_list_test, tel_list_test = read_tsv(test_file)

    # Vocabularies come from the training split only, sorted for stable ids
    eng_vocab = sorted({letter for word in eng_list_train for letter in word})
    tel_vocab = sorted({letter for word in tel_list_train for letter in word})

    def longest(*word_lists):
        """Return the longest word length across the lists, or -1 if empty."""
        return max((len(word) for words in word_lists for word in words),
                   default=-1)

    # Maximum lengths must cover every split so all words fit the matrices
    max_eng_len = longest(eng_list_train, eng_list_valid, eng_list_test)
    max_tel_len = longest(tel_list_train, tel_list_valid, tel_list_test)

    # Print summary statistics about the data
    print(f"English vocabulary size: {len(eng_vocab)}")
    print(f"Target language vocabulary size: {len(tel_vocab)}")
    print(f"Max English length: {max_eng_len}")
    print(f"Max target language length: {max_tel_len}")
    print(f"Training examples: {len(eng_list_train)}")

    return (eng_list_train, tel_list_train, eng_list_valid, tel_list_valid,
            eng_list_test, tel_list_test, eng_vocab, tel_vocab,
            max_eng_len, max_tel_len)

Data Vectorization

In [None]:
def word_to_vector(language, word, eng_vocab, tel_vocab, max_eng_len, max_tel_len):
    """Encode one word as a fixed-length list of integer token ids.

    Layout: ``[start, c1, c2, ..., 0, 0, ..., 0]`` where ``start`` is
    ``len(vocab) + 1``, each character maps to its 1-based vocabulary
    position, and ``0`` fills the remainder (it doubles as padding and as the
    end marker). Characters absent from the vocabulary contribute nothing.
    For words no longer than the maximum length the result has
    ``max_len + 2`` entries.

    Args:
        language: ``"english"`` selects the English vocabulary and length;
            any other value selects the target-language ones.
        word: The string to encode.
        eng_vocab, tel_vocab: Character lists for source and target.
        max_eng_len, max_tel_len: Longest word length per language.

    Returns:
        List of ints.
    """
    if language == "english":
        vocab, max_len = eng_vocab, max_eng_len
    else:
        vocab, max_len = tel_vocab, max_tel_len

    # Start token sits one past the last character id
    encoded = [len(vocab) + 1]

    # Character ids are 1-based so that 0 stays free for padding
    for ch in word:
        encoded.extend(pos + 1 for pos, symbol in enumerate(vocab) if symbol == ch)

    # Pad up to max_len + 1 entries, then add the closing 0
    shortfall = (max_len + 1) - len(encoded)
    if shortfall > 0:
        encoded.extend([0] * shortfall)
    encoded.append(0)
    return encoded

def prepare_matrices(eng_list, tel_list, eng_vocab, tel_vocab, max_eng_len, max_tel_len):
    """Encode two word lists into a pair of integer tensors.

    Every word is converted with ``word_to_vector``; the English rows and the
    target-language rows are each stacked into one tensor.

    Returns:
        Tuple ``(eng_tensor, tel_tensor)``.
    """
    shared_args = (eng_vocab, tel_vocab, max_eng_len, max_tel_len)

    eng_rows = [word_to_vector("english", word, *shared_args) for word in eng_list]
    tel_rows = [word_to_vector("telugu", word, *shared_args) for word in tel_list]

    return torch.tensor(eng_rows), torch.tensor(tel_rows)

Loading data

In [None]:
# Load the data: word lists per split, vocabularies and maximum word lengths.
# The result is unpacked directly instead of being bound to `data`, which
# would shadow the `torch.utils.data as data` import alias.
(eng_list_train, tel_list_train, eng_list_valid, tel_list_valid,
 eng_list_test, tel_list_test, eng_vocab, tel_vocab,
 max_eng_len, max_tel_len) = load_dakshina_data('tel')

# Prepare matrices (one row of token ids per word) for each split
eng_matrix_train, tel_matrix_train = prepare_matrices(
    eng_list_train, tel_list_train, eng_vocab, tel_vocab, max_eng_len, max_tel_len
)

eng_matrix_valid, tel_matrix_valid = prepare_matrices(
    eng_list_valid, tel_list_valid, eng_vocab, tel_vocab, max_eng_len, max_tel_len
)

eng_matrix_test, tel_matrix_test = prepare_matrices(
    eng_list_test, tel_list_test, eng_vocab, tel_vocab, max_eng_len, max_tel_len
)

print(f"Training matrices shape: English {eng_matrix_train.shape}, Telugu {tel_matrix_train.shape}")
print(f"Validation matrices shape: English {eng_matrix_valid.shape}, Telugu {tel_matrix_valid.shape}")
print(f"Test matrices shape: English {eng_matrix_test.shape}, Telugu {tel_matrix_test.shape}")

Encoder

In [None]:
class Encoder(nn.Module):
    """Character-level recurrent encoder (RNN, GRU or LSTM).

    The recurrent module is stored under ``self.rnn``, ``self.gru`` or
    ``self.lstm`` depending on ``cell_type``; any value other than ``"RNN"``
    or ``"GRU"`` selects the LSTM. The batch size is fixed at construction
    time and used both to reshape the embedded input and to size the initial
    states.
    """

    def __init__(self, input_size, embedding_size, enc_layers, hidden_size,
                 cell_type, bi_directional_bit, dropout, batch_size):
        super().__init__()

        # Plain hyper-parameters
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.enc_layers = enc_layers
        self.hidden_size = hidden_size
        self.cell_type = cell_type
        self.bi_directional_bit = bi_directional_bit
        self.batch_size = batch_size

        # Sub-modules (the dropout layer is created but not applied in forward)
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size, embedding_size)

        # Recurrent core; all three variants share the same arguments
        dims = (embedding_size, hidden_size, enc_layers)
        options = {"dropout": dropout, "bidirectional": bi_directional_bit}
        if cell_type == "RNN":
            self.rnn = nn.RNN(*dims, **options)
        elif cell_type == "GRU":
            self.gru = nn.GRU(*dims, **options)
        else:  # LSTM
            self.lstm = nn.LSTM(*dims, **options)

    def forward(self, x, hidden, cell):
        """Encode a batch of token ids.

        Args:
            x: Token ids; the embedded result is viewed as
                ``(-1, batch_size, embedding_size)``.
            hidden: Initial hidden state.
            cell: Initial cell state (only used by the LSTM).

        Returns:
            ``(output, hidden)`` for RNN/GRU, ``(output, hidden, cell)`` for
            the LSTM.
        """
        embedded = self.embedding(x).view(-1, self.batch_size, self.embedding_size)

        if self.cell_type == "RNN":
            output, hidden = self.rnn(embedded, hidden)
            return output, hidden
        if self.cell_type == "GRU":
            output, hidden = self.gru(embedded, hidden)
            return output, hidden

        # LSTM carries a cell state alongside the hidden state
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        return output, hidden, cell

    def _zero_state(self):
        """Build an all-zero state of shape (directions * layers, batch, hidden)."""
        directions = 2 if self.bi_directional_bit else 1
        return torch.zeros(directions * self.enc_layers, self.batch_size,
                           self.hidden_size, device=device)

    def initialize_hidden(self):
        """Return a zero hidden state on the global ``device``."""
        return self._zero_state()

    def initialize_cell(self):
        """Return a zero cell state (for LSTM) on the global ``device``."""
        return self._zero_state()

Bahdanau Attention Mechanism

In [None]:
class BahdanauAttention(nn.Module):
    """Additive attention: score = v^T tanh(W [decoder_state; encoder_output])."""

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear(enc_hid_dim + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights over the source positions.

        Args:
            hidden: Decoder state, ``[batch_size, dec_hid_dim]``.
            encoder_outputs: ``[src_len, batch_size, enc_hid_dim]``.

        Returns:
            Weights of shape ``[batch_size, src_len]``; each row sums to 1.
        """
        src_len = encoder_outputs.shape[0]

        # Tile the decoder state across every source position:
        # [batch_size, dec_hid_dim] -> [batch_size, src_len, dec_hid_dim]
        tiled_state = hidden.unsqueeze(1).repeat(1, src_len, 1)

        # Batch-first view of the encoder outputs:
        # [src_len, batch_size, enc_hid_dim] -> [batch_size, src_len, enc_hid_dim]
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        # Additive energy: [batch_size, src_len, dec_hid_dim]
        joined = torch.cat((tiled_state, batch_first_outputs), dim=2)
        energy = torch.tanh(self.attn(joined))

        # One scalar score per source position: [batch_size, src_len]
        scores = self.v(energy).squeeze(2)

        # Normalise over the source dimension
        return func.softmax(scores, dim=1)

Decoder (without attention)

In [None]:
class Decoder(nn.Module):
    """Single-step recurrent decoder without attention.

    The recurrent module is stored under ``self.rnn``, ``self.gru`` or
    ``self.lstm`` depending on ``cell_type``; any value other than ``"RNN"``
    or ``"GRU"`` selects the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, dec_layers,
                 dropout, cell_type, output_size):
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.dec_layers = dec_layers
        # Seq2Seq.forward reads `decoder.output_size` to size its output
        # tensor, so the attribute must exist on this decoder as well.
        self.output_size = output_size
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type
        self.embedding = nn.Embedding(input_size, embedding_size)

        # Initialize RNN based on cell type
        if cell_type == "RNN":
            self.rnn = nn.RNN(embedding_size, hidden_size, dec_layers, dropout=dropout)
        elif cell_type == "GRU":
            self.gru = nn.GRU(embedding_size, hidden_size, dec_layers, dropout=dropout)
        else:  # LSTM
            self.lstm = nn.LSTM(embedding_size, hidden_size, dec_layers, dropout=dropout)

        # Output projection from hidden state to vocabulary logits
        self.fully_conc = nn.Linear(hidden_size, output_size)

    def forward(self, x, prev_output, prev_hidden, cell=0):
        """Decode one time step.

        Args:
            x: Token ids for the current step, one per batch element.
            prev_output: Unused; accepted so this decoder can be called with
                the same positional arguments as the attention decoder.
            prev_hidden: Hidden state from the previous step.
            cell: Cell state from the previous step (LSTM only).

        Returns:
            ``(pred, hidden)`` for RNN/GRU or ``(pred, hidden, cell)`` for
            the LSTM, where ``pred`` holds the logits with the length-1
            sequence dimension removed.
        """
        # Add the length-1 sequence dimension and embed the tokens
        x = x.unsqueeze(0).int()
        embedding = self.embedding(x)
        embedding = self.dropout(embedding)

        # Pass through the appropriate RNN type
        if self.cell_type == "RNN":
            outputs, hidden = self.rnn(embedding, prev_hidden)
        elif self.cell_type == "GRU":
            outputs, hidden = self.gru(embedding, prev_hidden)
        else:  # LSTM
            outputs, (hidden, cell) = self.lstm(embedding, (prev_hidden, cell))

        # Project to vocabulary size
        pred = self.fully_conc(outputs)
        pred = pred.squeeze(0)  # Remove sequence dimension

        if self.cell_type == "GRU" or self.cell_type == "RNN":
            return pred, hidden

        return pred, hidden, cell

Decoder (with attention)

In [None]:
class AttentionDecoder(nn.Module):
    """Single-step recurrent decoder with Bahdanau (additive) attention.

    At every step the top-layer decoder state attends over the encoder
    outputs; the resulting context vector is concatenated with the token
    embedding as RNN input and again with the RNN output for the final
    projection. The recurrent module is stored under ``self.lstm``,
    ``self.gru`` or ``self.rnn`` depending on ``cell_type``.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size,
                 cell_type, dec_layers, dropout, bi_directional_bit):
        super(AttentionDecoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.cell_type = cell_type
        self.dec_layers = dec_layers
        self.bi_directional_bit = bi_directional_bit
        self.embedding_size = embedding_size
        self.dropout = nn.Dropout(dropout)

        # A bidirectional encoder emits forward and backward features side by
        # side, so its outputs are twice as wide as the hidden size.
        enc_hid_dim = hidden_size * 2 if bi_directional_bit else hidden_size

        # Embedding layer
        self.embedding = nn.Embedding(input_size, embedding_size)

        # Attention mechanism over encoder outputs, queried by decoder state
        self.attention = BahdanauAttention(enc_hid_dim, hidden_size)

        # RNN input dimension (embedding + context)
        self.rnn_input_dim = embedding_size + enc_hid_dim

        # Initialize RNN based on cell type
        if cell_type == "LSTM":
            self.lstm = nn.LSTM(self.rnn_input_dim, hidden_size, dec_layers, dropout=dropout)
        elif cell_type == "GRU":
            self.gru = nn.GRU(self.rnn_input_dim, hidden_size, dec_layers, dropout=dropout)
        else:  # RNN
            self.rnn = nn.RNN(self.rnn_input_dim, hidden_size, dec_layers, dropout=dropout)

        # Output projection (combines RNN output, context vector, and embedding)
        self.fully_conc = nn.Linear(hidden_size + enc_hid_dim + embedding_size, output_size)

    def forward(self, x, encoder_outputs, prev_hidden, cell=0):
        """Decode one time step with attention.

        Args:
            x: Token ids for the current step, ``[batch_size]``.
            encoder_outputs: ``[src_len, batch_size, enc_hid_dim]``.
            prev_hidden: Hidden state tensor ``[dec_layers, batch_size,
                hidden_size]`` from the previous step.
            cell: Cell state from the previous step (LSTM only).

        Returns:
            ``(pred, hidden)`` for RNN/GRU or ``(pred, hidden, cell)`` for
            the LSTM, where ``pred`` is ``[batch_size, output_size]``.
        """
        # The hidden state is a plain tensor for every cell type (the LSTM
        # cell state arrives separately), so the top layer is always the
        # last entry along the first dimension.
        attention_hidden = prev_hidden[-1]

        # Calculate attention weights
        attn_weights = self.attention(attention_hidden, encoder_outputs)

        # Create context vector by applying attention weights to encoder outputs
        # [batch_size, src_len] -> [batch_size, 1, src_len]
        attn_weights = attn_weights.unsqueeze(1)

        # [src_len, batch_size, enc_hid_dim] -> [batch_size, src_len, enc_hid_dim]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # [batch_size, 1, src_len] x [batch_size, src_len, enc_hid_dim] -> [batch_size, 1, enc_hid_dim]
        context = torch.bmm(attn_weights, encoder_outputs)

        # Embed input token
        x = x.unsqueeze(0)  # Add sequence dimension
        embedded = self.embedding(x)

        # Combine embedding and context for RNN input
        # [1, batch_size, emb_dim], [batch_size, 1, enc_hid_dim] -> [1, batch_size, emb_dim + enc_hid_dim]
        rnn_input = torch.cat((embedded, context.permute(1, 0, 2)), dim=2)

        # Pass through the appropriate RNN type
        if self.cell_type == "RNN":
            outputs, hidden = self.rnn(rnn_input, prev_hidden)
        elif self.cell_type == "GRU":
            outputs, hidden = self.gru(rnn_input, prev_hidden)
        else:  # LSTM
            outputs, (hidden, cell) = self.lstm(rnn_input, (prev_hidden, cell))

        # For output projection, combine RNN output, context, and embedded input
        outputs = outputs.squeeze(0)  # Remove sequence dimension
        embedded = embedded.squeeze(0)  # Remove sequence dimension
        context = context.squeeze(1)   # Remove extra dimension

        # Project to vocabulary size
        pred = self.fully_conc(torch.cat((outputs, context, embedded), dim=1))

        if self.cell_type == "GRU" or self.cell_type == "RNN":
            return pred, hidden
        else:
            return pred, hidden, cell

Seq2Seq Model

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    The decoder must expose ``output_size`` and accept
    ``(token, encoder_output, hidden[, cell])``; the encoder must expose
    ``initialize_hidden()`` and ``initialize_cell()``.
    """

    def __init__(self, decoder, encoder, cell_type, bidirectional_bit,
                 encoder_layers, decoder_layers):
        super(Seq2Seq, self).__init__()
        self.decoder = decoder
        self.encoder = encoder
        self.cell_type = cell_type
        self.bidirectional_bit = bidirectional_bit
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers

    def _bridge_state(self, state):
        """Turn a final encoder state into an initial decoder state.

        Args:
            state: ``[num_directions * encoder_layers, batch, hidden]``.

        Returns:
            Tensor of shape ``[decoder_layers, batch, hidden]``.
        """
        if self.bidirectional_bit:
            # PyTorch orders the first dimension layer-major
            # (l0_fwd, l0_bwd, l1_fwd, l1_bwd, ...), so the two directions of
            # a layer are adjacent; sum each pair.
            state = state.reshape(self.encoder_layers, 2,
                                  state.shape[1], state.shape[2]).sum(dim=1)

        layers = state.shape[0]
        if layers < self.decoder_layers:
            # Fill the extra decoder layers with copies of the top encoder
            # layer (for a 1-layer encoder this repeats the single state).
            filler = state[-1:].repeat(self.decoder_layers - layers, 1, 1)
            state = torch.cat((state, filler), dim=0)
        elif layers > self.decoder_layers:
            # Keep only the topmost encoder layers
            state = state[-self.decoder_layers:]
        return state

    def forward(self, input_seq, target, teacher_force_ratio=0.5):
        """Run the encoder once and the decoder for every target position.

        Args:
            input_seq: Source token ids, ``[src_len, batch_size]``.
            target: Target token ids, ``[tgt_len, batch_size]``; row 0 is
                the start token fed to the decoder first.
            teacher_force_ratio: Probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of
                the model's own prediction.

        Returns:
            Logits of shape ``[tgt_len, batch_size, output_size]``; position
            0 is left as zeros.
        """
        batch_size = input_seq.shape[1]
        tar_seq_length = target.shape[0]
        final_target_vocab_size = self.decoder.output_size

        # Initialize outputs tensor
        outputs = torch.zeros(tar_seq_length, batch_size,
                              final_target_vocab_size).to(device=device)

        # Initialize encoder states
        hidden = self.encoder.initialize_hidden()
        cell = self.encoder.initialize_cell()

        # Encode input sequence and adapt the final states to the decoder
        uses_cell_state = self.cell_type not in ("RNN", "GRU")
        if uses_cell_state:  # LSTM
            encoder_output, hidden, cell = self.encoder(input_seq, hidden, cell)
            cell = self._bridge_state(cell)
        else:
            encoder_output, hidden = self.encoder(input_seq, hidden, cell)
        hidden = self._bridge_state(hidden)

        # Start with first token (SOS token)
        x = target[0]

        # Generate sequence
        for t in range(1, tar_seq_length):
            # Process through decoder
            if uses_cell_state:
                output, hidden, cell = self.decoder(x, encoder_output, hidden, cell)
            else:
                output, hidden = self.decoder(x, encoder_output, hidden)

            # Store output
            outputs[t] = output

            # Teacher forcing: use target token with probability teacher_force_ratio,
            # otherwise feed back the model's own prediction
            if random.random() < teacher_force_ratio:
                x = target[t]
            else:
                x = output.argmax(1)

        return outputs

Training and Evaluation functions

In [None]:
def accuracy_fun(eng_matrix, tel_matrix, batch_size, model):
    """Compute exact-match word accuracy (in percent) on a dataset.

    A prediction counts as correct only if every position after the start
    token, including the trailing zeros, equals the target row. Only full
    batches are evaluated; rows in a trailing partial batch are never run
    through the model but still count in the denominator.

    Args:
        eng_matrix: Source token ids, one row per word.
        tel_matrix: Target token ids, one row per word.
        batch_size: Number of rows per forward pass.
        model: Callable as ``model.forward(src, tgt, teacher_force_ratio)``
            on sequence-first inputs, returning ``[tgt_len, batch, vocab]``.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` for an empty dataset.
    """
    total = len(eng_matrix)
    if total == 0:
        return 0.0

    correct = 0
    model.eval()  # Set model to evaluation mode

    with torch.no_grad():
        for batch_id in range(total // batch_size):
            # Get batch
            start = batch_size * batch_id
            inp_word = eng_matrix[start:start + batch_size].to(device=device)
            out_word = tel_matrix[start:start + batch_size].to(device=device)

            # Forward pass on sequence-first inputs with no teacher forcing
            output = model.forward(inp_word.T, out_word.T, 0)

            # Greedy prediction per position, back to batch-first layout.
            # Softmax is monotonic, so argmax over the raw scores suffices.
            predicted = torch.argmax(output, dim=2).T

            # Exact match of the whole sequence after the start token,
            # evaluated for every row of the batch
            matches = (predicted[:, 1:] == out_word[:, 1:]).all(dim=1)
            correct += int(matches.sum().item())

    # Return accuracy percentage
    return (correct * 100) / total

def vectors_to_actual_words(model, eng_matrix, tel_matrix, batch_size, eng_vocab, tel_vocab, data_type):
    """Run the model over a dataset and decode ids back into strings.

    Only full batches are processed. Ids outside ``1..len(vocab)`` (the zero
    filler and the start token) are dropped when decoding. ``data_type`` is
    accepted but not used.

    Returns:
        List of ``(source_word, predicted_word, target_word)`` tuples.
    """
    def decode(token_ids, vocab):
        """Map 1-based character ids to a string, skipping special ids."""
        limit = len(vocab) + 1
        return "".join(vocab[tok - 1] for tok in token_ids.tolist() if 0 < tok < limit)

    triples = []
    model.eval()
    n_batches = int(len(eng_matrix) / batch_size)

    with torch.no_grad():
        for batch_no in range(n_batches):
            # Slice out the current batch
            lo, hi = batch_no * batch_size, batch_size * (batch_no + 1)
            input_batch = eng_matrix[lo:hi].to(device=device)
            output_batch = tel_matrix[lo:hi].to(device=device)

            # Forward pass without teacher forcing, then greedy decoding
            scores = model.forward(input_batch.T, output_batch.T, 0)
            probabilities = nn.Softmax(dim=2)(scores)
            predicted = torch.argmax(probabilities, dim=2).T

            # Decode every example of the batch
            for row in range(len(output_batch)):
                word_inp = decode(input_batch[row], eng_vocab)
                word_pred = decode(predicted[row], tel_vocab)
                word_res = decode(output_batch[row], tel_vocab)
                triples.append((word_inp, word_pred, word_res))

    return triples

def save_to_csv(results, filename):
    """Save prediction results to a UTF-8 CSV file.

    Fields containing commas, quotes or line breaks are quoted by the ``csv``
    module so the file stays parseable; plain fields are written exactly as
    before.

    Args:
        results: Iterable of ``(source, predicted, target)`` string triples.
        filename: Destination path; an existing file is overwritten.
    """
    import csv  # local import: the file's import block does not include csv

    with open(filename, 'w', encoding='utf-8') as f:
        # "\n" keeps the line endings identical to plain f.write(...) calls
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["Source", "Predicted", "Target"])
        for src, pred, tgt in results:
            writer.writerow([src, pred, tgt])

Training and Evaluating the models

In [None]:
def train_and_evaluate(cell_type, bi_directional_bit, embedding_size, enc_dropout,
                     dec_dropout, enc_layers, dec_layers, hidden_size, batch_size,
                     attention_bit, learning_rate, max_epochs, language='tel',
                     use_wandb=False):
    """Build, train and evaluate one encoder-decoder configuration.

    The vocabularies (``eng_vocab``, ``tel_vocab``) and the index matrices
    (``eng_matrix_*``, ``tel_matrix_*``) are read from module-level globals;
    ``language`` is only recorded in the wandb config. After training, the
    test predictions are written to ``predictions_<cell_type>_<attention_bit>.csv``.

    Args:
        cell_type: ``"RNN"``, ``"GRU"`` or ``"LSTM"``.
        bi_directional_bit: Whether the encoder is bidirectional.
        embedding_size: Embedding width for encoder and decoder.
        enc_dropout, dec_dropout: Dropout rates for encoder and decoder.
        enc_layers, dec_layers: Number of recurrent layers.
        hidden_size: Recurrent hidden width.
        batch_size: Rows per batch; only full batches are used.
        attention_bit: Use ``AttentionDecoder`` instead of ``Decoder``.
        learning_rate: Adam learning rate.
        max_epochs: Number of passes over the training data.
        language: Label stored in the wandb config.
        use_wandb: Start a wandb run and log per-epoch metrics.

    Returns:
        ``(model, (train_acc, valid_acc, test_acc), test_results)`` with the
        accuracies of the last epoch (all ``0.0`` if ``max_epochs`` is 0).

    Raises:
        ValueError: if the training set has fewer than ``batch_size`` rows,
            so no full batch can be formed.
    """
    # Fail early instead of dividing by a zero step count after the epoch
    batch_count = int(len(eng_matrix_train) / batch_size)
    if batch_count == 0:
        raise ValueError(
            f"batch_size={batch_size} exceeds the number of training examples "
            f"({len(eng_matrix_train)}); no full batch can be formed"
        )

    # Initialize wandb if requested
    if use_wandb:
        run_name = f"{cell_type}_{enc_layers}l_{embedding_size}e_{hidden_size}h_" \
                  f"{'attn' if attention_bit else 'no_attn'}_" \
                  f"{'bid' if bi_directional_bit else 'uni'}"

        wandb.init(
            project="DL_assignment_3",
            name=run_name,
            config={
                "cell_type": cell_type,
                "bi_directional": bi_directional_bit,
                "embedding_size": embedding_size,
                "enc_dropout": enc_dropout,
                "dec_dropout": dec_dropout,
                "enc_layers": enc_layers,
                "dec_layers": dec_layers,
                "hidden_size": hidden_size,
                "batch_size": batch_size,
                "attention": attention_bit,
                "learning_rate": learning_rate,
                "max_epochs": max_epochs,
                "language": language
            }
        )

    # Model dimensions: ids 1..len(vocab) are characters, 0 is the
    # padding/end filler and len(vocab) + 1 is the start token
    enc_input_size = len(eng_vocab) + 2  # +2 for special tokens
    dec_input_size = len(tel_vocab) + 2
    output_size = len(tel_vocab) + 2

    # Create encoder
    encoder_section = Encoder(
        enc_input_size, embedding_size, enc_layers, hidden_size,
        cell_type, bi_directional_bit, enc_dropout, batch_size
    ).to(device=device)

    # Create decoder (with or without attention)
    if attention_bit:
        decoder_section = AttentionDecoder(
            dec_input_size, embedding_size, hidden_size, output_size,
            cell_type, dec_layers, dec_dropout, bi_directional_bit
        ).to(device=device)
    else:
        decoder_section = Decoder(
            dec_input_size, embedding_size, hidden_size, dec_layers,
            dec_dropout, cell_type, output_size
        ).to(device=device)

    # Create sequence-to-sequence model
    model = Seq2Seq(
        decoder_section, encoder_section, cell_type,
        bi_directional_bit, enc_layers, dec_layers
    ).to(device=device)

    # Create optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Loss function. The ignored index is the start-token id, which never
    # occurs in the shifted targets, so the zero filler is trained on as
    # well; exact-match accuracy compares those positions too.
    sos_index = len(tel_vocab) + 1
    loss_criterion = nn.CrossEntropyLoss(ignore_index=sos_index)

    # Defined up front so the return value exists even when max_epochs is 0
    train_acc = valid_acc = test_acc = 0.0

    # Main training loop
    print(f"Starting training for {max_epochs} epochs")

    for epoch in range(max_epochs):
        print(f"Epoch: {epoch+1}/{max_epochs}")

        # Set to training mode
        model.train()
        total_loss = 0
        step = 0

        # Training batches with progress bar
        progress_bar = tqdm(range(batch_count), desc=f"Training {epoch+1}")

        for batch_id in progress_bar:
            # Get batch data
            inp_word = eng_matrix_train[batch_size * batch_id:batch_size * (batch_id + 1)].to(device=device)
            out_word = tel_matrix_train[batch_size * batch_id:batch_size * (batch_id + 1)].to(device=device)

            # Transpose for sequence-first format
            out_word = out_word.T
            inp_word = inp_word.T

            # Forward pass
            output = model(inp_word, out_word)

            # Calculate loss (skip first token which is SOS)
            output = output[1:].reshape(-1, output.shape[2])
            out_word = out_word[1:].reshape(-1)

            # Zero gradients
            optimizer.zero_grad()

            # Compute loss
            loss = loss_criterion(output, out_word)
            total_loss += loss.item()

            # Backpropagation
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Update weights
            optimizer.step()

            # Update progress bar
            progress_bar.set_postfix(loss=loss.item())
            step += 1

        # Calculate epoch average loss (printed under the "Total loss" label)
        avg_loss = total_loss / step
        print(f"Total loss: {avg_loss:.4f}")

        # Evaluate on train, validation, and test sets
        train_acc = accuracy_fun(eng_matrix_train, tel_matrix_train, batch_size, model)
        valid_acc = accuracy_fun(eng_matrix_valid, tel_matrix_valid, batch_size, model)
        test_acc = accuracy_fun(eng_matrix_test, tel_matrix_test, batch_size, model)

        print(f"Train accuracy: {train_acc:.2f}%")
        print(f"Valid accuracy: {valid_acc:.2f}%")
        print(f"Test accuracy: {test_acc:.2f}%")

        # Log to wandb if enabled
        if use_wandb:
            wandb.log({
                'epoch': epoch + 1,
                'loss': avg_loss,
                'train_accuracy': train_acc,
                'valid_accuracy': valid_acc,
                'test_accuracy': test_acc
            })

    # Generate and save predictions
    test_results = vectors_to_actual_words(
        model, eng_matrix_test, tel_matrix_test, batch_size,
        eng_vocab, tel_vocab, 'Test'
    )
    save_to_csv(test_results, f"predictions_{cell_type}_{attention_bit}.csv")

    # Close wandb run if used
    if use_wandb:
        wandb.finish()

    return model, (train_acc, valid_acc, test_acc), test_results

Training the models and comparing results

In [None]:
# Configuration for model without attention
config_no_attention = {
    'cell_type': 'GRU',
    'bi_directional_bit': True,
    'embedding_size': 256,
    'enc_dropout': 0.2,
    'dec_dropout': 0.2,
    'enc_layers': 2,
    'dec_layers': 2,
    'hidden_size': 512,
    'batch_size': 64,
    'attention_bit': False,
    'learning_rate': 0.001,
    'max_epochs': 10,
    'language': 'tel',
    'use_wandb': False  # Set to True to log to wandb
}

# Train model without attention
no_attention_model, no_attention_accuracies, no_attention_results = train_and_evaluate(**config_no_attention)

print("\nFinal results without attention:")
print(f"Train accuracy: {no_attention_accuracies[0]:.2f}%")
print(f"Valid accuracy: {no_attention_accuracies[1]:.2f}%")
print(f"Test accuracy: {no_attention_accuracies[2]:.2f}%")

# Configuration for model with attention: same hyperparameters with attention
# switched on. The encoder is unidirectional here so that the encoder outputs
# the decoder attends over are hidden_size wide.
config_attention = {
    **config_no_attention,
    'attention_bit': True,
    'bi_directional_bit': False,
}

# Train model with attention (the comparison chart below needs its accuracies)
attention_model, attention_accuracies, attention_results = train_and_evaluate(**config_attention)

print("\nFinal results with attention:")
print(f"Train accuracy: {attention_accuracies[0]:.2f}%")
print(f"Valid accuracy: {attention_accuracies[1]:.2f}%")
print(f"Test accuracy: {attention_accuracies[2]:.2f}%")

# Create a bar chart to compare accuracies
labels = ['Train', 'Valid', 'Test']
no_attention_accs = [no_attention_accuracies[0], no_attention_accuracies[1], no_attention_accuracies[2]]
attention_accs = [attention_accuracies[0], attention_accuracies[1], attention_accuracies[2]]

x = np.arange(len(labels))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
rects1 = ax.bar(x - width/2, no_attention_accs, width, label='Without Attention')
rects2 = ax.bar(x + width/2, attention_accs, width, label='With Attention')

ax.set_ylabel('Accuracy (%)')
ax.set_title('Seq2Seq Model Accuracy: With vs. Without Attention')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

# Add value labels
def autolabel(rects):
    """Write each bar's height as a percentage just above the bar."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.2f}%',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

plt.tight_layout()
plt.show()

Hyperparameter sweep

In [None]:
def run_hyperparameter_sweep(with_attention=True):
    """Define a wandb Bayesian hyperparameter sweep for the transliteration model.

    The sweep configuration and its objective are built here; the two lines
    that register the sweep and start the agent are left commented out, so
    calling this function as-is has no effect.

    Args:
        with_attention: Train the attention decoder (``True``) or the plain
            decoder (``False``) in every trial.
    """
    sweep_name = 'Transliteration_with_Attention' if with_attention else 'Transliteration_without_Attention'

    # Define sweep configuration
    sweep_cfg = {
        'method': 'bayes',  # Use Bayesian optimization
        'name': sweep_name,
        'metric': {'name': 'val_acc', 'goal': 'maximize'},
        'parameters': {
            # Model architecture
            'emb_size': {'values': [128, 256, 512]},
            'hidden_size': {'values': [128, 256, 512, 1024]},
            'enc_layers': {'values': [1, 2, 3, 4]},
            'cell': {'values': ['RNN', 'GRU', 'LSTM']},
            'bidirectional': {'values': [True, False]},  # Bidirectional encoder

            # Training parameters
            'dropout': {'values': [0.0, 0.1, 0.2, 0.3, 0.5]},
            'lr': {'values': [1e-4, 2e-4, 5e-4, 8e-4, 1e-3]},
            'batch_size': {'values': [32, 64, 128]},
            'epochs': {'values': [10, 15, 20]},
            # NOTE: 'teacher_forcing' and 'optimizer' are sampled by the sweep
            # but are not forwarded to train_and_evaluate by the objective.
            'teacher_forcing': {'values': [0.3, 0.5, 0.7, 1.0]},
            'optimizer': {'values': ['Adam', 'NAdam']},
            # Reproducibility
            'seed': {'values': [42, 43, 44, 45, 46]},  # Different seeds for robustness
        }
    }

    # Define the objective function for sweep
    def sweep_objective():
        """Train one sampled configuration and report its accuracies."""
        run = wandb.init()
        config = run.config

        # Set seed for reproducibility
        seed_everything(config.seed)

        # Train model with this configuration; the same dropout rate and
        # layer count are used for encoder and decoder.
        _, (train_acc, valid_acc, test_acc), _ = train_and_evaluate(
            config.cell,
            config.bidirectional,
            config.emb_size,
            config.dropout,
            config.dropout,
            config.enc_layers,
            config.enc_layers,
            config.hidden_size,
            config.batch_size,
            with_attention,
            config.lr,
            config.epochs
        )

        # Report the metric the sweep optimises ('val_acc'); without this
        # the Bayesian search has nothing to maximise.
        run.log({
            'val_acc': valid_acc,
            'train_acc': train_acc,
            'test_acc': test_acc,
        })
        run.finish()

    # Initialize sweep
    entity = 'cs24m045-indian-institute-of-technology-madras'  # Replace with your wandb entity
    project = 'DA6401-Assignment-3'

    # Start sweep (uncomment to run)
    # sweep_id = wandb.sweep(sweep_cfg, entity=entity, project=project)
    # wandb.agent(sweep_id, function=sweep_objective, count=20)

# Uncomment to run hyperparameter sweeps
# run_hyperparameter_sweep(with_attention=True)   # For attention model
# run_hyperparameter_sweep(with_attention=False)  # For no-attention model