In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader 

from tqdm import tqdm
import heapq
import csv

import numpy as np
import random
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import pandas as pd
import wandb

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
def data_load(path):
    # input - English
    # output - Urdu
    df = pd.read_csv(path,sep='\t', header=None)
    input_data = df[1].tolist()
    output_data = df[0].tolist()
    return input_data, output_data
    
def create_char_set(train, val):
    char_set = set()
    for word in train:
        for char in str(word):
            char_set.add(char)
    for word in val:
        for char in str(word):
            char_set.add(char)
    return char_set

def check_for_floats(data_list):
    float_values = []
    for item in data_list:
        if isinstance(item, float):
            float_values.append(item)
    return float_values

In [3]:
train_input1, train_output = data_load("/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/ur/lexicons/ur.translit.sampled.train.tsv")
val_input, val_output = data_load("/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/ur/lexicons/ur.translit.sampled.dev.tsv")
test_input, test_output = data_load("/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/ur/lexicons/ur.translit.sampled.test.tsv") 

print("Number of training samples: ", len(train_input1))
print("Number of validation samples: ", len(val_input))
print("Number of test samples: ", len(test_input))


Number of training samples:  106260
Number of validation samples:  10424
Number of test samples:  10517


In [4]:
eng_chars = create_char_set(train_input1, val_input)
print("Total English characters: ",len(eng_chars))
print(sorted(eng_chars))


Total English characters:  26
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
ur_chars = create_char_set(train_output, val_output)
print("Total Urdu characters: ",len(ur_chars))
print(sorted(ur_chars))


Total Urdu characters:  54
['ء', 'آ', 'ؤ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'ً', 'َ', 'ُ', 'ِ', 'ّ', 'ٗ', 'ٰ', 'ٹ', 'پ', 'چ', 'ڈ', 'ڑ', 'ژ', 'ک', 'گ', 'ں', 'ھ', 'ہ', 'ۃ', 'ی', 'ے', 'ۓ']


In [6]:
float_values=check_for_floats(train_input1)
print(float_values)

[nan, nan]


In [7]:
train_input = [x for x in train_input1 if not isinstance(x,float)]
print("Number of training samples: ", len(train_input))

Number of training samples:  106258


In [8]:
max_seq_eng = len(max(train_input+val_input+test_input, key=len))
max_seq_ur = len(max(train_output+val_output+test_output, key=len))
print("Length of the longest English word in corpus:",max_seq_eng)
print("Length of the longest Urdu word in corpus::",max_seq_ur)


Length of the longest English word in corpus: 21
Length of the longest Urdu word in corpus:: 14


In [9]:
"""eng_chars_idx = {char: idx + 3 for idx, char in enumerate(sorted(eng_chars))}
eng_chars_idx['0'] = 0 # padding
eng_chars_idx['\t'] = 1 # <SOW>
eng_chars_idx['\n'] = 2 # <EOW>
print(eng_chars_idx)
ur_chars_idx = {char: idx+3 for idx, char in enumerate(sorted(ur_chars))}
ur_chars_idx['0'] = 0 # padding
ur_chars_idx['\t'] = 1 # <SOW>
ur_chars_idx['\n'] = 2 # <EOW>
print(ur_chars_idx)
"""

"eng_chars_idx = {char: idx + 3 for idx, char in enumerate(sorted(eng_chars))}\neng_chars_idx['0'] = 0 # padding\neng_chars_idx['\t'] = 1 # <SOW>\neng_chars_idx['\n'] = 2 # <EOW>\nprint(eng_chars_idx)\nur_chars_idx = {char: idx+3 for idx, char in enumerate(sorted(ur_chars))}\nur_chars_idx['0'] = 0 # padding\nur_chars_idx['\t'] = 1 # <SOW>\nur_chars_idx['\n'] = 2 # <EOW>\nprint(ur_chars_idx)\n"

In [10]:
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd

def load_data(input_data, output_data, batch_size=32):
    """
    Prepares character-level transliteration data for sequence-to-sequence modeling.

    Args:
        input_data (list or Series): Romanized source words (e.g., Urdu in Latin script).
        output_data (list or Series): Target words in Devanagari/Urdu script.
        batch_size (int): Batch size for the DataLoader.

    Returns:
        dataset (Dataset): Custom PyTorch dataset for transliteration.
        dataloader (DataLoader): DataLoader for iterating over the dataset.
        input_vocab (dict): Character-to-index mapping for input characters.
        target_vocab (dict): Character-to-index mapping for target characters.
        max_input_len (int): Maximum input sequence length.
        max_target_len (int): Maximum target sequence length.
    """

    # Determine the maximum lengths for padding
    max_input_len = len(max(input_data, key=len))
    max_target_len = len(max(output_data, key=len))

    # Initialize character vocabularies with special tokens
    input_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
    target_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
    next_index = 3

    # Extract all unique characters from the dataset
    all_input_chars = ''.join(input_data)
    all_target_chars = ''.join(output_data)

    # Create input vocabulary
    for char in sorted(set(all_input_chars)):
        input_vocab[char] = next_index
        next_index += 1

    # Reset index for building target vocabulary
    next_index = 3
    for char in sorted(set(all_target_chars)):
        if char not in target_vocab:
            target_vocab[char] = next_index
            next_index += 1

    # Tokenize input characters and pad to max_input_len
    def tokenize_input(word, vocab, max_len):
        token_ids = [vocab[char] for char in word if char in vocab]
        padded = token_ids[:max_len] + [vocab['<pad>']] * (max_len - len(token_ids))
        return torch.tensor(padded)

    # Tokenize target characters with <sos> and <eos>, then pad
    def tokenize_target(word, vocab, max_len):
        token_ids = [vocab[char] for char in word if char in vocab]
        padded = [vocab['<sos>']] + token_ids[:max_len] + [vocab['<eos>']]
        padded += [vocab['<pad>']] * (max_len + 2 - len(padded))  # +2 for <sos> and <eos>
        return torch.tensor(padded)

    # Define a custom dataset for transliteration pairs
    class TransliterationDataset(Dataset):
        def __init__(self, input_words, target_words, input_vocab, target_vocab, max_input_len, max_target_len):
            self.input_words = input_words
            self.target_words = target_words
            self.input_vocab = input_vocab
            self.target_vocab = target_vocab
            self.max_input_len = max_input_len
            self.max_target_len = max_target_len

        def __len__(self):
            return len(self.input_words)

        def __getitem__(self, idx):
            input_word = self.input_words[idx]
            target_word = self.target_words[idx]

            # Convert to tensor of indices
            input_tensor = tokenize_input(input_word, self.input_vocab, self.max_input_len)
            target_tensor = tokenize_target(target_word, self.target_vocab, self.max_target_len)

            return input_tensor, target_tensor

    # Create dataset and dataloader
    dataset = TransliterationDataset(input_data, output_data,
                                     input_vocab, target_vocab,
                                     max_input_len, max_target_len)

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    return dataset, dataloader, input_vocab, target_vocab, max_input_len, max_target_len


In [11]:
dataset, dataloader, input_vocab, target_vocab, max_input_len, max_target_len = load_data(train_input+val_input+test_input, train_output+val_output+test_output,batch_size = 64)
print('All English Characters:\n',input_vocab,'\n All Urdu Characters:\n', target_vocab,'\n Length of  longest English word:', max_input_len,'\n Length of  longest Urdu word:', max_target_len) 

All English Characters:
 {'<pad>': 0, '<sos>': 1, '<eos>': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28} 
 All Urdu Characters:
 {'<pad>': 0, '<sos>': 1, '<eos>': 2, 'ء': 3, 'آ': 4, 'ؤ': 5, 'ئ': 6, 'ا': 7, 'ب': 8, 'ت': 9, 'ث': 10, 'ج': 11, 'ح': 12, 'خ': 13, 'د': 14, 'ذ': 15, 'ر': 16, 'ز': 17, 'س': 18, 'ش': 19, 'ص': 20, 'ض': 21, 'ط': 22, 'ظ': 23, 'ع': 24, 'غ': 25, 'ف': 26, 'ق': 27, 'ك': 28, 'ل': 29, 'م': 30, 'ن': 31, 'ه': 32, 'و': 33, 'ي': 34, 'ً': 35, 'َ': 36, 'ُ': 37, 'ِ': 38, 'ّ': 39, 'ٗ': 40, 'ٰ': 41, 'ٹ': 42, 'پ': 43, 'چ': 44, 'ڈ': 45, 'ڑ': 46, 'ژ': 47, 'ک': 48, 'گ': 49, 'ں': 50, 'ھ': 51, 'ہ': 52, 'ۃ': 53, 'ی': 54, 'ے': 55, 'ۓ': 56} 
 Length of  longest English word: 21 
 Length of  longest Urdu word: 14


## Seq2Seq Model

In [12]:
# Encoder: Encodes input sequence into context vector
class Encoder(nn.Module):
    def __init__(self, input_vocab_size, hidden_size, embedding_dim,
                 num_layers=1, dropout=0.5, cell_type='gru', bidirectional=False):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell_type = cell_type
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(input_vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)

        rnn_cls = {'lstm': nn.LSTM, 'gru': nn.GRU, 'rnn': nn.RNN}[cell_type]
        self.rnn = rnn_cls(
            embedding_dim, hidden_size, num_layers,
            dropout=dropout, bidirectional=bidirectional, batch_first=True
        )

    def forward(self, input_seq):
        # Embed and apply dropout
        embedded = self.dropout(self.embedding(input_seq))

        # Pass through RNN
        rnn_output, hidden_state = self.rnn(embedded)

        # Handle LSTM: return both hidden and cell state
        if self.cell_type == 'lstm':
            hidden, cell = hidden_state
            if self.bidirectional:
                # Combine forward and backward hidden states by summing
                return torch.sum(hidden[-2:], dim=0, keepdim=True), torch.sum(cell[-2:], dim=0, keepdim=True)
            else:
                return hidden[-1].unsqueeze(0), cell[-1].unsqueeze(0)
        else:
            # GRU or RNN
            if self.bidirectional:
                return torch.sum(hidden_state[-2:], dim=0, keepdim=True)
            else:
                return hidden_state[-1].unsqueeze(0)


# Decoder: Generates output sequence using encoder context
class Decoder(nn.Module):
    def __init__(self, hidden_size, embedding_dim, output_vocab_size,
                 num_layers=1, dropout=0.5, cell_type='gru'):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell_type = cell_type

        self.embedding = nn.Embedding(output_vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)

        rnn_cls = {'lstm': nn.LSTM, 'gru': nn.GRU, 'rnn': nn.RNN}[cell_type]
        self.rnn = rnn_cls(
            embedding_dim, hidden_size, num_layers,
            dropout=dropout, batch_first=True
        )

        self.fc = nn.Linear(hidden_size, output_vocab_size)

    def forward(self, input_token, hidden_state):
        # Add time dimension for RNN
        input_token = input_token.unsqueeze(1)

        # Embed input token and apply dropout
        embedded = self.dropout(self.embedding(input_token))

        # Pass through decoder RNN
        rnn_output, new_hidden_state = self.rnn(embedded, hidden_state)

        # Predict next token from output
        logits = self.fc(rnn_output)

        return logits.squeeze(1), new_hidden_state


# Sequence-to-Sequence: Combines encoder and decoder for full translation
class Seq2Seq(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, hidden_size, embedding_dim,
                 encoder_layers=1, decoder_layers=1, dropout=0.3, cell_type='gru', bidirectional=True):
        super(Seq2Seq, self).__init__()
        self.encoder = Encoder(input_vocab_size, hidden_size, embedding_dim,
                               encoder_layers, dropout, cell_type, bidirectional)
        self.decoder = Decoder(hidden_size, embedding_dim, output_vocab_size,
                               decoder_layers, dropout, cell_type)
        self.output_vocab_size = output_vocab_size

    def forward(self, input_seq, target_seq, teacher_forcing_ratio=0.5):
        batch_size = input_seq.size(0)
        target_seq_len = target_seq.size(1)

        outputs = torch.zeros(batch_size, target_seq_len, self.output_vocab_size).to(input_seq.device)

        # Encode the input sequence
        encoder_hidden = self.encoder(input_seq)

        # Initialize decoder hidden state from encoder
        decoder_hidden = self._init_decoder_hidden(encoder_hidden)

        # First decoder input is <sos>
        decoder_input = target_seq[:, 0]

        for t in range(1, target_seq_len):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            outputs[:, t] = decoder_output

            # Teacher forcing: use actual target or model prediction
            use_teacher_forcing = torch.rand(1).item() < teacher_forcing_ratio
            next_input = target_seq[:, t] if use_teacher_forcing else decoder_output.argmax(1)
            decoder_input = next_input

        return outputs

    def _init_decoder_hidden(self, encoder_hidden):
        """Adjust encoder output to match decoder initial state shape."""
        decoder_layers = self.decoder.num_layers

        if self.decoder.cell_type == 'lstm':
            hidden, cell = encoder_hidden
            hidden_layers = hidden.shape[0]

            # Pad or trim hidden/cell states to match decoder's expected layers
            if hidden_layers < decoder_layers:
                pad_size = decoder_layers - hidden_layers
                hidden = torch.cat([hidden, torch.zeros(pad_size, *hidden.shape[1:], device=hidden.device)], dim=0)
                cell = torch.cat([cell, torch.zeros(pad_size, *cell.shape[1:], device=cell.device)], dim=0)
            else:
                hidden = hidden[:decoder_layers]
                cell = cell[:decoder_layers]
            return (hidden, cell)

        else:
            # GRU or RNN
            hidden_layers = encoder_hidden.shape[0]
            hidden = encoder_hidden
            if hidden_layers < decoder_layers:
                pad_size = decoder_layers - hidden_layers
                hidden = torch.cat([hidden, torch.zeros(pad_size, *hidden.shape[1:], device=hidden.device)], dim=0)
            else:
                hidden = hidden[:decoder_layers]
            return hidden


## Train and Evaluate

In [13]:
# Training function: Trains model for one epoch
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for latin_inputs, urdu_targets in dataloader:
        latin_inputs = latin_inputs.to(device)
        urdu_targets = urdu_targets.to(device)

        optimizer.zero_grad()
        output = model(latin_inputs, urdu_targets)
        output_dim = output.shape[-1]

        output_flat = output.view(-1, output_dim)
        targets_flat = urdu_targets.view(-1)

        loss = criterion(output_flat, targets_flat)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        # Compute accuracy (character-level)
        _, predictions = torch.max(output, dim=2)
        correct = (predictions == urdu_targets).sum().item()
        total_correct += correct
        total_samples += urdu_targets.size(0)

    accuracy = total_correct / total_samples
    return model, total_loss / len(dataloader), accuracy



# Evaluation function: Evaluates model performance on validation/test data
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch_latin, batch_urdu in dataloader:
            batch_latin = batch_latin.to(device)
            batch_urdu = batch_urdu.to(device)

            # Forward pass with no teacher forcing during evaluation
            predictions = model(batch_latin, batch_urdu, teacher_forcing_ratio=0.0)
            vocab_size = predictions.shape[-1]

            # Compute loss
            loss = criterion(predictions.view(-1, vocab_size), batch_urdu.view(-1))
            total_loss += loss.item()

            # Get predicted token indices
            _, predicted_indices = torch.max(predictions, dim=2)

            # Optional: Adjust predictions if vocab has special token offsets
            invalid_token_mask = predicted_indices > 9
            predicted_indices[invalid_token_mask] -= 2

            # Compare full sequences for exact word match
            correct_predictions = (predicted_indices == batch_urdu).sum().item()

            total_correct += correct_predictions
            total_samples += batch_urdu.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = (total_correct / total_samples)

    return avg_loss, accuracy


# Model configuration
input_vocab_size = 26         # Number of Latin script characters
output_vocab_size = 54        # Number of Urdu script characters
embedding_dim = 128           # Embedding dimension for both encoder and decoder
hidden_size = 128             # Hidden state size for RNN cells
encoder_layers = 3            # Number of layers in encoder RNN
decoder_layers = 2            # Number of layers in decoder RNN
cell_type = 'lstm'            # RNN cell type: 'rnn', 'gru', or 'lstm'
batch_size = 64               # Batch size during training
num_epochs = 20               # Total number of training epochs
dropout = 0.2                 # Dropout probability
learning_rate = 0.001         # Learning rate for optimizer
bidirectional = True          # Whether the encoder is bidirectional

# Initialize model, loss function, and optimizer
model = Seq2Seq(input_vocab_size, output_vocab_size, hidden_size, embedding_dim,
                encoder_layers, decoder_layers, dropout, cell_type, bidirectional)

print(model)


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(26, 128)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(128, 128, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(54, 128)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(128, 128, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=128, out_features=54, bias=True)
  )
)


In [14]:
wandb.login(key='53b259076c07d0811d73bf26bfef7437e04dbf66')

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mma23m013[0m ([33mma23m013-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [15]:
# Sweep configuration for hyperparameter optimization using Weights & Biases
sweep_config = {
    'method': 'bayes',  # Bayesian optimization for more efficient search
    'metric': {
        'name': 'val_accuracy',  # Metric to optimize
        'goal': 'maximize'       # Maximize validation accuracy
    },
    'parameters': {
        'embedding_size': {
            'values': [16, 32, 64, 128, 256]  # Size of embedding vectors
        },
        'dropout': {
            'values': [0.2, 0.3, 0.5]  # Dropout probability to reduce overfitting
        },
        'encoder_layers': {
            'values': [1, 2, 3]  # Number of RNN layers in the encoder
        },
        'decoder_layers': {
            'values': [1, 2, 3]  # Number of RNN layers in the decoder
        },
        'hidden_size': {
            'values': [16, 32, 64, 128, 256]  # Hidden state dimension in RNN
        },
        'rnn_cell_type': {
            'values': ['lstm', 'gru', 'rnn']  # RNN cell type to use
        },
        'use_bidirectional_encoder': {
            'values': [True, False]  # Whether encoder is bidirectional
        },
        'batch_size': {
            'values': [32, 64]  # Batch size during training
        },
        'num_epochs': {
            'values': [10, 12]  # Number of training epochs
        },
        'learning_rate': {
            'values': [0.01, 0.001]  # Learning rate for optimizer
        }
    }
}

# Launch sweep on Weights & Biases
sweep_id = wandb.sweep(sweep=sweep_config, project='DA6401_Assignment-3')


Create sweep with ID: rqyz759c
Sweep URL: https://wandb.ai/ma23m013-iit-madras/DA6401_Assignment-3/sweeps/rqyz759c


In [16]:
def run_training():
    '''
    This function is executed by WandB for each set of hyperparameters during the sweep.
    It initializes the model using the current configuration, trains and evaluates it,
    and logs the metrics to Weights & Biases.
    '''
    with wandb.init() as run:
        # Create a descriptive run name using the current hyperparameters
        run_name = (
            f"cell-{wandb.config.rnn_cell_type}"
            f"_encLayers-{wandb.config.encoder_layers}"
            f"_decLayers-{wandb.config.decoder_layers}"
            f"_dropout-{wandb.config.dropout}"
            f"_embedSize-{wandb.config.embedding_size}"
            f"_hiddenSize-{wandb.config.hidden_size}"
            f"_batchSize-{wandb.config.batch_size}"
            f"_epochs-{wandb.config.num_epochs}"
            f"_lr-{wandb.config.learning_rate}"
        )
        wandb.run.name = run_name

        # Instantiate model
        model = Seq2Seq(
            input_vocab_size=30,
            output_vocab_size=60,
            hidden_size=wandb.config.hidden_size,
            embedding_dim=wandb.config.embedding_size,
            encoder_layers=wandb.config.encoder_layers,
            decoder_layers=wandb.config.decoder_layers,
            dropout=wandb.config.dropout,
            cell_type=wandb.config.rnn_cell_type,
            bidirectional=wandb.config.use_bidirectional_encoder
        )
        print(model)

        # Setup
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        # Load data
        _, train_loader, _, _, _, _ = load_data(train_input,train_output, batch_size=wandb.config.batch_size)

        _, val_loader, _, _, _, _ = load_data(val_input,val_output, batch_size=wandb.config.batch_size)

        # Train & evaluate
        for epoch in range(wandb.config.num_epochs):
            model, train_loss, train_accuracy = train(model, train_loader, criterion, optimizer, device)
            val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)

            # Log metrics
            wandb.log({
                'Epoch': epoch,
                'Train Loss': train_loss,
                'Train Accuracy (%)': train_accuracy,
                'Validation Loss': val_loss,
                'Validation Accuracy (%)': val_accuracy
            })

            print(f"Epoch {epoch + 1}/{wandb.config.num_epochs} | "
                  f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.2f}% | "
                  f"Val Loss: {val_loss:.4f} | Val Accuracy: {val_accuracy:.2f}%")

# Start the sweep
wandb.agent(sweep_id, function=run_training, count=1)
wandb.finish()


[34m[1mwandb[0m: Agent Starting Run: cn31sxzl with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 128
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	num_epochs: 12
[34m[1mwandb[0m: 	rnn_cell_type: gru
[34m[1mwandb[0m: 	use_bidirectional_encoder: True
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(30, 128)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(128, 32, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(60, 128)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(128, 32, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=32, out_features=60, bias=True)
  )
)
Epoch 1/12 | Train Loss: 1.4040 | Train Accuracy: 10.46% | Val Loss: 2.1845 | Val Accuracy: 7.76%
Epoch 2/12 | Train Loss: 1.3741 | Train Accuracy: 10.57% | Val Loss: 3.4213 | Val Accuracy: 0.01%
Epoch 3/12 | Train Loss: 1.3403 | Train Accuracy: 10.64% | Val Loss: 2.2166 | Val Accuracy: 7.60%
Epoch 4/12 | Train Loss: 1.3445 | Train Accuracy: 10.60% | Val Loss: 2.1453 | Val Accuracy: 7.72%
Epoch 5/12 | Train Loss: 1.3271 | Train Accuracy: 10.64% | Val Loss: 3.3456 | Val Accuracy: 0.01%
Epoch 6/12 | Train Loss: 1.3305 | Train Accuracy: 10.62% | Val Los

0,1
Epoch,▁▂▂▃▄▄▅▅▆▇▇█
Train Accuracy (%),▁▅█▇█▇▇▇▇▅▅▅
Train Loss,█▅▂▃▁▁▃▂▂▄▄▄
Validation Accuracy (%),█▁██▁▁▇▇█▁▁█
Validation Loss,▁█▁▁▇▇▂▂▁▇█▁

0,1
Epoch,11.0
Train Accuracy (%),10.55283
Train Loss,1.36328
Validation Accuracy (%),8.01391
Validation Loss,2.18053
