## CS6910 Assignment 3
#### This code file contains all the classes and functions
#### needed to train the sequence-to-sequence model with an attention mechanism.
#### The reference sources that I used to write the code are listed in the README.md file.

## Importing the required libraries

In [1]:
import os
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import csv

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: in a notebook cell this displays the selected device.
device

device(type='cuda')

## Argparse

In [None]:
# Command-line interface for the training script.
# Every option has a default, so the script can be run without any arguments;
# argparse also generates the --help text automatically.
# The parsed values are copied into module-level names that the rest of the
# file reads.

import argparse


def _str2bool(value):
    """Convert a command-line string such as "true"/"False"/"1" to a bool.

    `type=bool` must not be used with argparse: bool("False") is True because
    any non-empty string is truthy, so the flag could never be switched off.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got {!r}".format(value))


parser = argparse.ArgumentParser(description="Stores all the hyperpamaters for the model.")
parser.add_argument("-wp", "--wandb_project", default="cs6910_assignment 3 new", type=str,
                    help="Enter the Name of your Wandb Project")
parser.add_argument("-we", "--wandb_entity", default="am22s020", type=str,
                    help="Wandb Entity used to track experiments in the Weights & Biases dashboard.")
parser.add_argument("-ws", "--wandb_sweep", default=False, type=_str2bool,
                    help="If you want to run wandb sweep then give True")
parser.add_argument("-e", "--epochs", default=1, type=int, choices=[1, 5, 10],
                    help="Number of epochs to train neural network.")
parser.add_argument("-hs", "--hidden_size", default=256, type=int,
                    help="no. of neurons in the hidden layer of the N/W")
parser.add_argument("-c", "--cell_type", default="lstm", type=str, choices=["lstm", "gru", "rnn"])
parser.add_argument("-nl", "--num_layers", default=2, type=int,
                    choices=[2, 3, 4], help="number of recurrent layers")
parser.add_argument("-ems", "--embedding_size", default=256, type=int, choices=[64, 128, 256])
parser.add_argument("-bd", "--bi_directional", default=True, type=_str2bool)

args = parser.parse_args()

wandb_project = args.wandb_project
wandb_entity = args.wandb_entity
wandb_sweep = args.wandb_sweep
num_epochs = args.epochs
hidden_size = args.hidden_size
cell_type = args.cell_type
num_layers = args.num_layers
embedding_size = args.embedding_size
bi_directional = args.bi_directional

print("wandb_project :", wandb_project , "wandb_entity: ", wandb_entity,"wandb_sweep: ",wandb_sweep,
      "epochs: ",num_epochs,"hidden_size: ",hidden_size, "cell_type: ", cell_type,
      "num_layers: ",num_layers,"embedding_size: ",embedding_size, 
      "bi_directional: ", bi_directional)

## Preparing the datasets

In [2]:
class Vocabulary():
    """
    Character-level vocabulary mapping characters to integer IDs and back.

    The vocabulary always starts with four special tokens:
    <PAD> (0), <SOW> (1), <EOW> (2) and <UNK> (3).

    Attributes are two plain dicts:
      itos: integer ID -> token
      stoi: token -> integer ID

    tokenizer(): splits a text into its individual characters.
    build_vocabulary(): adds every not-yet-known character of the given words,
    assigning the next free integer ID to each.
    numericalize(): converts a text into a list of IDs; characters that are
    not in the vocabulary map to the <UNK> ID.
    """
    def __init__(self):
        self.itos = {0:"<PAD>",1:"<SOW>",2:"<EOW>",3:"<UNK>"}
        self.stoi = {"<PAD>":0,"<SOW>":1,"<EOW>":2,"<UNK>":3}

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer(text):
        """Return the list of characters in `text`."""
        return list(text)

    def build_vocabulary(self, word_list):
        """Add every unseen character in `word_list` to the vocabulary.

        IDs are assigned in order of first appearance, starting right after
        the last ID already in use. Membership is checked against `self.stoi`,
        so calling this method more than once extends the vocabulary instead
        of overwriting previously assigned IDs.
        """
        for word in word_list:
            for char in self.tokenizer(word):
                if char not in self.stoi:
                    idx = len(self.itos)
                    self.stoi[char] = idx
                    self.itos[idx] = char

    def numericalize(self, text):
        """Return the list of IDs for the characters of `text` (<UNK> if unknown)."""
        unk_index = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_index) for token in self.tokenizer(text)]
                 

In [3]:
class aksharantar(Dataset):
    """
    Dataset of (latin word, target-language word) pairs for transliteration.

    root_dir: the root directory where the data is stored
    out_lang: the target language code; the file read is
              <root_dir>/<out_lang>/<out_lang>_<dataset_type>.csv
    dataset_type: the split name used in the file name (e.g. "train",
                  "valid" or "test")

    After loading the CSV, __init__() builds one character vocabulary per
    column from the words of THIS split (so IDs differ between splits).
    __getitem__() returns both words as LongTensors of character IDs, each
    wrapped in <SOW> ... <EOW>.
    """

    def __init__(self, root_dir, out_lang, dataset_type): 

        # Read file
        self.file_name = out_lang + "_" + dataset_type + ".csv"
        self.file_dir = os.path.join(root_dir, out_lang, self.file_name)
        # dtype=str / keep_default_na=False: keep every cell as text. With the
        # pandas defaults, words such as "nan", "null" or "NA" are parsed as a
        # float NaN, which cannot be split into characters.
        self.df = pd.read_csv(self.file_dir, names = ["latin", "hindi"],
                              dtype=str, keep_default_na=False)

        # Get columns of input and output language
        self.latin = self.df["latin"]
        self.hindi = self.df["hindi"]

        # Vocabulary of the input (latin) column
        self.vocab_eng = Vocabulary()
        self.vocab_eng.build_vocabulary(self.latin.tolist())

        # Vocabulary of the output column
        self.vocab_hin = Vocabulary()
        self.vocab_hin.build_vocabulary(self.hindi.tolist())

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return (latin_ids, target_ids) tensors for row `index`."""
        latin = self.latin[index]
        hindi = self.hindi[index]

        numericalized_hindi = [self.vocab_hin.stoi["<SOW>"]]
        numericalized_hindi += self.vocab_hin.numericalize(hindi)
        numericalized_hindi.append(self.vocab_hin.stoi["<EOW>"])

        numericalized_latin = [self.vocab_eng.stoi["<SOW>"]]
        numericalized_latin += self.vocab_eng.numericalize(latin)
        numericalized_latin.append(self.vocab_eng.stoi["<EOW>"])

        return torch.tensor(numericalized_latin), torch.tensor(numericalized_hindi) 
               
        

In [4]:
class MyCollate:
    """
    Collate callable for DataLoader: pads a list of (input, target) tensor
    pairs into two batch tensors.

    pad_idx_eng: padding value used for the input sequences.
    pad_idx_hin: padding value used for the target sequences.

    Both results are padded with batch_first=False, i.e. they have shape
    (longest_sequence_in_batch, batch_size).
    """
    def __init__(self, pad_idx_eng, pad_idx_hin):
        self.pad_idx_eng = pad_idx_eng
        self.pad_idx_hin = pad_idx_hin

    def __call__(self, batch):
        # Split the samples into their source and target halves.
        sources = []
        destinations = []
        for sample in batch:
            sources.append(sample[0])
            destinations.append(sample[1])

        padded_sources = pad_sequence(
            sources, batch_first=False, padding_value=self.pad_idx_eng)
        padded_destinations = pad_sequence(
            destinations, batch_first=False, padding_value=self.pad_idx_hin)

        return padded_sources, padded_destinations
        
        

In [5]:
def get_loader(root_dir, out_lang, dataset_type, batch_size, pin_memory=True ):
    """
    Build the dataset for one split and a shuffling DataLoader over it.

    Batches are padded by MyCollate using the <PAD> index of the dataset's
    input and output vocabularies.

    Returns:
        (loader, dataset): the DataLoader and the underlying aksharantar dataset.
    """
    dataset = aksharantar(root_dir, out_lang, dataset_type)

    collate = MyCollate(
        pad_idx_eng=dataset.vocab_eng.stoi["<PAD>"],
        pad_idx_hin=dataset.vocab_hin.stoi["<PAD>"],
    )

    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        pin_memory=pin_memory,
        collate_fn=collate,
        shuffle=True,
    )
    return loader, dataset


## Getting the model Ready

In [6]:
class Encoder(nn.Module):
    """
    Encoder of the sequence-to-sequence model.

    Arguments:
        input_size: number of distinct input tokens (size of the embedding table).
        embedding_size: dimension of each token embedding.
        hidden_size: hidden size of the recurrent layer.
        num_layers: number of stacked recurrent layers.
        p: dropout probability (applied to the embeddings and between layers).
        cell_type: 'gru', 'lstm' or 'rnn' - which recurrent layer forward() uses.
        bi_directional: whether the recurrent layer is bidirectional.

    forward(x) embeds x, applies dropout and runs the selected recurrent
    layer. It returns (encoder_states, hidden) for 'gru'/'rnn' and
    (encoder_states, hidden, cell) for 'lstm'.

    Raises:
        ValueError: if cell_type is not one of 'gru', 'lstm', 'rnn'.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, cell_type, bi_directional):
        super(Encoder, self).__init__()
        # Fail early: an unknown cell type used to make forward() return None.
        if cell_type not in ('gru', 'lstm', 'rnn'):
            raise ValueError(
                "cell_type must be 'gru', 'lstm' or 'rnn', got {!r}".format(cell_type))

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell_type = cell_type
        self.dropout = nn.Dropout(p)

        self.embedding = nn.Embedding(input_size, embedding_size)
        # NOTE: all three recurrent layers are instantiated, although only the
        # one matching cell_type is ever used in forward().
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers, dropout=p, bidirectional=bi_directional)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p,bidirectional=bi_directional)
        self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, dropout=p,bidirectional=bi_directional)

    def forward(self, x):
        # x, shape=(seq_length, N)
        embedding = self.dropout(self.embedding(x))
        # embedding shape = (seq_length, N, embedding_size)

        if self.cell_type == 'lstm':
            encoder_states, (hidden, cell) = self.lstm(embedding)
            return encoder_states, hidden, cell

        if self.cell_type == 'gru':
            encoder_states, hidden = self.gru(embedding)
            return encoder_states, hidden

        if self.cell_type == 'rnn':
            encoder_states, hidden = self.rnn(embedding)
            return encoder_states, hidden

        raise ValueError(
            "cell_type must be 'gru', 'lstm' or 'rnn', got {!r}".format(self.cell_type))
        

In [7]:
class Decoder(nn.Module):
    """
    Attention decoder of the sequence-to-sequence model (one step per call).

    Arguments:
        input_size: number of distinct target tokens (embedding table size).
        embedding_size: base embedding dimension (the embedding is twice as
            wide when the decoder is unidirectional).
        hidden_size: hidden size of the recurrent layer.
        output_size: number of output classes of the final linear layer.
        num_layers: number of stacked recurrent layers.
        p: dropout probability.
        cell_type: 'gru', 'lstm' or 'rnn'.
        bi_directional: whether encoder/decoder recurrent layers are bidirectional.

    forward(x, encoder_states, hidden, cell):
        1. adds a leading length-1 sequence dimension to x and embeds it;
        2. scores every encoder position against a projection of the first
           two entries of `hidden`, and softmaxes the scores over positions;
        3. feeds [context vector ; embedding] to the selected recurrent layer;
        4. projects the recurrent output to `output_size` logits.
    It returns (predictions, hidden, cell) for 'lstm' and
    (predictions, hidden) otherwise; predictions has shape (N, output_size).
    `cell` is ignored unless cell_type is 'lstm'.
    """
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers,
                 p, cell_type, bi_directional ):
        super(Decoder, self).__init__()
        if cell_type not in ('gru', 'lstm', 'rnn'):
            raise ValueError(
                "cell_type must be 'gru', 'lstm' or 'rnn', got {!r}".format(cell_type))

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell_type = cell_type
        self.dropout = nn.Dropout(p)
        # Projects the concatenation of hidden[0] and hidden[1] to hidden_size.
        self.fc_hidden = nn.Linear(2*hidden_size, hidden_size)

        # Width of one encoder state: hidden_size per direction.
        directions = 2 if bi_directional else 1
        encoder_state_size = hidden_size * directions

        # Attention score input: projected hidden state + one encoder state.
        self.energy = nn.Linear(hidden_size + encoder_state_size, 1)

        self.softmax = nn.Softmax(dim=0)  # normalise over source positions
        self.relu = nn.ReLU()

        decoder_embedding_size = embedding_size if bi_directional else embedding_size * 2
        self.embedding = nn.Embedding(input_size, decoder_embedding_size)

        # The recurrent input is [context vector ; embedding]. The
        # unidirectional case used to be hard-coded as 3*embedding_size, which
        # is only correct when hidden_size == embedding_size.
        rnn_input_size = encoder_state_size + decoder_embedding_size

        self.gru = nn.GRU(rnn_input_size, hidden_size, num_layers,
                          dropout=p, bidirectional=bi_directional)
        self.lstm = nn.LSTM(rnn_input_size, hidden_size, num_layers,
                            dropout=p, bidirectional=bi_directional)
        self.rnn = nn.RNN(rnn_input_size, hidden_size, num_layers,
                          dropout=p, bidirectional=bi_directional)

        self.fc = nn.Linear(encoder_state_size, output_size)

    def forward(self, x, encoder_states, hidden, cell):
        # x has shape (N); the recurrent layers expect (seq_len=1, N)
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape = (1, N, decoder embedding size)

        sequence_length = encoder_states.shape[0]
        # Query for attention: projection of the first two hidden-state entries.
        hidden1 = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
        h_reshaped = hidden1.repeat(sequence_length, 1, 1)

        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))

        attention = self.softmax(energy)
        # attention: (seq_length, N, 1)

        # Weighted sum of encoder states over the sequence axis -> (1, N, state size)
        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)

        rnn_input = torch.cat((context_vector, embedding), dim=2)
        # rnn_input: (1, N, encoder state size + decoder embedding size)

        if self.cell_type == 'lstm':
            outputs, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))
        elif self.cell_type == 'gru':
            outputs, hidden = self.gru(rnn_input, hidden)
        elif self.cell_type == 'rnn':
            outputs, hidden = self.rnn(rnn_input, hidden)
        else:
            raise ValueError(
                "cell_type must be 'gru', 'lstm' or 'rnn', got {!r}".format(self.cell_type))

        # fc gives (1, N, output_size); squeeze drops the length-1 axis -> (N, output_size)
        predictions = self.fc(outputs).squeeze(0)

        if self.cell_type == 'lstm':
            return predictions, hidden, cell
        else:
            return predictions, hidden
    

In [8]:
class Seq2Seq(nn.Module):

    """
    Full encoder-decoder model.

    Arguments:
        encoder: module whose call returns (encoder_states, hidden) or, for
            'lstm', (encoder_states, hidden, cell).
        decoder: module called once per target position; it must expose its
            output layer as `decoder.fc` (used to size the output tensor).
        cell_type: 'lstm', 'gru' or 'rnn'; only 'lstm' carries a cell state.

    forward(word_input, word_target, teacher_force_ratio) returns a tensor of
    shape (target_length, batch_size, output_size) holding the decoder logits
    for positions 1..target_length-1; position 0 is left as zeros.
    """
    def __init__(self, encoder, decoder, cell_type):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type

    def forward(self, word_input, word_target, teacher_force_ratio=0.5):

        batch_size = word_input.shape[1]
        target_length = word_target.shape[0]

        # Size and place the output buffer from the model and its inputs rather
        # than from module-level globals (train_data / device).
        output_size = self.decoder.fc.out_features
        outputs = torch.zeros(target_length, batch_size, output_size,
                              device=word_input.device)

        if self.cell_type == 'lstm':
            encoder_states, hidden, cell = self.encoder(word_input)
        else:
            encoder_states, hidden = self.encoder(word_input)

        # First decoder input: the start token row of the target
        x = word_target[0]

        for t in range(1, target_length):
            if self.cell_type == "lstm":
                output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
            else:
                # Non-LSTM decoders ignore the cell argument; 0 is a placeholder.
                output, hidden = self.decoder(x, encoder_states, hidden, 0)

            outputs[t] = output

            best_pred = output.argmax(1)

            # Teacher forcing: with probability teacher_force_ratio feed the
            # ground-truth token, otherwise the model's own best prediction.
            x = word_target[t] if random.random() < teacher_force_ratio else best_pred

        return outputs
    

## Functions to find accuracy and print and save outputs

In [9]:
def predict(model, input_list, cell_type, max_length=30):

    '''
    Greedily transliterate one word.

    model: trained Seq2Seq model (its own `cell_type` attribute decides
           whether a cell state is carried).
    input_list: list of input tokens (characters and special tokens).
    cell_type: unused; kept for backward compatibility with existing callers.
    max_length: maximum number of decoding steps.

    Returns the predicted tokens as a list that starts with "<SOW>" and ends
    with "<EOW>" unless max_length was reached first.
    '''

    # Tokens are indexed with the TRAINING vocabularies, because the
    # vocabularies of the val/test datasets assign different IDs.
    input_vocab = train_data.vocab_eng
    output_vocab = train_data.vocab_hin

    # Characters never seen in training map to <UNK> instead of raising KeyError.
    unk_index = input_vocab.stoi["<UNK>"]
    input_word = [input_vocab.stoi.get(char, unk_index) for char in input_list]
    input_word = torch.LongTensor(input_word)

    # Input word is of shape (seq_length) but we want it to be (seq_length, 1) where 1 represents batch size
    input_word = input_word.view(input_word.shape[0],1).to(device)

    start_token_index = 1
    end_token_index = 2

    # Start the output with the start token
    outputs = [start_token_index]

    with torch.no_grad():
        # Encoder
        if model.cell_type == "lstm":
            encoder_states, hidden, cell = model.encoder(input_word)
        else:
            encoder_states, hidden = model.encoder(input_word)

        # Decoder: feed back the previous prediction one step at a time
        for _ in range(max_length):
            prev_char = torch.LongTensor([outputs[-1]]).to(device)

            if model.cell_type == "lstm":
                output, hidden, cell = model.decoder(prev_char, encoder_states, hidden, cell)
            else:
                output, hidden = model.decoder(prev_char, encoder_states, hidden, 0)

            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the word
            if best_guess == end_token_index:
                break

    # Convert output IDs back to tokens
    prediction = [output_vocab.itos[index] for index in outputs]

    return prediction


In [10]:
def calculate_accuracy(model, dataset, cell_type):

    """
    Return the exact-match word accuracy (in percent) of `model` on `dataset`.

    Every word is decoded individually with predict(), so this can be slow.
    A prediction counts as correct only if its full token list (including
    <SOW>/<EOW>) equals the target token list.
    An empty dataset yields 0.0 instead of a division by zero.
    """
    # Number of words in the dataset
    words_count = len(dataset)
    if words_count == 0:
        return 0.0

    correct_count = 0

    for i in range(words_count):
        # Index the dataset once per word; each access re-numericalizes the row.
        input_ids, target_ids = dataset[i]

        char_input = [dataset.vocab_eng.itos[index] for index in input_ids.tolist()]
        prediction = predict(model, char_input, cell_type)
        actual_word = [dataset.vocab_hin.itos[index] for index in target_ids.tolist()]
        if prediction == actual_word:
            correct_count+=1

    return 100*(correct_count/words_count)

In [11]:
def _join_without_specials(tokens):
    """Join tokens into a string, dropping the leading start token and a trailing <EOW>."""
    body = tokens[1:]
    # Only strip the last token when it really is the end marker: a prediction
    # cut off at max_length has no <EOW>, and its last character must be kept.
    if body and body[-1] == "<EOW>":
        body = body[:-1]
    return ''.join(body)


def prediction_csv(model, dataset, cell_type):

    """
    Write 'predictions_attention.csv' with one row per word of `dataset`
    (columns: Inputs, output, Target) and return the exact-match word
    accuracy in percent.

    model: trained model used for the transliteration predictions.
    dataset: dataset whose items are (input_ids, target_ids) tensors and which
             exposes vocab_eng / vocab_hin.
    cell_type: forwarded to predict().

    An empty dataset produces a header-only file and an accuracy of 0.0.
    """
    correct_count = 0
    # Number of words in the dataset
    words_count = len(dataset)
    # Rows to be written to the csv file
    list_of_words = []

    for i in range(words_count):
        input_ids, target_ids = dataset[i]

        char_input = [dataset.vocab_eng.itos[index] for index in input_ids.tolist()]
        prediction = predict(model, char_input, cell_type)
        actual_word = [dataset.vocab_hin.itos[index] for index in target_ids.tolist()]

        list_of_words.append([
            _join_without_specials(char_input),
            _join_without_specials(prediction),
            _join_without_specials(actual_word),
        ])
        if prediction == actual_word:
            correct_count+=1

    # newline='' lets the csv module control line endings itself
    with open('predictions_attention.csv',mode='w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Inputs", "output","Target"])
        writer.writerows(list_of_words)

    if words_count == 0:
        return 0.0
    return 100*(correct_count/words_count)

In [13]:
def train(num_epochs, learning_rate, batch_size, load_model, 
         input_size_encoder, input_size_decoder, output_size,
         encoder_embedding_size, decoder_embedding_size,
         hidden_size, num_layers, enc_dropout, de_dropout):

    """
    Train the Seq2Seq model manually (without wandb) and print, after every
    epoch, the last batch's training loss and the train/validation accuracy.

    Arguments:
        num_epochs: number of passes over train_loader.
        learning_rate: learning rate of the Adam optimizer.
        batch_size: not used here; batches come from the global train_loader.
        load_model: if true, restore model/optimizer via load_checkpoint()
            from 'my_checkpoint.pth.ptar' before training.
        input_size_encoder / input_size_decoder: vocabulary sizes of the
            encoder and decoder embedding tables.
        output_size: number of output classes of the decoder.
        encoder_embedding_size / decoder_embedding_size: embedding dimensions.
        hidden_size, num_layers: recurrent layer configuration.
        enc_dropout / de_dropout: dropout probability of encoder / decoder.

    Reads the module-level names cell_type, bi_directional, device,
    train_loader, train_data and valid_data. Returns nothing.
    """

    # Build the encoder
    encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                         hidden_size, num_layers, enc_dropout, cell_type,
                          bi_directional).to(device)

    # Build the decoder. Use the de_dropout PARAMETER: the previous code read
    # a global named dec_dropout and silently ignored the argument.
    decoder_net = Decoder(input_size_decoder, decoder_embedding_size,
                         hidden_size, output_size, num_layers, de_dropout, 
                          cell_type ,bi_directional).to(device)

    # Preparing the model
    model = Seq2Seq(encoder_net, decoder_net, cell_type).to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # Index 0 is <PAD>; padded target positions do not contribute to the loss.
    pad_index = 0
    criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

    print("Training the model.....")
    if load_model:
        load_checkpoint(torch.load('my_checkpoint.pth.ptar'),model, optimizer)

    for epoch in range(num_epochs):
        print('Epoch: ', epoch+1, '/', num_epochs)

        for inputs, targets in train_loader:

            input_word = inputs.to(device)
            target_word = targets.to(device)

            output = model(input_word, target_word)
            # output shape: (target_len, batch_size, output_vocab_size)

            # Drop position 0 (start token) and flatten for the loss.
            output = output[1:].reshape(-1, output.shape[2])
            target_word = target_word[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target_word)

            loss.backward()

            # To handle large gradients:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()

        # Loss of the LAST batch of the epoch (not an epoch average).
        print("Training Loss: ", loss.item())  

        # Evaluate with dropout disabled, then switch back to training mode.
        model.eval()
        print("finding the accuracy of the model.....")
        train_accu =  calculate_accuracy(model, train_data, cell_type)
        valid_accu = calculate_accuracy(model, valid_data, cell_type)
        model.train()

        print("valid accuracy:", valid_accu)
        print("train accuracy:", train_accu)
        


In [14]:
# # Data loading
# Change root_dir to wherever the dataset is stored on your machine.
# out_lang selects the output language of the transliteration:
# 'hin': Hindi, 'urd': Urdu, 'tel': Telugu, etc.
root_dir = r'C:\Users\HICLIPS-ASK\aksharantar_sampled'
out_lang = 'hin'
batch_size = 64

# One (loader, dataset) pair per split, built in train / valid / test order.
(train_loader, train_data), (valid_loader, valid_data), (test_loader, test_data) = (
    get_loader(root_dir, out_lang, split, batch_size=batch_size, pin_memory=True)
    for split in ('train', 'valid', 'test')
)

In [15]:
#wandb_sweep = False

In [17]:
# Manual (non-sweep) training run.
# To force it, uncomment the 'wandb_sweep = False' line above.
if wandb_sweep == False:
    ## Optimisation settings
    num_epochs = 1
    learning_rate = 0.001
    load_model = False

    ## Vocabulary-dependent sizes
    input_size_encoder = len(train_data.vocab_eng)
    input_size_decoder = len(train_data.vocab_hin)
    output_size = len(train_data.vocab_hin)

    ## Architecture (cell_type and bi_directional are read by train() as globals)
    encoder_embedding_size = 256
    decoder_embedding_size = 256
    hidden_size = 256
    num_layers = 2
    cell_type = 'lstm'
    bi_directional = True

    ## Regularisation
    enc_dropout = 0.2
    dec_dropout = 0.2

    ## Training the model
    train(
        num_epochs=num_epochs,
        learning_rate=learning_rate,
        batch_size=batch_size,
        load_model=load_model,
        input_size_encoder=input_size_encoder,
        input_size_decoder=input_size_decoder,
        output_size=output_size,
        encoder_embedding_size=encoder_embedding_size,
        decoder_embedding_size=decoder_embedding_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        enc_dropout=enc_dropout,
        de_dropout=dec_dropout,
    )

## Training with Wandb_sweep

In [13]:
# W&B project and entity (account) names used by the sweep code below.
project_name = "Assignment 3 with attention"
entity_name = "am22s020"
import wandb

In [16]:
def train_with_wandb():
    """
    Train one model inside a wandb run (used as the sweep agent's function).

    Hyperparameters come from wandb.config, falling back to the defaults
    given to wandb.init. After every epoch the last batch's training loss
    and the train/validation accuracies are logged to wandb.
    Reads the module-level names project_name, train_loader, train_data,
    valid_data and device.
    """
    wandb.init(
        config={"cell_type": "lstm",
                "num_layers": 4,
                "hidden_size": 256,
                "num_epochs":10,
                "dropout": 0.2,
                "embed_size":256},
        project=project_name,
        resume=False,
    )
    config = wandb.config

    # Fixed settings
    learning_rate = 0.001
    load_model = False
    bi_directional = True

    # Settings taken from the sweep configuration
    cell_type = config.cell_type
    num_layers = config.num_layers
    hidden_size = config.hidden_size
    num_epochs = config.num_epochs
    enc_dropout = dec_dropout = config.dropout
    encoder_embedding_size = decoder_embedding_size = config.embed_size

    # Sizes derived from the training vocabularies
    input_size_encoder = len(train_data.vocab_eng)
    input_size_decoder = len(train_data.vocab_hin)
    output_size = len(train_data.vocab_hin)

    # Human-readable run name encoding the hyperparameters
    wandb.run.name  = "cell_{}_nl_{}_hs_{}_e_{}_dr_{}_ems_{}".format(
        cell_type, num_layers, hidden_size, num_epochs,
        enc_dropout, encoder_embedding_size)
    print(wandb.run.name )

    encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                          hidden_size, num_layers, enc_dropout, cell_type,
                          bi_directional).to(device)
    decoder_net = Decoder(input_size_decoder, decoder_embedding_size,
                          hidden_size, output_size, num_layers, dec_dropout,
                          cell_type, bi_directional).to(device)
    model = Seq2Seq(encoder_net, decoder_net, cell_type).to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # Index 0 is the padding token; it is excluded from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    if load_model:
        load_checkpoint(torch.load('my_checkpoint.pth.ptar'),model, optimizer)

    for epoch in range(num_epochs):
        print('Epoch: ', epoch+1, '/', num_epochs)

        for inputs, targets in train_loader:
            input_word = inputs.to(device)
            target_word = targets.to(device)

            # logits shape: (target_len, batch_size, output_vocab_size)
            logits = model(input_word, target_word)

            # Skip position 0 (start token) and flatten for the loss.
            flat_logits = logits[1:].reshape(-1, logits.shape[2])
            flat_targets = target_word[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(flat_logits, flat_targets)
            loss.backward()

            # To handle large gradients:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()

        # Loss of the last batch in this epoch
        train_loss = loss.item()
        print("Training Loss: ", train_loss)

        # Accuracies are computed in eval mode, then training mode is restored.
        model.eval()
        train_accu =  calculate_accuracy(model, train_data, cell_type)
        valid_accu = calculate_accuracy(model, valid_data, cell_type)
        model.train()

        wandb.log({"valid accuracy": valid_accu, "train accuracy": train_accu,
                    "train loss": train_loss, 'epoch': epoch+1})

    wandb.run.finish()
    

In [15]:
# Search space of the wandb sweep: each hyperparameter maps to the list of
# discrete values the sweep may pick from.
hyperparameters = {
    name: {"values": values}
    for name, values in (
        ("num_layers", [2, 3, 4]),
        ("hidden_size", [64, 128, 256]),
        ("cell_type", ["rnn", "gru", "lstm"]),
        ("num_epochs", [10, 20]),
        ("dropout", [0.2, 0.3, 0.5]),
        ("embed_size", [64, 128, 256]),
    )
}


In [16]:
def wandb_sweep(project_name, entity_name):
    '''
    This function is used to run the wandb sweeps.

    It builds a Bayesian-search sweep configuration over the module-level
    `hyperparameters` search space, registers the sweep with wandb and then
    starts an agent that calls `train_with_wandb` for every run.

    project_name : name of the wandb project the sweep is created in.
    entity_name  : wandb entity the sweep is created under.

    It does not return any value.
    '''
    sweep_config = {
        "method": "bayes",
        "metric": {
            # The metric name has to be exactly the key that the runs log
            # through wandb.log ("valid accuracy"); with any other name the
            # Bayesian search has no logged value to maximize.
            "name": "valid accuracy",
            "goal": "maximize",
        },
        "parameters": hyperparameters,
    }

    sweep_id = wandb.sweep(sweep_config, project=project_name, entity=entity_name)
    wandb.agent(sweep_id, train_with_wandb)

In [None]:
# Launch the hyperparameter sweep when sweeping was requested.
# NOTE(review): once the `def wandb_sweep(...)` cell has run, the name
# `wandb_sweep` refers to that function, and a function object never compares
# equal to True, so this guard looks like it cannot fire when the cells are
# executed in order. Presumably a separately named flag (e.g. the parsed
# --wandb_sweep argument) was intended here -- TODO confirm and rename.
if wandb_sweep == True:
    wandb_sweep(project_name, entity_name)

Create sweep with ID: 6imepp9f
Sweep URL: https://wandb.ai/am22s020/Assignment%203%20with%20attention/sweeps/6imepp9f


[34m[1mwandb[0m: Agent Starting Run: gh6x48rw with config:
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	num_epochs: 20
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: Currently logged in as: [33mam22s020[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333327028, max=1.0…

cell_gru_nl_2_hs_128_e_20_dr_0.5_ems_64
Epoch:  1 / 20
Training Loss:  1.4165784120559692
Epoch:  2 / 20
Training Loss:  1.2466036081314087
Epoch:  3 / 20
Training Loss:  1.327499508857727
Epoch:  4 / 20
Training Loss:  0.8975704908370972
Epoch:  5 / 20
Training Loss:  0.9984159469604492
Epoch:  6 / 20
Training Loss:  0.9612013697624207
Epoch:  7 / 20
Training Loss:  0.8561251759529114
Epoch:  8 / 20
Training Loss:  0.7542168498039246
Epoch:  9 / 20
Training Loss:  0.8064953088760376
Epoch:  10 / 20
Training Loss:  0.9886434078216553
Epoch:  11 / 20
Training Loss:  1.2049074172973633
Epoch:  12 / 20
Training Loss:  0.6419394612312317
Epoch:  13 / 20
Training Loss:  0.7579282522201538
Epoch:  14 / 20
Training Loss:  0.8015043139457703
Epoch:  15 / 20
Training Loss:  0.7942507266998291
Epoch:  16 / 20
Training Loss:  0.9451488256454468
Epoch:  17 / 20
Training Loss:  0.792537271976471
Epoch:  18 / 20
Training Loss:  0.6033579111099243
Epoch:  19 / 20
Training Loss:  0.5872355103492737
Ep

VBox(children=(Label(value='0.000 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.0, max…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train accuracy,▁▄▅▆▆▇▇███
train loss,█▇▇▄▄▄▃▂▃▄▆▁▂▃▃▄▃▁▁▁
valid accuracy,▁▅▆▇▇▇████

0,1
epoch,20.0
train accuracy,36.08789
train loss,0.64199
valid accuracy,33.64258


[34m[1mwandb[0m: Agent Starting Run: zsgqymav with config:
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

cell_lstm_nl_3_hs_64_e_10_dr_0.2_ems_128
Epoch:  1 / 10
Training Loss:  1.74724280834198
Epoch:  2 / 10
Training Loss:  1.2386507987976074
Epoch:  3 / 10
Training Loss:  1.089850664138794
Epoch:  4 / 10
Training Loss:  1.1127513647079468
Epoch:  5 / 10
Training Loss:  1.0103193521499634
Epoch:  6 / 10
Training Loss:  0.9446633458137512
Epoch:  7 / 10
Training Loss:  0.9373942613601685
Epoch:  8 / 10
Training Loss:  0.8420271873474121
Epoch:  9 / 10
Training Loss:  0.7148458957672119
Epoch:  10 / 10
Training Loss:  0.7802539467811584


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train accuracy,▁▄▆▇█
train loss,█▅▄▄▃▃▃▂▁▁
valid accuracy,▁▅▆▇█

0,1
epoch,10.0
train accuracy,29.87305
train loss,0.78025
valid accuracy,30.83496


[34m[1mwandb[0m: Agent Starting Run: qeojpxmu with config:
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 256
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	num_epochs: 20
[34m[1mwandb[0m: 	num_layers: 4


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333327028, max=1.0…

cell_gru_nl_4_hs_128_e_20_dr_0.2_ems_256
Epoch:  1 / 20
Training Loss:  1.131693720817566
Epoch:  2 / 20
Training Loss:  0.7396837472915649
Epoch:  3 / 20
Training Loss:  0.8153654932975769
Epoch:  4 / 20
Training Loss:  1.2250787019729614
Epoch:  5 / 20
Training Loss:  0.6277437806129456
Epoch:  6 / 20
Training Loss:  0.6340178847312927
Epoch:  7 / 20
Training Loss:  0.6432211995124817
Epoch:  8 / 20
Training Loss:  0.7892532348632812
Epoch:  9 / 20
Training Loss:  0.6260433197021484
Epoch:  10 / 20
Training Loss:  0.6650139093399048
Epoch:  11 / 20
Training Loss:  0.5047571659088135
Epoch:  12 / 20
Training Loss:  0.501890242099762
Epoch:  13 / 20
Training Loss:  0.6220911145210266
Epoch:  14 / 20
Training Loss:  0.5953448414802551
Epoch:  15 / 20
Training Loss:  0.5717355012893677
Epoch:  16 / 20
Training Loss:  0.6496957540512085
Epoch:  17 / 20
Training Loss:  0.48912838101387024
Epoch:  18 / 20
Training Loss:  0.40190964937210083
Epoch:  19 / 20
Training Loss:  0.5845766663551331

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train accuracy,▁▄▅▅▇▇▇▇▇█
train loss,▇▄▅█▃▃▃▄▃▃▂▂▃▃▂▃▂▁▃▂
valid accuracy,▁▆▇▆███▇▇█

0,1
epoch,20.0
train accuracy,44.19922
train loss,0.53221
valid accuracy,31.54297


[34m[1mwandb[0m: Agent Starting Run: jpoxrrfg with config:
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333327028, max=1.0…

cell_rnn_nl_3_hs_64_e_10_dr_0.3_ems_64
Epoch:  1 / 10
Training Loss:  2.2788312435150146
Epoch:  2 / 10
Training Loss:  1.6987394094467163


0,1
epoch,▁
train loss,▁

0,1
epoch,1.0
train loss,2.27883


[34m[1mwandb[0m: Agent Starting Run: 5yx48xcm with config:
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333327028, max=1.0…

cell_gru_nl_3_hs_256_e_10_dr_0.2_ems_256
Epoch:  1 / 10
Training Loss:  0.8902482986450195
Epoch:  2 / 10
Training Loss:  0.6429905295372009
Epoch:  3 / 10
Training Loss:  0.5765307545661926
Epoch:  4 / 10
Training Loss:  0.6747865080833435
Epoch:  5 / 10
Training Loss:  0.682616651058197
Epoch:  6 / 10
Training Loss:  0.5456714034080505
Epoch:  7 / 10
Training Loss:  0.6132667064666748
Epoch:  8 / 10
Training Loss:  0.4423506259918213
Epoch:  9 / 10
Training Loss:  0.4361552298069
Epoch:  10 / 10
Training Loss:  0.3670375943183899


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train accuracy,▁▅▆▇█
train loss,█▅▄▅▅▃▄▂▂▁
valid accuracy,▁▇▇██

0,1
epoch,10.0
train accuracy,46.83203
train loss,0.36704
valid accuracy,35.00977


[34m[1mwandb[0m: Agent Starting Run: dlyprlmt with config:
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	num_epochs: 20
[34m[1mwandb[0m: 	num_layers: 3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333327028, max=1.0…

cell_lstm_nl_3_hs_128_e_20_dr_0.5_ems_128
Epoch:  1 / 20
Training Loss:  1.584213137626648
Epoch:  2 / 20
Training Loss:  1.366572380065918
Epoch:  3 / 20
Training Loss:  1.0190939903259277
Epoch:  4 / 20
Training Loss:  0.9300316572189331
Epoch:  5 / 20
Training Loss:  0.9060571193695068
Epoch:  6 / 20
Training Loss:  0.7805853486061096
Epoch:  7 / 20
Training Loss:  0.6091936230659485
Epoch:  8 / 20
Training Loss:  0.7792050838470459
Epoch:  9 / 20
Training Loss:  0.6729950308799744
Epoch:  10 / 20
Training Loss:  0.7896996736526489
Epoch:  11 / 20
Training Loss:  0.5015790462493896
Epoch:  12 / 20
Training Loss:  0.8972135186195374
Epoch:  13 / 20
Training Loss:  0.6439371109008789
Epoch:  14 / 20
Training Loss:  0.6449766755104065
Epoch:  15 / 20
Training Loss:  0.6032648682594299
Epoch:  16 / 20
Training Loss:  0.48681697249412537
Epoch:  17 / 20
Training Loss:  0.7367228269577026
Epoch:  18 / 20
Training Loss:  0.5495006442070007
Epoch:  19 / 20
Training Loss:  0.5323366522789001

VBox(children=(Label(value='0.000 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.0, max…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train accuracy,▁▃▄▅▆▆▇▇██
train loss,█▇▅▄▄▃▂▃▃▃▂▄▃▃▂▂▃▂▂▁
valid accuracy,▁▄▅▇▇▇▇███

0,1
epoch,20.0
train accuracy,45.79297
train loss,0.38479
valid accuracy,37.89062


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: xd1bqqnr with config:
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 4


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

cell_lstm_nl_4_hs_256_e_10_dr_0.3_ems_64
Epoch:  1 / 10
Training Loss:  1.4884495735168457
Epoch:  2 / 10
Training Loss:  0.728722333908081
Epoch:  3 / 10
Training Loss:  0.6766051054000854
Epoch:  4 / 10
Training Loss:  0.6131076216697693
Epoch:  5 / 10
Training Loss:  0.39085397124290466
Epoch:  6 / 10
Training Loss:  0.5453477501869202
Epoch:  7 / 10
Training Loss:  0.4374116361141205
Epoch:  8 / 10
Training Loss:  0.545535147190094
Epoch:  9 / 10
Training Loss:  0.3848966658115387
Epoch:  10 / 10
Training Loss:  0.563019871711731


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train accuracy,▁▄▅▇█
train loss,█▃▃▂▁▂▁▂▁▂
valid accuracy,▁▅▇██

0,1
epoch,10.0
train accuracy,58.18945
train loss,0.56302
valid accuracy,41.40625


[34m[1mwandb[0m: Agent Starting Run: kdu2vnf2 with config:
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_epochs: 20
[34m[1mwandb[0m: 	num_layers: 4


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

cell_rnn_nl_4_hs_256_e_20_dr_0.2_ems_128
Epoch:  1 / 20
Training Loss:  1.3642313480377197
Epoch:  2 / 20
Training Loss:  1.2878751754760742
Epoch:  3 / 20
Training Loss:  1.0652996301651
Epoch:  4 / 20
Training Loss:  1.0087907314300537
Epoch:  5 / 20
Training Loss:  1.0444343090057373
Epoch:  6 / 20
Training Loss:  0.9274753332138062
Epoch:  7 / 20
Training Loss:  1.1034103631973267
Epoch:  8 / 20
Training Loss:  1.286133885383606
Epoch:  9 / 20
Training Loss:  0.9175023436546326
Epoch:  10 / 20
Training Loss:  0.943825900554657
Epoch:  11 / 20
Training Loss:  0.8577011227607727
Epoch:  12 / 20
Training Loss:  0.9621949195861816
Epoch:  13 / 20
Training Loss:  1.066468596458435
Epoch:  14 / 20
Training Loss:  0.6589992642402649
Epoch:  15 / 20
Training Loss:  0.9069973826408386
Epoch:  16 / 20
Training Loss:  0.915599524974823
Epoch:  17 / 20
Training Loss:  0.8813110589981079
Epoch:  18 / 20
Training Loss:  0.7655790448188782
Epoch:  19 / 20
Training Loss:  1.2116597890853882
Epoch:

VBox(children=(Label(value='0.000 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.0, max…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train accuracy,▁▂▇▇▇█▇██▆
train loss,█▇▅▄▅▄▅▇▄▄▃▄▅▁▃▄▃▂▆▄
valid accuracy,▁▂▇▆▆▇▆██▆

0,1
epoch,20.0
train accuracy,7.30859
train loss,0.98767
valid accuracy,8.59375


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: qg3khhqc with config:
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	num_epochs: 20
[34m[1mwandb[0m: 	num_layers: 3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333266395, max=1.0…

cell_lstm_nl_3_hs_128_e_20_dr_0.2_ems_64
Epoch:  1 / 20
Training Loss:  1.2838928699493408
Epoch:  2 / 20
Training Loss:  1.0000604391098022
Epoch:  3 / 20
Training Loss:  0.8304522037506104
Epoch:  4 / 20
Training Loss:  0.7251396179199219
Epoch:  5 / 20
Training Loss:  0.7191965579986572
Epoch:  6 / 20
Training Loss:  0.7313385009765625
Epoch:  7 / 20
Training Loss:  0.562213659286499
Epoch:  8 / 20
Training Loss:  0.4478086531162262
Epoch:  9 / 20
Training Loss:  0.4061241149902344
Epoch:  10 / 20
Training Loss:  0.6966181993484497
Epoch:  11 / 20
Training Loss:  0.48672470450401306
Epoch:  12 / 20
Training Loss:  0.4706825613975525
Epoch:  13 / 20
Training Loss:  0.5229274034500122
Epoch:  14 / 20
Training Loss:  0.38754552602767944
Epoch:  15 / 20
Training Loss:  0.45503130555152893
Epoch:  16 / 20
Training Loss:  0.4202267825603485
Epoch:  17 / 20
Training Loss:  0.39057812094688416
Epoch:  18 / 20
Training Loss:  0.3519270718097687
Epoch:  19 / 20
Training Loss:  0.5190706253051

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train accuracy,▁▃▄▅▅▆▇▇██
train loss,█▆▅▄▄▄▃▂▁▄▂▂▂▁▂▂▁▁▂▂
valid accuracy,▁▅▆▇▇▇████

0,1
epoch,20.0
train accuracy,62.2168
train loss,0.51113
valid accuracy,38.40332


[34m[1mwandb[0m: Agent Starting Run: pw42aw9r with config:
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 256
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 2


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

cell_rnn_nl_2_hs_64_e_10_dr_0.3_ems_256
Epoch:  1 / 10
Training Loss:  1.9965745210647583
Epoch:  2 / 10
Training Loss:  1.6847649812698364
Epoch:  3 / 10
Training Loss:  1.5153617858886719
Epoch:  4 / 10
Training Loss:  1.3774102926254272
Epoch:  5 / 10
Training Loss:  1.5302456617355347
Epoch:  6 / 10
Training Loss:  1.3955045938491821
Epoch:  7 / 10
Training Loss:  1.425681710243225
Epoch:  8 / 10
Training Loss:  1.3953337669372559
Epoch:  9 / 10
Training Loss:  1.128304362297058
Epoch:  10 / 10
Training Loss:  1.377143144607544


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train accuracy,▁▅▅▆█
train loss,█▅▄▃▄▃▃▃▁▃
valid accuracy,▁▆▆▆█

0,1
epoch,10.0
train accuracy,5.75
train loss,1.37714
valid accuracy,8.1543


[34m[1mwandb[0m: Agent Starting Run: bpmldnyd with config:
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	num_epochs: 20
[34m[1mwandb[0m: 	num_layers: 3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

cell_rnn_nl_3_hs_64_e_20_dr_0.2_ems_128
Epoch:  1 / 20
Training Loss:  1.762741208076477
Epoch:  2 / 20
Training Loss:  1.3755539655685425
Epoch:  3 / 20
Training Loss:  1.1074259281158447
Epoch:  4 / 20
Training Loss:  1.3761682510375977
Epoch:  5 / 20
Training Loss:  1.3881056308746338
Epoch:  6 / 20
Training Loss:  1.1670186519622803
Epoch:  7 / 20
Training Loss:  1.1120853424072266
Epoch:  8 / 20
Training Loss:  1.1046708822250366
Epoch:  9 / 20
Training Loss:  1.1318095922470093
Epoch:  10 / 20
Training Loss:  1.0630854368209839
Epoch:  11 / 20
Training Loss:  1.057728886604309
Epoch:  12 / 20
Training Loss:  1.0657644271850586
Epoch:  13 / 20
Training Loss:  0.8444045186042786
Epoch:  14 / 20
Training Loss:  1.0681583881378174
Epoch:  15 / 20
Training Loss:  1.0240377187728882
Epoch:  16 / 20
Training Loss:  1.0087336301803589
Epoch:  17 / 20
Training Loss:  1.128905177116394
Epoch:  18 / 20
Training Loss:  1.1534079313278198
Epoch:  19 / 20
Training Loss:  1.065372109413147
Epoc

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train accuracy,▁▃▄▅▆▆▇▇▇█
train loss,█▅▃▅▅▃▃▃▃▃▃▃▁▃▂▂▃▃▃▃
valid accuracy,▁▄▅▆▇▆▇▇▇█

0,1
epoch,20.0
train accuracy,14.18164
train loss,1.08883
valid accuracy,17.04102


[34m[1mwandb[0m: Agent Starting Run: 925yk244 with config:
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 256
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 4


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

cell_gru_nl_4_hs_128_e_10_dr_0.2_ems_256
Epoch:  1 / 10
Training Loss:  1.128707766532898
Epoch:  2 / 10
Training Loss:  0.8593323826789856
Epoch:  3 / 10
Training Loss:  1.2122695446014404
Epoch:  4 / 10
Training Loss:  0.9451954960823059
Epoch:  5 / 10
Training Loss:  0.8347623348236084
Epoch:  6 / 10
Training Loss:  0.8045775890350342
Epoch:  7 / 10
Training Loss:  0.6234214305877686
Epoch:  8 / 10
Training Loss:  0.511805534362793
Epoch:  9 / 10
Training Loss:  0.47723713517189026
Epoch:  10 / 10
Training Loss:  0.6181269288063049


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train accuracy,▁▄▆▇█
train loss,▇▅█▅▄▄▂▁▁▂
valid accuracy,▁▆▇██

0,1
epoch,10.0
train accuracy,41.7168
train loss,0.61813
valid accuracy,32.95898


[34m[1mwandb[0m: Agent Starting Run: x55k1cwc with config:
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 64
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	num_epochs: 20
[34m[1mwandb[0m: 	num_layers: 3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

cell_rnn_nl_3_hs_64_e_20_dr_0.2_ems_64
Epoch:  1 / 20
Training Loss:  1.9479812383651733
Epoch:  2 / 20
Training Loss:  1.6802910566329956
Epoch:  3 / 20
Training Loss:  1.4481703042984009
Epoch:  4 / 20
Training Loss:  1.3073465824127197
Epoch:  5 / 20
Training Loss:  1.2452242374420166
Epoch:  6 / 20
Training Loss:  1.2877197265625
Epoch:  7 / 20
Training Loss:  1.2480745315551758
Epoch:  8 / 20
Training Loss:  1.1411949396133423
Epoch:  9 / 20
Training Loss:  1.1986076831817627
Epoch:  10 / 20
Training Loss:  1.0641812086105347
Epoch:  11 / 20
Training Loss:  0.9455357789993286
Epoch:  12 / 20
Training Loss:  1.1273552179336548
Epoch:  13 / 20
Training Loss:  1.0642919540405273
Epoch:  14 / 20
Training Loss:  0.9417949318885803
Epoch:  15 / 20
Training Loss:  1.0613768100738525
Epoch:  16 / 20
Training Loss:  1.3470232486724854
Epoch:  17 / 20
Training Loss:  1.0244343280792236
Epoch:  18 / 20
Training Loss:  0.9074632525444031
Epoch:  19 / 20
Training Loss:  1.0613924264907837
Epoc

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train accuracy,▁▃▃▄▅▅▆▆▇█
train loss,█▆▅▄▃▄▃▃▃▂▁▂▂▁▂▄▂▁▂▁
valid accuracy,▁▄▄▄▅▆▆▇▇█

0,1
epoch,20.0
train accuracy,16.30469
train loss,0.98172
valid accuracy,18.72559


[34m[1mwandb[0m: Agent Starting Run: 7008ngem with config:
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embed_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_epochs: 20
[34m[1mwandb[0m: 	num_layers: 3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333266395, max=1.0…

cell_gru_nl_3_hs_256_e_20_dr_0.5_ems_256
Epoch:  1 / 20
Training Loss:  1.042359471321106
Epoch:  2 / 20
Training Loss:  1.3073780536651611
Epoch:  3 / 20
Training Loss:  0.793432354927063
Epoch:  4 / 20
Training Loss:  0.8519611358642578
Epoch:  5 / 20
Training Loss:  0.9663528800010681
Epoch:  6 / 20
Training Loss:  0.8350262641906738
Epoch:  7 / 20
Training Loss:  0.821759045124054
Epoch:  8 / 20
Training Loss:  0.7729745507240295
Epoch:  9 / 20
Training Loss:  0.8288685083389282
Epoch:  10 / 20
Training Loss:  1.0298336744308472
Epoch:  11 / 20
Training Loss:  1.0461642742156982
Epoch:  12 / 20
Training Loss:  0.7668841481208801
Epoch:  13 / 20
Training Loss:  0.9732661247253418
Epoch:  14 / 20
Training Loss:  0.8840385675430298
Epoch:  15 / 20
Training Loss:  0.8165715336799622
Epoch:  16 / 20
Training Loss:  0.7950144410133362
Epoch:  17 / 20
Training Loss:  0.6820006966590881
Epoch:  18 / 20
Training Loss:  0.8555582761764526
Epoch:  19 / 20
Training Loss:  0.6922104954719543
Ep

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train accuracy,▁▅▆▇▇█▇██▇
train loss,▅█▂▃▄▃▃▂▃▅▅▂▄▃▃▂▁▃▁▁
valid accuracy,▁▅▇▇▇█▆█▇▇

0,1
epoch,20.0
train accuracy,31.95117
train loss,0.66797
valid accuracy,31.83594


[34m[1mwandb[0m: Agent Starting Run: 6l1s2ibn with config:
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embed_size: 256
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	num_epochs: 20
[34m[1mwandb[0m: 	num_layers: 3


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333145128, max=1.0…

cell_rnn_nl_3_hs_64_e_20_dr_0.3_ems_256
Epoch:  1 / 20
Training Loss:  2.1629936695098877
Epoch:  2 / 20
Training Loss:  1.9090267419815063
Epoch:  3 / 20
Training Loss:  1.583527684211731
Epoch:  4 / 20
Training Loss:  1.4388577938079834
Epoch:  5 / 20
Training Loss:  1.5759265422821045
Epoch:  6 / 20
Training Loss:  1.4720187187194824
Epoch:  7 / 20
Training Loss:  1.425864338874817
Epoch:  8 / 20
Training Loss:  1.5961552858352661
Epoch:  9 / 20
Training Loss:  1.3534411191940308
Epoch:  10 / 20
Training Loss:  1.434489369392395
Epoch:  11 / 20
Training Loss:  1.1694105863571167
Epoch:  12 / 20
Training Loss:  1.2960999011993408
Epoch:  13 / 20
Training Loss:  1.7680671215057373
Epoch:  14 / 20
Training Loss:  1.1792243719100952
Epoch:  15 / 20
Training Loss:  1.3230642080307007
Epoch:  16 / 20
Training Loss:  1.110196828842163
Epoch:  17 / 20
Training Loss:  1.2973885536193848
Epoch:  18 / 20
Training Loss:  1.2906779050827026
Epoch:  19 / 20
Training Loss:  1.4375064373016357
Epoc

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train accuracy,▁▂▃▄▄▅▆▆▇█
train loss,█▆▄▃▄▃▃▄▃▃▁▂▅▁▂▁▂▂▃▁
valid accuracy,▁▃▃▅▄▅▆▅▆█

0,1
epoch,20.0
train accuracy,8.4043
train loss,1.17653
valid accuracy,11.40137


[34m[1mwandb[0m: Agent Starting Run: pscxszu9 with config:
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	num_epochs: 20
[34m[1mwandb[0m: 	num_layers: 4


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

cell_gru_nl_4_hs_128_e_20_dr_0.5_ems_128
Epoch:  1 / 20
Training Loss:  1.6790268421173096
Epoch:  2 / 20
Training Loss:  1.2062088251113892
Epoch:  3 / 20
Training Loss:  1.4261326789855957
Epoch:  4 / 20
Training Loss:  1.0372810363769531
Epoch:  5 / 20
Training Loss:  1.0783706903457642
Epoch:  6 / 20
Training Loss:  0.801123321056366
Epoch:  7 / 20
Training Loss:  0.746178150177002
Epoch:  8 / 20
Training Loss:  0.8284363150596619
Epoch:  9 / 20
Training Loss:  0.9789810180664062
Epoch:  10 / 20
Training Loss:  0.992134153842926
Epoch:  11 / 20
Training Loss:  0.9993125796318054
Epoch:  12 / 20
Training Loss:  0.8070383071899414
Epoch:  13 / 20
Training Loss:  0.8044712543487549
Epoch:  14 / 20
Training Loss:  1.0868769884109497
Epoch:  15 / 20
Training Loss:  0.8012816309928894
Epoch:  16 / 20
Training Loss:  0.6489248275756836
Epoch:  17 / 20
Training Loss:  0.844541609287262
Epoch:  18 / 20
Training Loss:  1.019227385520935
Epoch:  19 / 20
Training Loss:  0.9667220115661621
Epoc

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train accuracy,▁▄▆▆▇▇▇██▇
train loss,█▅▆▄▄▂▂▂▃▃▃▂▂▄▂▁▂▄▃▃
valid accuracy,▁▄▇▇▇▇██▇▇

0,1
epoch,20.0
train accuracy,22.3125
train loss,0.89413
valid accuracy,21.65527


[34m[1mwandb[0m: Agent Starting Run: sx1c6d7s with config:
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_size: 128
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 3


cell_rnn_nl_3_hs_256_e_10_dr_0.2_ems_128
Epoch:  1 / 10
Training Loss:  1.2345741987228394
Epoch:  2 / 10
Training Loss:  1.2163000106811523
Epoch:  3 / 10
Training Loss:  0.9711766242980957
Epoch:  4 / 10
Training Loss:  1.2107219696044922
Epoch:  5 / 10
Training Loss:  0.813028872013092
Epoch:  6 / 10
Training Loss:  1.3652628660202026
