# Implement and train a LSTM for sentiment analysis

(General Hint on Lab 1/2: Trust whatever you see from the training and report it on PDF. IDMB is far from ideal as it's more like a real-world dataset)

## Step 0: set up the environment

In [4]:
import functools
import sys
import numpy as np
import pandas as pd
import random
import re
import matplotlib.pyplot as plt
import tqdm
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

nltk.download('stopwords')

torch.backends.cudnn.benchmark = True

import os
os.makedirs("resources", exist_ok=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Hyperparameters. Do not directly touch this to mess up settings.

If you want to initalize new hyperparameter sets, use "new_hparams = HyperParams()" and change corresponding fields.

In [5]:
class HyperParams:
    def __init__(self):
        # Constance hyperparameters. They have been tested and don't need to be tuned.
        self.PAD_INDEX = 0
        self.UNK_INDEX = 1
        self.PAD_TOKEN = '<pad>'
        self.UNK_TOKEN = '<unk>'
        self.STOP_WORDS = set(stopwords.words('english'))
        self.MAX_LENGTH = 256
        self.BATCH_SIZE = 96
        self.EMBEDDING_DIM = 1
        self.HIDDEN_DIM = 100
        self.OUTPUT_DIM = 2
        self.N_LAYERS = 1
        self.DROPOUT_RATE = 0.0
        self.LR = 0.01
        self.N_EPOCHS = 5
        self.WD = 0
        self.OPTIM = "sgd"
        self.BIDIRECTIONAL = False
        self.SEED = 2

## Lab 1(a) Implement your own data loader function.  
First, you need to read the data from the dataset file on the local disk. 
Then, split the dataset into three sets: train, validation and test by 7:1:2 ratio.
Finally return x_train, x_valid, x_test, y_train, y_valid, y_test where x represents reviews and y represent labels.  

In [6]:
def load_imdb(base_csv:str = './IMDBDataset.csv'):
    """
    Load the IMDB dataset
    :param base_csv: the path of the dataset file.
    :return: train, validation and test set.
    """
    # Add your code here. 
    # print("hi")
    data = pd.read_csv(base_csv)
    # print(data.get("sentiment"))
    x_train, x_test, y_train, y_test = train_test_split(data.get("review"), data.get("sentiment"), test_size = 0.2, random_state=1, shuffle = False)

    x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.125, random_state=1, shuffle = False) 
    
    
    
    print(f'shape of train data is {x_train.shape}')
    print(f'shape of test data is {x_test.shape}')
    print(f'shape of valid data is {x_valid.shape}')
    return x_train, x_valid, x_test, y_train, y_valid, y_test


shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)


## Lab 1(b): Implement your function to build a vocabulary based on the training corpus.
Implement the build_vocab function to build a vocabulary based on the training corpus.
You should first compute the frequency of all the words in the training corpus. Remove the words
that are in the STOP_WORDS. Then filter the words by their frequency (≥ min_freq) and finally
generate a corpus variable that contains a list of words.

In [7]:
def build_vocab(x_train:list, min_freq: int=5, hparams=None) -> dict:
    """
    build a vocabulary based on the training corpus.
    :param x_train:  List. The training corpus. Each sample in the list is a string of text.
    :param min_freq: Int. The frequency threshold for selecting words.
    :return: dictionary {word:index}
    """
    # Add your code here. Your code should assign corpus with a list of words.
    corpus = {}
    for sentences in x_train:
      sentence = sentences.split(" ")
      for word in sentence:
        if word in hparams.STOP_WORDS:
          continue
        if word not in corpus:
          corpus[word] = 1
        else: 
          corpus[word] += 1

    corpus_ = [word for word, freq in corpus.items() if freq >= min_freq]

    # creating a dict
    vocab = {w:i+2 for i, w in enumerate(corpus_)}
    # print(vocab)

    vocab[hparams.PAD_TOKEN] = hparams.PAD_INDEX
    vocab[hparams.UNK_TOKEN] = hparams.UNK_INDEX
    return vocab


## Lab 1(c): Implement your tokenize function. 
For each word, find its index in the vocabulary. 
Return a list of int that represents the indices of words in the example. 

In [8]:
def tokenize(vocab: dict, example: str)-> list:
    """
    Tokenize the give example string into a list of token indices.
    :param vocab: dict, the vocabulary.
    :param example: a string of text.
    :return: a list of token indices.
    """
    # Your code here.
    token_ind = []
    example_arr = example.split(" ")
    for word in example_arr:
      if word in vocab:
        token_ind.append(vocab[word])
    return token_ind

## Lab 1 (d): Implement the __getitem__ function. Given an index i, you should return the i-th review and label. 
The review is originally a string. Please tokenize it into a sequence of token indices. 
Use the max_length parameter to truncate the sequence so that it contains at most max_length tokens. 
Convert the label string ('positive'/'negative') to a binary index. 'positive' is 1 and 'negative' is 0. 
Return a dictionary containing three keys: 'ids', 'length', 'label' which represent the list of token ids, the length of the sequence, the binary label. 

In [9]:
class IMDB(Dataset):
    def __init__(self, x, y, vocab, max_length=256) -> None:
        """
        :param x: list of reviews
        :param y: list of labels
        :param vocab: vocabulary dictionary {word:index}.
        :param max_length: the maximum sequence length.
        """
        self.x = x
        self.y = y
        self.vocab = vocab
        self.max_length = max_length

    def __getitem__(self, idx: int):
        """
        Return the tokenized review and label by the given index.
        :param idx: index of the sample.
        :return: a dictionary containing three keys: 'ids', 'length', 'label' which represent the list of token ids, the length of the sequence, the binary label.
        """
        # Add your code here.
        review = self.x.iloc[idx]
        token = tokenize(self.vocab, review)
        if len(token) >= self.max_length:
          token_list = token[:self.max_length]
        else:
          token_list = token
      
        #get the label
        label = self.y.iloc[idx]
        tag = 1 if label == "positive" else 0

        #create the dictionary
        item_dict = {}
        item_dict['ids'] = token_list
        item_dict['length'] = len(token_list)
        item_dict['label'] = tag

        return item_dict
        # pass
    

    def __len__(self) -> int:
        return len(self.x)

def collate(batch, pad_index):
    batch_ids = [torch.LongTensor(i['ids']) for i in batch]
    batch_ids = nn.utils.rnn.pad_sequence(batch_ids, padding_value=pad_index, batch_first=True)
    batch_length = torch.Tensor([i['length'] for i in batch])
    batch_label = torch.LongTensor([i['label'] for i in batch])
    batch = {'ids': batch_ids, 'length': batch_length, 'label': batch_label}
    return batch

collate_fn = collate

## Lab 1 (e): Implement the LSTM model for sentiment analysis.
Q(a): Implement the initialization function.
Your task is to create the model by stacking several necessary layers including an embedding layer, a lstm cell, a linear layer, and a dropout layer.
You can call functions from Pytorch's nn library. For example, nn.Embedding, nn.LSTM, nn.Linear.<br>
Q(b): Implement the forward function.
    Decide where to apply dropout. 
    The sequences in the batch have different lengths. Write/call a function to pad the sequences into the same length. 
    Apply a fully-connected (fc) layer to the output of the LSTM layer. 
    Return the output features which is of size [batch size, output dim]. 

In [10]:
def init_weights(m):
    if isinstance(m, nn.Embedding):
        nn.init.xavier_normal_(m.weight)
    elif isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM) or isinstance(m, nn.GRU):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
                
class LSTM(nn.Module):
    def __init__(
        self, 
        vocab_size: int, 
        embedding_dim: int, 
        hidden_dim: int, 
        output_dim: int, 
        n_layers: int, 
        dropout_rate: float, 
        pad_index: int,
        bidirectional: bool,
        **kwargs):
        """
        Create a LSTM model for classification.
        :param vocab_size: size of the vocabulary
        :param embedding_dim: dimension of embeddings
        :param hidden_dim: dimension of hidden features
        :param output_dim: dimension of the output layer which equals to the number of labels.
        :param n_layers: number of layers.
        :param dropout_rate: dropout rate.
        :param pad_index: index of the padding token.we
        """
        super().__init__()
        # Add your code here. Initializing each layer by the given arguments.
        
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_index)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers = n_layers, dropout = dropout_rate, bidirectional = bidirectional)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout()
        # Weight initialization. DO NOT CHANGE!
        if "weight_init_fn" not in kwargs:
            self.apply(init_weights)
        else:
            self.apply(kwargs["weight_init_fn"])


    def forward(self, ids:torch.Tensor, length:torch.Tensor):
        """
        Feed the given token ids to the model.
        :param ids: [batch size, seq len] batch of token ids.
        :param length: [batch size] batch of length of the token ids.
        :return: prediction of size [batch size, output dim].
        """
        # Add your code here.
        embeds = self.word_embedding(ids)
        embed_padding = nn.utils.rnn.pack_padded_sequence(embeds, length, batch_first = True, enforce_sorted = False)
        out, (h, c) = self.lstm(embed_padding)
        lstm_out = h[-1]
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # out = self.sigmoid(out)
        
        # out = out.view(out.size(0), -1)
        # out = out[:,-1]
        
        prediction = out
        
        return prediction

In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def train(dataloader, model, criterion, optimizer, scheduler, device):
    model.train()
    epoch_losses = []
    epoch_accs = []

    for batch in tqdm.tqdm(dataloader, desc='training...', file=sys.stdout):
        ids = batch['ids'].to(device)
        length = batch['length']
        label = batch['label'].to(device)
        prediction = model(ids, length)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())
        scheduler.step()

    return epoch_losses, epoch_accs

def evaluate(dataloader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []

    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            ids = batch['ids'].to(device)
            length = batch['length']
            label = batch['label'].to(device)
            prediction = model(ids, length)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())

    return epoch_losses, epoch_accs

def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

def predict_sentiment(text, model, vocab, device):
    tokens = tokenize(vocab, text)
    ids = [vocab[t] if t in vocab else UNK_INDEX for t in tokens]
    length = torch.LongTensor([len(ids)])
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    prediction = model(tensor, length).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    return predicted_class, predicted_probability

### Lab 1 (g) Implement GRU.

In [17]:
class GRU(nn.Module):
    def __init__(
        self, 
        vocab_size: int, 
        embedding_dim: int, 
        hidden_dim: int, 
        output_dim: int, 
        n_layers: int, 
        dropout_rate: float, 
        pad_index: int,
        bidirectional: bool,
        **kwargs):
        """
        Create a LSTM model for classification.
        :param vocab_size: size of the vocabulary
        :param embedding_dim: dimension of embeddings
        :param hidden_dim: dimension of hidden features
        :param output_dim: dimension of the output layer which equals to the number of labels.
        :param n_layers: number of layers.
        :param dropout_rate: dropout rate.
        :param pad_index: index of the padding token.we
        """
        super().__init__()
        # Add your code here. Initializing each layer by the given arguments.
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_index)
        self.gru = nn.GRU(embedding_dim, hidden_dim,
                            num_layers = n_layers, dropout = dropout_rate, bidirectional = bidirectional)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout()

        # Weight Initialization. DO NOT CHANGE!
        if "weight_init_fn" not in kwargs:
            self.apply(init_weights)
        else:
            self.apply(kwargs["weight_init_fn"])


    def forward(self, ids:torch.Tensor, length:torch.Tensor):
        """
        Feed the given token ids to the model.
        :param ids: [batch size, seq len] batch of token ids.
        :param length: [batch size] batch of length of the token ids.
        :return: prediction of size [batch size, output dim].
        """
        # Add your code here.
        embeds = self.word_embedding(ids)
        embed_padding = nn.utils.rnn.pack_padded_sequence(embeds, length, batch_first = True, enforce_sorted = False)
        out, h = self.gru(embed_padding)
        gru_out = h[-1]
        
        out = self.dropout(gru_out)
        out = self.fc(out)
        # out = self.sigmoid(out)
        
        # out = out.view(out.size(0), -1)
        # out = out[:,-1]
        
        prediction = out
        
        return prediction

### Learning rate warmup. DO NOT TOUCH!

In [13]:
class ConstantWithWarmup(torch.optim.lr_scheduler._LRScheduler):
    def __init__(
        self,
        optimizer,
        num_warmup_steps: int,
    ):
        self.num_warmup_steps = num_warmup_steps
        super().__init__(optimizer)

    def get_lr(self):
        if self._step_count <= self.num_warmup_steps:
            # warmup
            scale = 1.0 - (self.num_warmup_steps - self._step_count) / self.num_warmup_steps
            lr = [base_lr * scale for base_lr in self.base_lrs]
            self.last_lr = lr
        else:
            lr = self.base_lrs
        return lr

### Implement the training / validation iteration here.

In [14]:
def train_and_test_model_with_hparams(hparams, model_type="lstm", **kwargs):
    # Seeding. DO NOT TOUCH! DO NOT TOUCH hparams.SEED!
    # Set the random seeds.
    CHECKPOINT_FOLDER = "./saved_model"
    torch.manual_seed(hparams.SEED)
    random.seed(hparams.SEED)
    np.random.seed(hparams.SEED)

    x_train, x_valid, x_test, y_train, y_valid, y_test = load_imdb()
    vocab = build_vocab(x_train, hparams=hparams)
    vocab_size = len(vocab)
    print(f'Length of vocabulary is {vocab_size}')

    train_data = IMDB(x_train, y_train, vocab, hparams.MAX_LENGTH)
    valid_data = IMDB(x_valid, y_valid, vocab, hparams.MAX_LENGTH)
    test_data = IMDB(x_test, y_test, vocab, hparams.MAX_LENGTH)

    collate = functools.partial(collate_fn, pad_index=hparams.PAD_INDEX)

    train_dataloader = torch.utils.data.DataLoader(
        train_data, batch_size=hparams.BATCH_SIZE, collate_fn=collate, shuffle=True)
    valid_dataloader = torch.utils.data.DataLoader(
        valid_data, batch_size=hparams.BATCH_SIZE, collate_fn=collate)
    test_dataloader = torch.utils.data.DataLoader(
        test_data, batch_size=hparams.BATCH_SIZE, collate_fn=collate)
    
    # Model
    if "override_models_with_gru" in kwargs and kwargs["override_models_with_gru"]:
        model = GRU(
            vocab_size, 
            hparams.EMBEDDING_DIM, 
            hparams.HIDDEN_DIM, 
            hparams.OUTPUT_DIM,
            hparams.N_LAYERS,
            hparams.DROPOUT_RATE, 
            hparams.PAD_INDEX,
            hparams.BIDIRECTIONAL,
            **kwargs)
    else:
        model = LSTM(
            vocab_size, 
            hparams.EMBEDDING_DIM, 
            hparams.HIDDEN_DIM, 
            hparams.OUTPUT_DIM,
            hparams.N_LAYERS,
            hparams.DROPOUT_RATE, 
            hparams.PAD_INDEX,
            hparams.BIDIRECTIONAL,
            **kwargs)
    num_params = count_parameters(model)
    print(f'The model has {num_params:,} trainable parameters')


    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Optimization. Lab 2 (a)(b) should choose one of them.
    # DO NOT TOUCH optimizer-specific hyperparameters! (e.g., eps, momentum)
    # DO NOT change optimizer implementations!
    if hparams.OPTIM == "sgd":
        optimizer = optim.SGD(
            model.parameters(), lr=hparams.LR, weight_decay=hparams.WD, momentum=.9)        
    elif hparams.OPTIM == "adagrad":
        optimizer = optim.Adagrad(
            model.parameters(), lr=hparams.LR, weight_decay=hparams.WD, eps=1e-6)
    elif hparams.OPTIM == "adam":
        optimizer = optim.Adam(
            model.parameters(), lr=hparams.LR, weight_decay=hparams.WD, eps=1e-6)
    elif hparams.OPTIM == "rmsprop":
        optimizer = optim.RMSprop(
            model.parameters(), lr=hparams.LR, weight_decay=hparams.WD, eps=1e-6, momentum=.9)
    else:
        raise NotImplementedError("Optimizer not implemented!")

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    # Start training
    best_valid_loss = float('inf')
    train_losses = []
    train_accs = []
    valid_losses = []
    valid_accs = [] 

    # Warmup Scheduler. DO NOT TOUCH!
    WARMUP_STEPS = 200
    lr_scheduler = ConstantWithWarmup(optimizer, WARMUP_STEPS)

    for epoch in range(hparams.N_EPOCHS):
        
        # Your code: implement the training process and save the best model.
        # epoch_losses, epoch_accs = train(train_dataloader, model, criterion, optimizer, lr_scheduler, device)
        # temp_loss, temp_acc = train(train_dataloader, model, criterion, optimizer, lr_scheduler, device)
        # train_losses.append(temp_loss)
        # train_accs.append(temp_acc)
        # print(train_losses, train_accs)
        # temp_loss, temp_acc = evaluate(valid_dataloader, model, criterion, device)
        # valid_losses.append(temp_loss)
        # valid_accs.append(temp_acc)
        train_losses, train_accs = train(train_dataloader, model, criterion, optimizer, lr_scheduler, device)
        valid_losses, valid_accs = evaluate(valid_dataloader, model, criterion, device)
        
        epoch_train_loss = np.mean(train_losses)
        epoch_train_acc = np.mean(train_accs)
        epoch_valid_loss = np.mean(valid_losses)
        epoch_valid_acc = np.mean(valid_accs)

        # Save the model that achieves the smallest validation loss.
        if epoch_valid_loss < best_valid_loss:
            # Your code: save the best model somewhere (no need to submit it to Sakai)
          best_valid_loss = epoch_valid_loss
          if not os.path.exists(CHECKPOINT_FOLDER):
            print("making folder")
            os.makedirs(CHECKPOINT_FOLDER)
          print("Saving ...")
          state = {'state_dict': model.state_dict(),
                  'epoch': epoch,
                  'lr': hparams.LR}
          torch.save(state, os.path.join(CHECKPOINT_FOLDER, f'{model_type}.pth'))
            # pass

        print(f'epoch: {epoch+1}')
        print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
        print(f'valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}')


    # Your Code: Load the best model's weights.
    load_model = os.path.join(CHECKPOINT_FOLDER, f'{model_type}.pth')
    state_dict = torch.load(load_model) # change the path to your own checkpoint file
    model.load_state_dict(state_dict['state_dict'])
    model.cuda()
    # model = 

    # Your Code: evaluate test loss on testing dataset (NOT Validation)

    test_loss, test_acc = [], []
    total, correct = 0, 0
    # with torch.no_grad():
    #   for batch_idx, (input, target) in enumerate(test_dataloader):
    #     input = input.type(torch.cuda.FloatTensor)
    #     output = model(input)
    #     output = output.to(device)

    #     loss = criterion(output, target.to(device))
    #     test_losses += loss
        
    #     prediction = torch.argmax(output.data, axis = 1)

    with torch.no_grad():
        for batch in tqdm.tqdm(test_dataloader, desc='testing...', file=sys.stdout):
            ids = batch['ids'].to(device)
            length = batch['length']
            label = batch['label'].to(device)
            prediction = model(ids, length)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            test_loss.append(loss.item())
            test_acc.append(accuracy.item())


    # get_accuracy(prediction, label):

    # test_loss, test_acc = test_losses, get_accuracy(prediction, target)

    epoch_test_loss = np.mean(test_loss)
    epoch_test_acc = np.mean(test_acc)
    print(f'test_loss: {epoch_test_loss:.3f}, test_acc: {epoch_test_acc:.3f}')
    
    # Free memory for later usage.
    del model
    torch.cuda.empty_cache()
    return {
        'num_params': num_params,
        "test_loss": epoch_test_loss,
        "test_acc": epoch_test_acc,
    }

### Lab 1 (f): Train model with original hyperparameters, for LSTM.

Train the model with default hyperparameter settings.

In [15]:
org_hyperparams = HyperParams()
_ = train_and_test_model_with_hparams(org_hyperparams, "lstm_1layer_base_sgd_e32_h100")

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60794
The model has 102,196 trainable parameters
training...: 100%|██████████| 365/365 [00:08<00:00, 43.10it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 88.88it/s]
making folder
Saving ...
epoch: 1
train_loss: 0.693, train_acc: 0.498
valid_loss: 0.693, valid_acc: 0.497
training...: 100%|██████████| 365/365 [00:07<00:00, 51.94it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 89.54it/s]
Saving ...
epoch: 2
train_loss: 0.693, train_acc: 0.498
valid_loss: 0.693, valid_acc: 0.520
training...: 100%|██████████| 365/365 [00:07<00:00, 51.50it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.90it/s]
Saving ...
epoch: 3
train_loss: 0.694, train_acc: 0.498
valid_loss: 0.693, valid_acc: 0.503
training...: 100%|██████████| 365/365 [00:07<00:00, 51.80it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 91.25it/s]
epoch: 4
train_loss: 0.694, train_acc: 

### Lab 1 (h) Train GRU with vanilla hyperparameters.

In [18]:
org_hyperparams = HyperParams()
_ = train_and_test_model_with_hparams(org_hyperparams, "gru_1layer_base_sgd_e32_h100", override_models_with_gru=True)

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60794
The model has 91,896 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 51.76it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 85.95it/s]
Saving ...
epoch: 1
train_loss: 0.693, train_acc: 0.500
valid_loss: 0.693, valid_acc: 0.503
training...: 100%|██████████| 365/365 [00:07<00:00, 52.07it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 88.93it/s]
epoch: 2
train_loss: 0.694, train_acc: 0.499
valid_loss: 0.693, valid_acc: 0.503
training...: 100%|██████████| 365/365 [00:06<00:00, 52.94it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 90.97it/s]
Saving ...
epoch: 3
train_loss: 0.694, train_acc: 0.503
valid_loss: 0.693, valid_acc: 0.502
training...: 100%|██████████| 365/365 [00:06<00:00, 52.92it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.29it/s]
epoch: 4
train_loss: 0.694, train_acc: 0.500
valid_loss: 0.693, v

### Lab 2 (a) Study of LSTM Optimizers. Hint: For adaptive optimizers, we recommend using a learning rate of 0.001 (instead of 0.01).

In [26]:
h = HyperParams()
h.LR = 0.001
h_list = ["sgd", "adagrad", "adam", "rmsprop"]
test_acc = []
for i in h_list:
  print("training with optimizer: " + i)
  print("\n---------------------------------------\n")
  h.OPTIM = i
  name = "lstm_1layer_base_" + i + "_e32_h100"
  _ = train_and_test_model_with_hparams(h, name)
  test_acc.append(_.get("test_acc"))
print(test_acc)

training with optimizer: sgd

---------------------------------------

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60794
The model has 102,196 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 47.97it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 81.45it/s]
Saving ...
epoch: 1
train_loss: 0.693, train_acc: 0.499
valid_loss: 0.693, valid_acc: 0.497
training...: 100%|██████████| 365/365 [00:07<00:00, 49.63it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 86.15it/s]
Saving ...
epoch: 2
train_loss: 0.693, train_acc: 0.498
valid_loss: 0.693, valid_acc: 0.503
training...: 100%|██████████| 365/365 [00:07<00:00, 48.97it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 85.98it/s]
epoch: 3
train_loss: 0.693, train_acc: 0.500
valid_loss: 0.693, valid_acc: 0.497
training...: 100%|██████████| 365/365 [00:07<00:00, 50.10it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.1

### Lab 2 (b): Study of GRU Optimizers. Hint: For adaptive optimizers, we recommend using a learning rate of 0.001 (instead of 0.01).

In [27]:
h = HyperParams()
h.LR = 0.001
h_list = ["sgd", "adagrad", "adam", "rmsprop"]
test_acc = []
for i in h_list:
  print("training with optimizer: " + i)
  print("\n---------------------------------------\n")
  h.OPTIM = i
  name = "gru_1layer_base_" + i + "_e32_h100"
  _ = train_and_test_model_with_hparams(h, name, override_models_with_gru=True)
  test_acc.append(_.get("test_acc"))
print(test_acc)

training with optimizer: sgd

---------------------------------------

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60794
The model has 91,896 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 50.37it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 84.30it/s]
Saving ...
epoch: 1
train_loss: 0.693, train_acc: 0.501
valid_loss: 0.693, valid_acc: 0.503
training...: 100%|██████████| 365/365 [00:07<00:00, 51.81it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 85.73it/s]
epoch: 2
train_loss: 0.693, train_acc: 0.499
valid_loss: 0.693, valid_acc: 0.503
training...: 100%|██████████| 365/365 [00:07<00:00, 51.38it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 84.18it/s]
epoch: 3
train_loss: 0.693, train_acc: 0.499
valid_loss: 0.693, valid_acc: 0.503
training...: 100%|██████████| 365/365 [00:07<00:00, 50.91it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.07it/s]
epoch

### Lab 2 (c) Deeper LSTMs

In [31]:
# N_LAYERS
h = HyperParams()
h.LR = 0.001
h.OPTIM = "adam"
h_list = [1, 2, 3, 4]
test_acc = []
for i in h_list:
  print("training with number of layers: " + str(i))
  print("\n---------------------------------------\n")
  h.N_LAYERS = i
  name = "lstm_1layer_base_" + str(i) + "_e32_h100"
  _ = train_and_test_model_with_hparams(h, name)
  test_acc.append(_.get("test_acc"))
print(test_acc)

training with number of layers: 1

---------------------------------------

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60794
The model has 102,196 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 48.72it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 85.10it/s]
Saving ...
epoch: 1
train_loss: 0.697, train_acc: 0.558
valid_loss: 0.654, valid_acc: 0.742
training...: 100%|██████████| 365/365 [00:07<00:00, 49.15it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.96it/s]
Saving ...
epoch: 2
train_loss: 0.492, train_acc: 0.772
valid_loss: 0.305, valid_acc: 0.873
training...: 100%|██████████| 365/365 [00:07<00:00, 49.41it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 88.93it/s]
Saving ...
epoch: 3
train_loss: 0.215, train_acc: 0.919
valid_loss: 0.287, valid_acc: 0.885
training...: 100%|██████████| 365/365 [00:07<00:00, 49.61it/s]
evaluating...: 100%|██████████| 53/53 [0

### Lab 2 (d) Wider LSTMs

In [32]:
# Hidden_dim
h = HyperParams()
h.LR = 0.001
h.N_LAYERS = 2
h.OPTIM = "adam"
h_list = [100, 150, 200, 250, 300]
test_acc = []
for i in h_list:
  print("training with hidden_dimension: " + str(i))
  print("\n---------------------------------------\n")
  h.HIDDEN_DIM = i
  name = "lstm_1layer_base_" + str(i) + "_e32_h100"
  _ = train_and_test_model_with_hparams(h, name)
  test_acc.append(_.get("test_acc"))
print(test_acc)

training with hidden_dimension: 100

---------------------------------------

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60794
The model has 182,996 trainable parameters
training...: 100%|██████████| 365/365 [00:10<00:00, 33.73it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 71.64it/s]
Saving ...
epoch: 1
train_loss: 0.678, train_acc: 0.564
valid_loss: 0.541, valid_acc: 0.789
training...: 100%|██████████| 365/365 [00:10<00:00, 34.89it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 71.53it/s]
Saving ...
epoch: 2
train_loss: 0.317, train_acc: 0.869
valid_loss: 0.288, valid_acc: 0.891
training...: 100%|██████████| 365/365 [00:10<00:00, 35.20it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 72.31it/s]
epoch: 3
train_loss: 0.149, train_acc: 0.947
valid_loss: 0.299, valid_acc: 0.888
training...: 100%|██████████| 365/365 [00:10<00:00, 34.42it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:0

### Lab 2 (e) Larger Embedding Table

In [34]:
# EMBEDDING_DIM
h = HyperParams()
h.LR = 0.001
h.N_LAYERS = 2
h.OPTIM = "adam"
h.HIDDEN_DIM = 200
h_list = [1, 16, 64, 128, 256]
test_acc = []
for i in h_list:
  print("training with embedding_dim: " + str(i))
  print("\n---------------------------------------\n")
  h.EMBEDDING_DIM = i
  name = "lstm_1layer_base_" + str(i) + "_e32_h100"
  _ = train_and_test_model_with_hparams(h, name)
  test_acc.append(_.get("test_acc"))
print(test_acc) 

training with embedding_dim: 1

---------------------------------------

shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60794
The model has 545,196 trainable parameters
training...: 100%|██████████| 365/365 [00:12<00:00, 28.45it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 61.96it/s]
Saving ...
epoch: 1
train_loss: 0.630, train_acc: 0.619
valid_loss: 0.640, valid_acc: 0.719
training...: 100%|██████████| 365/365 [00:12<00:00, 28.43it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 60.94it/s]
Saving ...
epoch: 2
train_loss: 0.270, train_acc: 0.893
valid_loss: 0.284, valid_acc: 0.888
training...: 100%|██████████| 365/365 [00:12<00:00, 28.53it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 62.51it/s]
epoch: 3
train_loss: 0.139, train_acc: 0.952
valid_loss: 0.316, valid_acc: 0.891
training...: 100%|██████████| 365/365 [00:12<00:00, 28.63it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 63

### Lab 2(f) Compound scaling of embedding_dim, hidden_dim, layers

In [37]:
from nltk.metrics.distance import jaro_similarity
h = HyperParams()
h.LR = 0.001
h.OPTIM = "adam"
h.N_LAYERS = 1
hidden_dim_list = [100, 150, 200]
embed_dim_list = [1, 16, 64]
test_acc = []
res = 0
for k in hidden_dim_list:
  for j in embed_dim_list:
    h.HIDDEN_DIM = k
    h.EMBEDDING_DIM = j
    print("\n---------------------------------------")
    print("training with hidden_dim: " + str(k) + ", embed_dim: " + str(j))

    name = "lstm_1layer_base_" + str(k) + "_" + str(j) + "_e32_h100"
    _ = train_and_test_model_with_hparams(h, name)
    test_acc.append(_.get("test_acc"))
    max_acc = max(test_acc)
    if res != max_acc:
      res = max_acc
      param = {"layer": i, "hidden" : k, "embed": j}
print(max_acc)
print(param)


---------------------------------------
training with hidden_dim: 100, embed_dim: 1
shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60794
The model has 102,196 trainable parameters
training...: 100%|██████████| 365/365 [00:07<00:00, 46.32it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 82.84it/s]
Saving ...
epoch: 1
train_loss: 0.697, train_acc: 0.558
valid_loss: 0.654, valid_acc: 0.742
training...: 100%|██████████| 365/365 [00:08<00:00, 44.38it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 84.43it/s]
Saving ...
epoch: 2
train_loss: 0.492, train_acc: 0.772
valid_loss: 0.305, valid_acc: 0.873
training...: 100%|██████████| 365/365 [00:07<00:00, 48.77it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 87.59it/s]
Saving ...
epoch: 3
train_loss: 0.215, train_acc: 0.919
valid_loss: 0.287, valid_acc: 0.885
training...: 100%|██████████| 365/365 [00:08<00:00, 41.00it/s]
evaluating...: 100%|██████████|

### Lab 2 (g) Bi-Directional LSTM, using best architecture from (f)

In [38]:
h = HyperParams()
h.LR = 0.001
h.OPTIM = "adam"
h.N_LAYERS = 1
h.HIDDEN_DIM = 150
h.EMBEDDING_DIM = 1
h.BIDIRECTIONAL = True
name = "lstm_1layer_base_" + str(k) + "_" + str(j) + "_e32_h100"
_ = train_and_test_model_with_hparams(h, name)


shape of train data is (35000,)
shape of test data is (10000,)
shape of valid data is (5000,)
Length of vocabulary is 60794
The model has 244,696 trainable parameters
training...: 100%|██████████| 365/365 [00:11<00:00, 32.33it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 63.42it/s]
Saving ...
epoch: 1
train_loss: 0.706, train_acc: 0.526
valid_loss: 0.680, valid_acc: 0.615
training...: 100%|██████████| 365/365 [00:11<00:00, 31.88it/s]
evaluating...: 100%|██████████| 53/53 [00:00<00:00, 60.02it/s]
Saving ...
epoch: 2
train_loss: 0.550, train_acc: 0.745
valid_loss: 0.385, valid_acc: 0.850
training...: 100%|██████████| 365/365 [00:11<00:00, 32.22it/s]
evaluating...: 100%|██████████| 53/53 [00:01<00:00, 38.58it/s]
Saving ...
epoch: 3
train_loss: 0.280, train_acc: 0.893
valid_loss: 0.339, valid_acc: 0.860
training...: 100%|██████████| 365/365 [00:12<00:00, 29.92it/s]
evaluating...: 100%|██████████| 53/53 [00:01<00:00, 41.47it/s]
Saving ...
epoch: 4
train_loss: 0.165, train_acc: 0.9