In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Sentiment Classification Using RNNs

* Given the IMDB Movie Review Dataset, create an RNN model that predicts whether the given review is negative or positive.
* You need to create your Dataset, Dataloader and Model. Keep your code modular and avoid hardcoding any parameter. This will allow you to experiment more easily.
* Plot graphs for loss and accuracy for each epoch of a training loop. Try using wandb for logging training and validation losses, accuracies (especially for hyperparameter tuning).
* Use tqdm to keep track of the status of the training loop for an epoch.

### 1. RNN Model
#### 1.1 Build a Dataset from the IMDB Movie Review Dataset by taking reviews with word count from 100 to 500. Perform text processing on the movie reviews and create a word to index mapping for representing any review as a list of numbers.


In [None]:
# !pip install datasets torchmetrics

In [None]:
# import numpy as np
# import matplotlib.pyplot as plt

# from datasets import load_dataset
# import re
# from nltk.tokenize import RegexpTokenizer
# from nltk.stem import WordNetLemmatizer

# import torch
# from torch import nn
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
# from torch.nn.utils.rnn import pad_sequence
# from torchmetrics import Accuracy

# from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk, subprocess
import random


from sklearn.datasets import load_files

# Load the IMDB dataset
!pip install datasets
from datasets import load_dataset
# load the IMDB review dataset. You can take the dataset from Huggingface
imdb_dataset = load_dataset("imdb")




Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
SEED = 1234

# Seed every random number generator the notebook draws from so that a
# Restart & Run All reproduces the same split, shuffles and initial weights.
random.seed(SEED)  # Python's `random` is what samples the validation indices
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic=True

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [None]:
# Keep only the reviews that contain between 100 and 500 words.
# The train split is used in full for training; the filtered test split is
# divided at random into 20% validation and 80% test.
# NOTE: the validation sample is not stratified by label.
def filter_reviews_by_word_count(dataset, min_word_count=100, max_word_count=500):
    """Keep only the examples whose review length lies inside a word-count window.

    Words are counted by splitting ``example["text"]`` on whitespace.

    Args:
        dataset: Iterable of examples that each expose a ``"text"`` entry.
        min_word_count: Smallest accepted number of words (inclusive).
        max_word_count: Largest accepted number of words (inclusive).

    Returns:
        A list holding the accepted examples in their original order.
    """
    return [
        sample
        for sample in dataset
        if min_word_count <= len(sample["text"].split()) <= max_word_count
    ]

# Filter reviews by word count
train_data = filter_reviews_by_word_count(imdb_dataset["train"])
test_filtered = filter_reviews_by_word_count(imdb_dataset["test"])

# Determine the size of the validation set (20% of the filtered test reviews)
val_size = int(0.2 * len(test_filtered))
test_size = len(test_filtered) - val_size

# Randomly select reviews for validation
val_indices = random.sample(range(len(test_filtered)), val_size)
val_data = [test_filtered[i] for i in val_indices]

# Assign the remaining reviews to test_data. Membership is tested against a
# set: looking each index up in the `val_indices` list would rescan thousands
# of entries for every one of the filtered test reviews.
val_index_set = set(val_indices)
test_data = [review for i, review in enumerate(test_filtered) if i not in val_index_set]

# Print the sizes of val_data and test_data
print("Size of train_data:", len(train_data))
print("Size of val_data:", len(val_data))
print("Size of test_data:", len(test_data))

Size of train_data: 20056
Size of val_data: 4004
Size of test_data: 16020


In [None]:
# Shuffle the train and test reviews in place; val_data is left in the order
# in which it was sampled.
for split_to_shuffle in (train_data, test_data):
    np.random.shuffle(split_to_shuffle)

# Optionally, limit the size of the datasets
# test_data = test_data[:5000]      # Limiting to 5,000 samples for faster compute

print("Number of samples in train_data:", len(train_data))
print("Number of samples in test_data:", len(test_data))
print("Number of samples in val_data:", len(val_data))


# Function to find the longest sentence
def find_longest_sentence(dataset):
    """Find the review with the most whitespace-separated words.

    Args:
        dataset: Iterable of examples that each expose a ``"text"`` entry.

    Returns:
        A ``(text, word_count)`` tuple for the first example that reaches the
        maximum word count, or ``("", 0)`` when no example has any words.
    """
    best_text, best_count = "", 0
    for sample in dataset:
        candidate = sample["text"]
        candidate_count = len(candidate.split())
        # Strict comparison: on ties the earliest example wins.
        if candidate_count > best_count:
            best_text, best_count = candidate, candidate_count
    return best_text, best_count

# Find the longest review (by whitespace-separated word count) in each split
longest_train_sentence, train_max_length = find_longest_sentence(train_data)
longest_test_sentence, test_max_length = find_longest_sentence(test_data)
longest_val_sentence, val_max_length = find_longest_sentence(val_data)

# Report only the word count of each longest review; the review text itself
# stays in the longest_*_sentence variables and is not printed.
print("Longest sentence in train_data:")
print("Number of words:", train_max_length)

print("\nLongest sentence in test_data:")
print("Number of words:", test_max_length)

print("\nLongest sentence in val_data:")
print("Number of words:", val_max_length)


Number of samples in train_data: 20056
Number of samples in test_data: 16020
Number of samples in val_data: 4004
Longest sentence in train_data:
Number of words: 500

Longest sentence in test_data:
Number of words: 500

Longest sentence in val_data:
Number of words: 500


In [None]:
# def clean(text, tokenizer):
#   # Perform text preprocessing:
#   # 1. Removing numbers OR replace them with "num" token
#   # 2. Convert all characters to lowercase.
#   # 3. Tokenize the sentence into words
#   # You can use RegexpTokenizer from NLTK.

#   # You will experiment with stemming/lemmatization down the line
#   # so you can skip that for now

#   return text

In [None]:
# clean("This IS 1 example sentence", RegexpTokenizer(r'\w+'))

In [None]:
# Make sure the punkt tokenizer models needed by word_tokenize are available.
nltk.download('punkt')
try:
    nltk.data.find('punkt.zip')
except LookupError:
    # nltk.data.find signals a missing resource with LookupError; catching only
    # that keeps unrelated failures (e.g. KeyboardInterrupt) visible.
    nltk.download('punkt', download_dir='/kaggle/working/')
    # punkt is a tokenizer package, so its archive sits under tokenizers/ (the
    # corpora/ path cannot be opened). -n never overwrites already-extracted
    # files, which avoids unzip's interactive "replace?" prompt.
    subprocess.run(["unzip", "-n", "/usr/share/nltk_data/tokenizers/punkt.zip", "-d", "/usr/share/nltk_data/tokenizers/"])
    # Register the fallback download directory too, so the copy fetched above
    # can actually be found by NLTK.
    for extra_path in ('/kaggle/input', '/kaggle/working/'):
        if extra_path not in nltk.data.path:
            nltk.data.path.append(extra_path)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /kaggle/working/...
[nltk_data]   Unzipping tokenizers/punkt.zip.


unzip:  cannot find or open /usr/share/nltk_data/corpora/punkt.zip, /usr/share/nltk_data/corpora/punkt.zip.zip or /usr/share/nltk_data/corpora/punkt.zip.ZIP.


In [None]:
# Make sure the stopword lists used during tokenisation are available.
nltk.download('stopwords')
try:
    nltk.data.find('stopwords.zip')
except LookupError:
    # nltk.data.find signals a missing resource with LookupError; catching only
    # that keeps unrelated failures (e.g. KeyboardInterrupt) visible.
    nltk.download('stopwords', download_dir='/kaggle/working/')
    # -n never overwrites already-extracted files, which avoids unzip's
    # interactive "replace?" prompt when the corpus is already unpacked.
    subprocess.run(["unzip", "-n", "/usr/share/nltk_data/corpora/stopwords.zip", "-d", "/usr/share/nltk_data/corpora/"])
    # Register the fallback download directory too, so the copy fetched above
    # can actually be found by NLTK.
    for extra_path in ('/kaggle/input', '/kaggle/working/'):
        if extra_path not in nltk.data.path:
            nltk.data.path.append(extra_path)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /kaggle/working/...
[nltk_data]   Unzipping corpora/stopwords.zip.
Archive:  /usr/share/nltk_data/corpora/stopwords.zip


replace /usr/share/nltk_data/corpora/stopwords/dutch? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


In [None]:
# Make sure the WordNet data used by WordNetLemmatizer is available.
nltk.download('wordnet')
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # nltk.data.find signals a missing resource with LookupError; catching only
    # that keeps unrelated failures (e.g. KeyboardInterrupt) visible.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    # Unpack the system copy of the corpus; -n never overwrites files that
    # were already extracted, so re-running the cell cannot block on a prompt.
    subprocess.run(["unzip", "-n", "/usr/share/nltk_data/corpora/wordnet.zip", "-d", "/usr/share/nltk_data/corpora/"])
    # Register the fallback download directory too, so the copy fetched above
    # can actually be found by NLTK.
    for extra_path in ('/kaggle/input', '/kaggle/working/'):
        if extra_path not in nltk.data.path:
            nltk.data.path.append(extra_path)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /kaggle/working/...
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/

In [None]:
# # Text preprocessing
# def preprocess_text(text):
#     text = text.lower()  # Convert to lowercase
#     text = re.sub(r'\W', ' ', text)  # Remove non-word characters
#     text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
#     return text

# # Tokenization and lemmatization
# lemmatizer = WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))

# def tokenize_and_lemmatize(text):
#     tokens = word_tokenize(text)
#     tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
#     return tokens

In [None]:
# # create a word to index dictionary so that each word in the training set
# # has a number associated with it. This allows to represent each sentence
# # as a series of numbers. Start the index with 1 instead of 0. The number
# # 0 will be used to denote padding, so that each sentence can have the
# # same length.
# # Keep track of the index since it will be used for representing new words
# # that were not part of the training vocabulary.
# # Also, make sure to not create dictionary on sentences with word count
# # not within the range

# def get_word2idx(corpus):
#   idx = 1
#   for sentence in tqdm(corpus, total=len(corpus), desc="Creating word2idx"):
#     # process sentence
#     sentence = clean(sentence, tokenizer)

#     # drop sentences greater than maxlen or less than minlen

#     # for each word in sentence, check for entry in word2idx

#   return idx, word2idx

In [None]:
# Text preprocessing
def preprocess_text(text):
    """Normalise raw review text before it is tokenised.

    The text is lower-cased, every non-word character is turned into a space,
    whitespace runs are collapsed to single spaces and the ends are trimmed.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    words_only = re.sub(r'\W', ' ', lowered)
    return re.sub(r'\s+', ' ', words_only).strip()

def tokenize_and_lemmatize(text):
    """Split text into tokens, drop stop words and lemmatize what remains.

    Relies on the notebook-level ``lemmatizer`` and ``stop_words`` objects.

    Args:
        text: String to tokenise with ``word_tokenize``.

    Returns:
        List of lemmatized tokens in their original order.
    """
    lemmas = []
    for word in word_tokenize(text):
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Build vocabulary
def build_vocab(data):
    """Build token <-> index lookup tables from a collection of reviews.

    Index 0 is reserved for a ``'<PAD>'`` entry because the dataset and the
    collate function pad sequences with 0; without the reservation the first
    token encountered would share its index with the padding positions.
    Real tokens therefore start at index 1, in order of first appearance.
    ``'<PAD>'`` cannot clash with a real token, since preprocessing lower-cases
    the text and strips non-word characters.

    Args:
        data: Iterable of examples that each expose a ``'text'`` entry.

    Returns:
        ``(word_to_idx, idx_to_word)``: a dict mapping token -> index and its
        inverse, both including the ``'<PAD>'`` entry at index 0.
    """
    word_to_idx = {'<PAD>': 0}
    idx_to_word = {0: '<PAD>'}
    for review in data:
        for token in tokenize_and_lemmatize(preprocess_text(review['text'])):
            if token not in word_to_idx:
                # The next free index equals the current table size.
                idx = len(word_to_idx)
                word_to_idx[token] = idx
                idx_to_word[idx] = token
    return word_to_idx, idx_to_word

# Shared NLTK helpers read by tokenize_and_lemmatize
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# The vocabulary is built from the training reviews only
word_to_idx_train, _ = build_vocab(train_data)

# UNK takes the first index past the largest one already assigned
# (index 0 when the vocabulary is empty)
max_idx = max(word_to_idx_train.values(), default=-1)
unk_idx = max_idx + 1
word_to_idx_train['UNK'] = unk_idx

#### 1.2 Create Dataloaders for the train, test and validation datasets with appropriate batch sizes.


In [None]:
# # Build a Dataset object to store each sentence as a tensor of numbers
# # along with the label. Make sure to add padding so that the tensor
# # for each sentence is of the same length. This will allow us to train
# # the model in batches.

# class IMDBDataset(Dataset):
#   def __init__(self, dataset, split : str, minlen : int = 100, maxlen : int = 500):
#     self.count = 0 # total sentences you finally pick

#     # count total number of lines
#     len = len(dataset[split])

#     input_data = []
#     target_data = []

#     for idx, sentence in tqdm(enumerate(corpus), total=len, desc=f"Transforming input text [{split}]"):
#       # process sentence

#       # drop sentences greater than maxlen or less than minlen

#       # replace words with their index


#       self.count += 1

#     # pad the sentences upto maxlen
#     self.inputs = pad_sequence(input_data, batch_first = True)
#     self.targets = torch.tensor(target_data)

#   def __len__(self) -> int:
#     return self.count

#   def __getitem__(self, index : int):
#     return self.inputs[index], self.targets[index]

In [None]:
# # create the train dataset using the word2idx dictionary built using the train set
# train_ds = IMDBDataset(imdb_dataset, "train",minlen = 100, maxlen = 500)
# # create the validation and test dataset using the word2idx dictionary built using the train set



In [None]:
# len(train_ds), len(val_ds), len(test_ds)

In [None]:
# # create dataloaders using the dataset
# params = {
#     'batch_size':32,
#     'shuffle': True,
#     'num_workers': 2
# }

# train_dataloader = DataLoader(train_ds, **params)
# test_dataloader = DataLoader(val_ds, **params)
# test_dataloader = DataLoader(test_ds, **params)

In [None]:
class IMDBDataset(Dataset):
    """Map-style dataset turning IMDB reviews into fixed-length index sequences.

    Each item is tokenised on access, mapped to vocabulary indices, truncated
    to ``max_length`` and right-padded with 0 up to ``max_length``.

    Args:
        data: Indexable collection of examples with ``'text'`` and ``'label'`` entries.
        word_to_idx: Mapping from token to integer index.
        max_length: Length every returned sequence is trimmed/padded to.
    """

    def __init__(self, data, word_to_idx, max_length=500):
        self.data = data
        self.word_to_idx = word_to_idx
        self.max_length = max_length
        # Resolve the out-of-vocabulary index from the mapping this dataset was
        # given instead of a notebook global, so the dataset stays correct
        # whichever vocabulary it is built with.
        if 'UNK' in word_to_idx:
            self.unk_idx = word_to_idx['UNK']
        else:
            # No explicit UNK entry: use the first index past the largest one.
            self.unk_idx = max(word_to_idx.values(), default=-1) + 1

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_indices, label)`` tensors for the example at ``index``."""
        sample = self.data[index]
        label = int(sample['label'])  # Convert label to integer
        tokens = tokenize_and_lemmatize(preprocess_text(sample['text']))
        # Tokens missing from the vocabulary fall back to the UNK index.
        indexed_tokens = [self.word_to_idx.get(token, self.unk_idx) for token in tokens]
        indexed_tokens = indexed_tokens[:self.max_length]  # Trim to max length
        padded_tokens = indexed_tokens + [0] * (self.max_length - len(indexed_tokens))  # Pad sequence
        return torch.tensor(padded_tokens), torch.tensor(label)


In [None]:
# Create DataLoader
def collate_fn(batch):
    """Merge a list of ``(sequence, label)`` samples into batch tensors.

    Args:
        batch: List of samples; element 0 is a 1-D index tensor and element 1
            the label.

    Returns:
        ``(padded_inputs, labels)``: the sequences right-padded with 0 to the
        longest one in the batch (batch first), and a tensor of the labels.
    """
    sequences, targets = [], []
    for sample in batch:
        sequences.append(sample[0])
        targets.append(sample[1])
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(targets)


In [None]:
# Function to create dataset and data loader
def create_dataset_and_loader(data, word_to_idx, batch_size, shuffle):
    """Wrap reviews in an IMDBDataset and build a DataLoader over it.

    Args:
        data: Examples forwarded to ``IMDBDataset``.
        word_to_idx: Token -> index mapping forwarded to ``IMDBDataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the data every epoch.

    Returns:
        ``(dataset, loader)``.
    """
    dataset = IMDBDataset(data, word_to_idx)
    loader_options = {"batch_size": batch_size, "shuffle": shuffle, "collate_fn": collate_fn}
    return dataset, DataLoader(dataset, **loader_options)

In [None]:
# Number of reviews per batch for all three loaders
batch_size = 32

# All three splits are encoded with the vocabulary built from the training
# data; only the training loader reshuffles between epochs.
train_dataset, train_loader = create_dataset_and_loader(train_data, word_to_idx_train, batch_size, shuffle=True)
val_dataset, val_loader = create_dataset_and_loader(val_data, word_to_idx_train, batch_size, shuffle=False)
test_dataset, test_loader = create_dataset_and_loader(test_data, word_to_idx_train, batch_size, shuffle=False)

# Print the lengths of datasets
print("Length of train_dataset:", len(train_dataset))
print("Length of val_dataset:", len(val_dataset))
print("Length of test_dataset:", len(test_dataset))

Length of train_dataset: 20056
Length of val_dataset: 4004
Length of test_dataset: 16020


#### 1.3 Create the Model class for the RNN Model. Create functions for running model training and testing.

In [None]:
# # create a model
# class RNNModel(nn.Module):
#   def __init__(self, vocab_size, hidden_size, embedding_dim, num_classes):
#     # call the init method of the parent

#     # define the layers


#   def forward(self, X):

#     # run foward pass through the model

#     return logits

In [None]:
# # Hyperparameters
# hidden_size = 256
# embedding_dim = 128
# learning_rate = 1e-3
# epochs = 5

# # create the model
# model = RNNModel(vocab_size, hidden_size, embedding_dim, num_classes).to(device)

# # create optimizer

# print(model)

In [None]:
# # Create a model training loop
# def train_model():

#   for epoch in range(epochs):
#     ## TRAINING STEP
#     model.train()
#     # train
#     for input_batch, output_batch in tqdm(trainloader, total = len(trainloader), desc = "Training"):

#     # Log metrics

#     ## VALIDATION STEP
#     model.eval()
#     # run validation
#     for input_batch, output_batch in tqdm(valloader, total = len(valloader), desc = "Validation"):

#     # Log metrics

#     # store best model

#   return train_losses, val_losses, val_accuracy

In [None]:
# Create a model testing loop


In [None]:
# # train the model
# train_losses, val_losses, val_accuracy = train_model()

In [None]:
# plot training and validation losses

In [None]:
# plot validation accuracy

In [None]:
# find the classification accuracy on test set


In [None]:
class RNN(nn.Module):
    """Embedding -> RNN -> linear head that classifies from the last time step.

    Args:
        input_size: Number of rows in the embedding table.
        embedding_size: Dimension of each token embedding.
        hidden_size: Size of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        output_size: Accepted but not used; the head always emits one logit.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.rnn = nn.RNN(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # A single output unit: one logit per review for binary classification
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return one logit per sequence, shape ``[batch_size]``."""
        step_outputs, _ = self.rnn(self.embedding(x))
        # Only the top layer's output at the final position feeds the head
        final_step = step_outputs[:, -1, :]
        return self.fc(final_step).squeeze(1)


In [None]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    """Train a binary classifier and print train/validation metrics per epoch.

    Args:
        model: Module that maps a batch of inputs to one logit per sample.
        train_loader: DataLoader yielding ``(inputs, labels)`` training batches.
        val_loader: DataLoader yielding ``(inputs, labels)`` validation batches.
        criterion: Loss applied to ``(logits, float labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the model and every batch are moved to.

    Returns:
        None. One summary line is printed per epoch.
    """
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.float().to(device)  # Loss expects float targets
            optimizer.zero_grad()
            outputs = model(inputs)
            labels = labels.view_as(outputs)  # Give the targets exactly the logits' shape
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # criterion returns a batch mean; weight it by the batch size so the
            # epoch figure is a per-sample average.
            train_loss += loss.item() * inputs.size(0)

        train_loss /= len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.float().to(device)
                outputs = model(inputs)
                # Labels must share the predictions' shape: comparing [B] with
                # [B, 1] broadcasts to a [B, B] matrix and inflates `correct`
                # far beyond the number of samples.
                labels = labels.view_as(outputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)

                # Apply sigmoid activation for BCE
                predicted = torch.round(torch.sigmoid(outputs))

                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_loss /= len(val_loader.dataset)
        val_accuracy = correct / total

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')

# Testing function
def test_model(model, test_loader, device):
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.float().unsqueeze(1).to(device)  # Convert labels to float and unsqueeze to add a dimension
            outputs = model(inputs)
            predicted = torch.round(torch.sigmoid(outputs))  # Apply sigmoid activation and round to get predictions
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    test_accuracy = correct / total
    print(f'Test Accuracy: {test_accuracy:.4f}')

In [None]:
# Hyperparameters
input_size = len(word_to_idx_train)  # number of embedding rows = vocabulary entries (UNK included)
embedding_size = 100
hidden_size = 128
num_layers = 1
output_size = 1  # For binary classification (the RNN class fixes its head to one logit regardless)
learning_rate = 0.005
num_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model, criterion, and optimizer
model2 = RNN(input_size, embedding_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with logits
optimizer = optim.Adam(model2.parameters(), lr=learning_rate)

In [None]:
# Train model2 for num_epochs epochs; train_model prints the per-epoch
# training loss, validation loss and validation accuracy.
train_model(model2, train_loader, val_loader, criterion, optimizer, num_epochs, device)

Epoch 1/5, Train Loss: 0.7029, Val Loss: 0.6929, Val Acc: 32.6783
Epoch 2/5, Train Loss: 0.6980, Val Loss: 0.6941, Val Acc: 32.6783
Epoch 3/5, Train Loss: 0.7010, Val Loss: 0.6954, Val Acc: 32.6783
Epoch 4/5, Train Loss: 0.6979, Val Loss: 0.7144, Val Acc: 32.6783
Epoch 5/5, Train Loss: 0.6997, Val Loss: 0.6933, Val Acc: 31.0699


In [None]:
# Report the accuracy of the trained model2 on the test loader
test_model(model2, test_loader, device)

Test Accuracy: 31.4519


In [None]:
class RNN(nn.Module):
    """Embedding -> RNN -> linear head that classifies from the mean RNN output.

    Args:
        input_size: Number of rows in the embedding table.
        embedding_size: Dimension of each token embedding.
        hidden_size: Size of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        output_size: Accepted but not used; the head always emits one logit.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.rnn = nn.RNN(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # A single output unit: one logit per review for binary classification
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return one logit per sequence, shape ``[batch_size]``."""
        step_outputs, _ = self.rnn(self.embedding(x))
        # Average the per-step outputs over the sequence axis (dim 1)
        pooled = step_outputs.mean(dim=1)
        return self.fc(pooled).squeeze(1)

In [None]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    """Train a binary classifier and print train/validation metrics per epoch.

    Args:
        model: Module that maps a batch of inputs to one logit per sample.
        train_loader: DataLoader yielding ``(inputs, labels)`` training batches.
        val_loader: DataLoader yielding ``(inputs, labels)`` validation batches.
        criterion: Loss applied to ``(logits, float labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the model and every batch are moved to.

    Returns:
        None. One summary line is printed per epoch.
    """
    model.to(device)
    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        running_train_loss = 0.0
        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.float().to(device)  # the loss needs float targets
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_labels)
            batch_loss.backward()
            optimizer.step()
            # Undo the batch mean so the epoch value is a per-sample average
            running_train_loss += batch_loss.item() * batch_inputs.size(0)
        train_loss = running_train_loss / len(train_loader.dataset)

        # ---- validation pass ----
        model.eval()
        running_val_loss = 0.0
        hits = 0
        seen = 0
        with torch.no_grad():
            for batch_inputs, batch_labels in val_loader:
                batch_inputs = batch_inputs.to(device)
                batch_labels = batch_labels.float().to(device)
                logits = model(batch_inputs)
                running_val_loss += criterion(logits, batch_labels).item() * batch_inputs.size(0)

                # Squash logits to probabilities, then round to a 0/1 prediction
                guesses = torch.round(torch.sigmoid(logits))
                seen += batch_labels.size(0)
                hits += (guesses == batch_labels).sum().item()
        val_loss = running_val_loss / len(val_loader.dataset)
        val_accuracy = hits / seen

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')


# Testing function
def test_model(model, test_loader, device):
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.float().to(device)  # Convert labels to float
            outputs = model(inputs)
            predicted = torch.round(torch.sigmoid(outputs))  # Apply sigmoid activation and round to get predictions
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    test_accuracy = correct / total
    print(f'Test Accuracy: {test_accuracy:.4f}')

In [None]:
# Hyperparameters
input_size = len(word_to_idx_train)  # number of embedding rows = vocabulary entries (UNK included)
embedding_size = 100
hidden_size = 128
num_layers = 1
output_size = 1  # For binary classification (the RNN class fixes its head to one logit regardless)
learning_rate = 0.005
num_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model, criterion, and optimizer
model = RNN(input_size, embedding_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with logits
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train `model` for num_epochs epochs; train_model prints the per-epoch
# training loss, validation loss and validation accuracy.
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device)

Epoch 1/5, Train Loss: 0.7034, Val Loss: 0.6913, Val Acc: 0.5122
Epoch 2/5, Train Loss: 0.6869, Val Loss: 0.6688, Val Acc: 0.5617
Epoch 3/5, Train Loss: 0.4728, Val Loss: 0.4973, Val Acc: 0.8204
Epoch 4/5, Train Loss: 0.3723, Val Loss: 0.4281, Val Acc: 0.8014
Epoch 5/5, Train Loss: 0.2626, Val Loss: 0.5176, Val Acc: 0.8319


In [None]:
# Report the accuracy of the trained `model` on the test loader
test_model(model, test_loader, device)

Test Accuracy: 0.8297


#### 1.4 Incorporate stemming/lemmatization when doing text preprocessing using the NLTK library. What changes do you observe in accuracy?

#### 1.5 In the Model class, experiment with picking only the last output versus the mean of all outputs of the RNN layer. What changes do you observe?

### 2. Hyperparameter Tuning
#### 2.1 Starting with the best configurations based on the above experiments, experiment with 5 different hyperparameter configurations. You can change the size of embedding layer, hidden state, batch in the dataloader.


#### 2.2 Evaluate the performance of the configurations on the validation sets using metrics like accuracy and loss. Analyze the results.

In [None]:
class RNN(nn.Module):
    """Mean-pooled RNN sentiment classifier used for the hyperparameter search.

    Args:
        input_size: Number of rows in the embedding table.
        embedding_size: Dimension of each token embedding.
        hidden_size: Size of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        output_size: Accepted but not used; the head always emits one logit.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.rnn = nn.RNN(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # A single output unit: one logit per review for binary classification
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return one logit per sequence, shape ``[batch_size]``."""
        token_vectors = self.embedding(x)
        step_outputs, _ = self.rnn(token_vectors)
        # Collapse the sequence axis (dim 1) by averaging the per-step outputs
        pooled = step_outputs.mean(dim=1)
        logits = self.fc(pooled)
        return logits.squeeze(1)

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs):
    """Train a binary classifier and print the training loss after each epoch.

    The device is chosen here (CUDA when available, otherwise CPU).
    ``val_loader`` is accepted but not read by this function.

    Args:
        model: Module that maps a batch of inputs to one logit per sample.
        train_loader: DataLoader yielding ``(inputs, labels)`` training batches.
        val_loader: Unused.
        criterion: Loss applied to ``(logits, float labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. One summary line is printed per epoch.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model.to(device)

    sample_count = len(train_loader.dataset)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.float().to(device)  # the loss needs float targets
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_labels)
            batch_loss.backward()
            optimizer.step()
            # Undo the batch mean so the epoch value is a per-sample average
            running_loss += batch_loss.item() * batch_inputs.size(0)

        train_loss = running_loss / sample_count

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}')


def evaluate_model(model, val_loader, criterion):
    """Compute the average loss and accuracy of a binary classifier on a loader.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable. A model
    without parameters is evaluated on the CPU.

    Args:
        model: Module that maps a batch of inputs to one logit per sample.
        val_loader: DataLoader yielding ``(inputs, labels)`` batches.
        criterion: Loss applied to ``(logits, float labels)``.

    Returns:
        ``(val_loss, val_accuracy)``: per-sample average loss and the fraction
        of correctly classified samples.
    """
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(eval_device)
            labels = labels.float().to(eval_device)  # the loss needs float targets
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # criterion returns a batch mean; weight by batch size so the final
            # value is a per-sample average.
            val_loss += loss.item() * inputs.size(0)
            predicted = torch.round(torch.sigmoid(outputs))  # Apply sigmoid activation and round to get predictions
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss /= len(val_loader.dataset)
    val_accuracy = correct / total

    return val_loss, val_accuracy


In [None]:
# def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
#     model.to(device)
#     for epoch in range(num_epochs):
#         model.train()
#         train_loss = 0.0
#         for inputs, labels in train_loader:
#             inputs, labels = inputs.to(device), labels.float().to(device)  # Convert labels to float
#             optimizer.zero_grad()
#             outputs = model(inputs)
#             loss = criterion(outputs, labels)
#             loss.backward()
#             optimizer.step()
#             train_loss += loss.item() * inputs.size(0)

#         train_loss /= len(train_loader.dataset)

#         # Validation
#         model.eval()
#         val_loss = 0.0
#         correct = 0
#         total = 0
#         with torch.no_grad():
#             for inputs, labels in val_loader:
#                 inputs, labels = inputs.to(device), labels.float().to(device)
#                 outputs = model(inputs)
#                 loss = criterion(outputs, labels)
#                 val_loss += loss.item() * inputs.size(0)

#                 # Apply sigmoid activation and round to get predictions
#                 predicted = torch.round(torch.sigmoid(outputs))

#                 total += labels.size(0)
#                 correct += (predicted == labels).sum().item()

#         val_loss /= len(val_loader.dataset)
#         val_accuracy = correct / total

#         print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')

In [None]:
# Testing function
def test_model(model, test_loader, device):
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.float().to(device)  # Move data to GPU
            outputs = model(inputs)
            # _, predicted = torch.max(outputs, 1)
            predicted = torch.round(torch.sigmoid(outputs))  # Apply sigmoid activation and round to get predictions
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    test_accuracy = correct / total
    print(f'Test Accuracy: {test_accuracy:.4f}')

In [None]:
import copy
import itertools

# Define ranges for hyperparameters
input_size = len(word_to_idx_train)
embedding_sizes = [100, 200, 300]
hidden_sizes = [128, 256, 512]
batch_sizes = [32, 64, 128]
num_layers = [1] #[1, 2]
learning_rates = [0.005] # [0.001, 0.005]

# Define other hyperparameters
output_size = 2
num_epochs = 5

# Search state. The best configuration is selected by validation accuracy;
# best_validation_loss is tracked independently as the minimum loss seen over
# all configurations, so it does not necessarily belong to best_hyperparameters.
best_accuracy = 0.0
best_hyperparameters = {}
best_validation_loss = float('inf')
best_model_state = None


# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Iterate over all combinations of hyperparameters.
# The loop target for the layer count is deliberately NOT called `num_layers`:
# reusing that name rebinds the candidate list to an int, so re-running this
# cell would fail inside itertools.product ("'int' object is not iterable").
for embedding_size, hidden_size, batch_size, layer_count, learning_rate in itertools.product(
    embedding_sizes, hidden_sizes, batch_sizes, num_layers, learning_rates
):
    # Initialize model, criterion, and optimizer
    model = RNN(input_size, embedding_size, hidden_size, layer_count, output_size).to(device)  # Move model to GPU
#     criterion = nn.CrossEntropyLoss()
    criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with logits

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print(f"Embedding Size: {embedding_size}, Hidden Size: {hidden_size}, Batch Size: {batch_size}, Num Layers: {layer_count}, Learning Rate: {learning_rate}")

    # Create train and validation data loaders with current batch size
    train_dataset, train_loader = create_dataset_and_loader(train_data, word_to_idx_train, batch_size, shuffle=True)
    val_dataset, val_loader = create_dataset_and_loader(val_data, word_to_idx_train, batch_size, shuffle=False)

    # Train the model
    train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs)

    # Evaluate the model on the validation set
    val_loss, val_accuracy = evaluate_model(model, val_loader, criterion)

    # Print results
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    print()
    # Update best hyperparameters if current configuration performs better
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_hyperparameters = {
            'embedding_size': embedding_size,
            'hidden_size': hidden_size,
            'batch_size': batch_size,
            'num_layers': layer_count,
            'learning_rate': learning_rate
        }
        # Snapshot the weights: state_dict() returns references to the live
        # parameter tensors, so keep an independent copy of the best model.
        best_model_state = copy.deepcopy(model.state_dict())

    # Update best validation loss if current configuration has lower validation loss
    if val_loss < best_validation_loss:
        best_validation_loss = val_loss



Embedding Size: 100, Hidden Size: 128, Batch Size: 32, Num Layers: 1, Learning Rate: 0.005
Epoch 1/5, Train Loss: 0.6914
Epoch 2/5, Train Loss: 0.5360
Epoch 3/5, Train Loss: 0.5117
Epoch 4/5, Train Loss: 0.3285
Epoch 5/5, Train Loss: 0.2614
Validation Loss: 0.3742, Validation Accuracy: 0.8429

Embedding Size: 100, Hidden Size: 128, Batch Size: 64, Num Layers: 1, Learning Rate: 0.005
Epoch 1/5, Train Loss: 0.6955
Epoch 2/5, Train Loss: 0.6582
Epoch 3/5, Train Loss: 0.6062
Epoch 4/5, Train Loss: 0.5244
Epoch 5/5, Train Loss: 0.4382
Validation Loss: 0.5000, Validation Accuracy: 0.7892

Embedding Size: 100, Hidden Size: 128, Batch Size: 128, Num Layers: 1, Learning Rate: 0.005
Epoch 1/5, Train Loss: 0.6725
Epoch 2/5, Train Loss: 0.4889
Epoch 3/5, Train Loss: 0.3225
Epoch 4/5, Train Loss: 0.2309
Epoch 5/5, Train Loss: 0.1988
Validation Loss: 0.4675, Validation Accuracy: 0.8189

Embedding Size: 100, Hidden Size: 256, Batch Size: 32, Num Layers: 1, Learning Rate: 0.005
Epoch 1/5, Train Loss: 

In [None]:
# import itertools

# # Define ranges for hyperparameters
# embedding_sizes = [200, 300]
# hidden_sizes = [128, 256]
# batch_sizes = [32, 64]
# num_layers = [1, 2]
# learning_rates = [0.001, 0.005]

# # Define other hyperparameters
# output_size = 2
# num_epochs = 5

# best_accuracy = 0.0
# best_hyperparameters = {}
# best_validation_loss = float('inf')
# best_model_state = None
# best_model_path = 'best_model.pth'  # Path to save the best model

# # Check if GPU is available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Iterate over all combinations of hyperparameters
# for embedding_size, hidden_size, batch_size, num_layers, learning_rate in itertools.product(embedding_sizes, hidden_sizes, batch_sizes, num_layers, learning_rates):
#     # Initialize model, criterion, and optimizer
#     model = RNN(input_size, embedding_size, hidden_size, num_layers, output_size).to(device)  # Move model to GPU
#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#     # Create train and validation data loaders with current batch size
#     train_dataset, train_loader = create_dataset_and_loader(train_data, word_to_idx_train, batch_size, shuffle=True)
#     val_dataset, val_loader = create_dataset_and_loader(val_data, word_to_idx_train, batch_size, shuffle=False)

#     # Train the model
#     train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs)

#     # Evaluate the model on the validation set
#     val_loss, val_accuracy = evaluate_model(model, val_loader, criterion)

#     # Print results
#     print(f"Embedding Size: {embedding_size}, Hidden Size: {hidden_size}, Batch Size: {batch_size}, Num Layers: {num_layers}, Learning Rate: {learning_rate}")
#     print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

#     # Update best hyperparameters if current configuration performs better
#     if val_accuracy > best_accuracy:
#         best_accuracy = val_accuracy
#         best_hyperparameters = {
#             'embedding_size': embedding_size,
#             'hidden_size': hidden_size,
#             'batch_size': batch_size,
#             'num_layers': num_layers,
#             'learning_rate': learning_rate
#         }
#         # Save the state of the best model
#         best_model_state = model.state_dict()

#     # Update best validation loss if current configuration has lower validation loss
#     if val_loss < best_validation_loss:
#         best_validation_loss = val_loss

# # Save the best model's state dictionary to a file
# if best_model_state:
#     torch.save(best_model_state, best_model_path)

# print("Best Hyperparameters:", best_hyperparameters)
# print("Best Validation Loss:", best_validation_loss)

# # Load the best model for testing
# best_model = RNN(input_size, best_hyperparameters['embedding_size'], best_hyperparameters['hidden_size'], best_hyperparameters['num_layers'], output_size).to(device)
# best_model.load_state_dict(torch.load(best_model_path))

In [None]:
best_model_path = 'best_model.pt'  # Path to save the best model

# The search cell leaves best_model_state as None when no configuration beat the
# initial accuracy of 0.0. Stop here with a clear message instead of failing
# later with a KeyError on best_hyperparameters or silently loading a stale file.
if best_model_state is None:
    raise RuntimeError(
        "No best model was recorded; run the hyperparameter search cell before this one."
    )

# Save the best model's state dictionary to a file
torch.save(best_model_state, best_model_path)
print()
print("Best Hyperparameters:", best_hyperparameters)
print("Best Validation Loss:", best_validation_loss)
print()
# Load the best model for testing: rebuild the architecture from the winning
# hyperparameters, then restore the saved weights.
best_model = RNN(
    input_size,
    best_hyperparameters['embedding_size'],
    best_hyperparameters['hidden_size'],
    best_hyperparameters['num_layers'],
    output_size,
).to(device)
# map_location keeps the load working when the checkpoint was written on a
# different device than the one available in the current session.
best_model.load_state_dict(torch.load(best_model_path, map_location=device))

# Test the model
test_model(best_model, test_loader, device)


Best Hyperparameters: {'embedding_size': 100, 'hidden_size': 128, 'batch_size': 32, 'num_layers': 1, 'learning_rate': 0.005}
Best Validation Loss: 0.3742440557860947

Test Accuracy: 0.8361


**Best Hyperparameters:** {'embedding_size': 100, 'hidden_size': 128, 'batch_size': 32, 'num_layers': 1, 'learning_rate': 0.005}

**Best Validation Loss:** 0.3742440557860947

**Test Accuracy:** 0.8361  

### 3. After RNNs
#### 3.1 Keeping all the parameters same, replace the RNN layer with the LSTM layer using nn.LSTM. What changes do you observe ? Explain why LSTM layer would affect performance.

In [None]:
class LSTMModel(nn.Module):
    """Embedding -> LSTM -> linear classifier that reads out the last time step.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        embedding_size: Dimension of each token embedding.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of output features of the final linear layer.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map a batch of token-index sequences ``(batch, seq_len)`` to logits.

        Only the top-layer LSTM output at the final time step feeds the
        classifier. ``squeeze(1)`` drops the feature axis when
        ``output_size == 1``, giving shape ``(batch,)``; for any other
        ``output_size`` the result stays ``(batch, output_size)``.
        """
        sequence_states, _ = self.rnn(self.embedding(x))
        final_step = sequence_states[:, -1, :]  # (batch, hidden_size)
        return self.fc(final_step).squeeze(1)

In [None]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    """Train ``model`` and print per-epoch training and validation metrics.

    Args:
        model: Module producing one logit per example.
        train_loader: Loader of ``(inputs, labels)`` batches exposing ``.dataset``.
        val_loader: Validation loader with the same batch layout and ``.dataset``.
        criterion: Loss called as ``criterion(outputs, float_labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device on which the model and every batch are placed.

    Returns:
        None. One summary line is printed per epoch.
    """
    model.to(device)
    for epoch in range(num_epochs):
        # ---- optimisation pass ----
        model.train()
        running_train = 0.0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.float().to(device)  # loss is called with float targets
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()
            # weight the batch loss by batch size so the epoch figure is per-sample
            running_train += batch_loss.item() * batch_x.size(0)

        train_loss = running_train / len(train_loader.dataset)

        # ---- validation pass ----
        model.eval()
        running_val, hits, seen = 0.0, 0, 0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x = batch_x.to(device)
                batch_y = batch_y.float().to(device)
                logits = model(batch_x)
                running_val += criterion(logits, batch_y).item() * batch_x.size(0)

                # probability -> hard 0/1 prediction
                hard_preds = torch.sigmoid(logits).round()

                seen += batch_y.size(0)
                hits += (hard_preds == batch_y).sum().item()

        val_loss = running_val / len(val_loader.dataset)
        val_accuracy = hits / seen

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')


# Testing function
def test_model(model, test_loader, device):
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.float().to(device)  # Convert labels to float
            outputs = model(inputs)
            predicted = torch.round(torch.sigmoid(outputs))  # Apply sigmoid activation and round to get predictions
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    test_accuracy = correct / total
    print(f'Test Accuracy: {test_accuracy:.4f}')

In [None]:
# Configuration for the LSTM experiment (model kept under the name `model2`)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Architecture
input_size = len(word_to_idx_train)  # one embedding row per vocabulary entry
embedding_size, hidden_size = 100, 128
num_layers = 1  # Adjusted for simplicity
output_size = 1  # a single logit per review for binary classification

# Optimisation
learning_rate, num_epochs = 0.005, 5

# Model, loss, and optimizer
model2 = LSTMModel(
    input_size, embedding_size, hidden_size, num_layers, output_size
).to(device)
criterion = nn.BCEWithLogitsLoss()  # sigmoid + binary cross entropy applied to raw logits
optimizer = optim.Adam(model2.parameters(), lr=learning_rate)

In [None]:
# Train the LSTM stored in `model2` for `num_epochs` epochs; train_model prints
# train loss, validation loss, and validation accuracy after every epoch.
# NOTE(review): train_loader / val_loader are not created in this cell; they come
# from earlier notebook state — confirm which batch size they were built with.
train_model(model2, train_loader, val_loader, criterion, optimizer, num_epochs, device)

Epoch 1/5, Train Loss: 0.6956, Val Loss: 0.6929, Val Acc: 0.5122
Epoch 2/5, Train Loss: 0.6930, Val Loss: 0.6929, Val Acc: 0.5122
Epoch 3/5, Train Loss: 0.6930, Val Loss: 0.6932, Val Acc: 0.4878
Epoch 4/5, Train Loss: 0.6930, Val Loss: 0.6929, Val Acc: 0.5122
Epoch 5/5, Train Loss: 0.6930, Val Loss: 0.6929, Val Acc: 0.5122


In [None]:
# Report the accuracy of `model2` on the held-out test batches (printed by test_model).
# NOTE(review): test_loader is defined outside this cell — confirm it is built
# with the same vocabulary mapping as the training data.
test_model(model2, test_loader, device)

Test Accuracy: 0.5082


In [None]:
class LSTMModel(nn.Module):
    """Embedding -> LSTM -> linear classifier with mean pooling over time.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        embedding_size: Dimension of each token embedding.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of output features of the final linear layer.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map token indices ``(batch, seq_len)`` to logits.

        The top-layer LSTM outputs are averaged over every time step (all
        positions contribute equally) before the linear layer. ``squeeze(1)``
        yields shape ``(batch,)`` when ``output_size == 1``.
        """
        hidden_states = self.rnn(self.embedding(x))[0]  # (batch, seq_len, hidden_size)
        pooled = hidden_states.mean(dim=1)
        return self.fc(pooled).squeeze(1)

In [None]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    """Run ``num_epochs`` epochs of training, validating after each one.

    Every epoch prints the sample-weighted mean training loss, the
    sample-weighted mean validation loss, and the validation accuracy obtained
    by rounding ``sigmoid(logits)``. Both loaders must expose ``.dataset`` so
    the summed losses can be divided by the dataset size. Returns ``None``.
    """

    def _prepare(inputs, labels):
        """Move one batch to ``device``, casting the labels to float."""
        return inputs.to(device), labels.float().to(device)

    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        loss_sum = 0.0
        for raw_inputs, raw_labels in train_loader:
            inputs, labels = _prepare(raw_inputs, raw_labels)
            optimizer.zero_grad()
            outputs = model(inputs)
            step_loss = criterion(outputs, labels)
            step_loss.backward()
            optimizer.step()
            loss_sum += step_loss.item() * inputs.size(0)

        train_loss = loss_sum / len(train_loader.dataset)

        # Validation: no gradient tracking, model in eval mode
        model.eval()
        val_loss_sum = 0.0
        num_right = 0
        num_examples = 0
        with torch.no_grad():
            for raw_inputs, raw_labels in val_loader:
                inputs, labels = _prepare(raw_inputs, raw_labels)
                outputs = model(inputs)
                val_loss_sum += criterion(outputs, labels).item() * inputs.size(0)

                # sigmoid turns logits into probabilities; rounding gives 0/1
                decisions = torch.round(torch.sigmoid(outputs))

                num_examples += labels.size(0)
                num_right += decisions.eq(labels).sum().item()

        val_loss = val_loss_sum / len(val_loader.dataset)
        val_accuracy = num_right / num_examples

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')


# Testing function
def test_model(model, test_loader, device):
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.float().to(device)  # Convert labels to float
            outputs = model(inputs)
            predicted = torch.round(torch.sigmoid(outputs))  # Apply sigmoid activation and round to get predictions
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    test_accuracy = correct / total
    print(f'Test Accuracy: {test_accuracy:.4f}')

In [None]:
# Configuration for the second LSTM experiment (model kept under the name `model`)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Architecture
input_size = len(word_to_idx_train)  # one embedding row per vocabulary entry
embedding_size = 100
hidden_size = 128
num_layers = 1  # Adjusted for simplicity
output_size = 1  # a single logit per review for binary classification

# Optimisation
num_epochs = 5
learning_rate = 0.005

# Model, loss, and optimizer
model = LSTMModel(
    input_size,
    embedding_size,
    hidden_size,
    num_layers,
    output_size,
).to(device)
criterion = nn.BCEWithLogitsLoss()  # sigmoid + binary cross entropy applied to raw logits
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train the LSTM stored in `model` for `num_epochs` epochs; train_model prints
# train loss, validation loss, and validation accuracy after every epoch.
# NOTE(review): train_loader / val_loader are not created in this cell; they come
# from earlier notebook state — confirm which batch size they were built with.
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device)

Epoch 1/5, Train Loss: 0.5386, Val Loss: 0.4131, Val Acc: 0.8359
Epoch 2/5, Train Loss: 0.2362, Val Loss: 0.3274, Val Acc: 0.8666
Epoch 3/5, Train Loss: 0.1245, Val Loss: 0.3958, Val Acc: 0.8649
Epoch 4/5, Train Loss: 0.0667, Val Loss: 0.4853, Val Acc: 0.8511
Epoch 5/5, Train Loss: 0.0322, Val Loss: 0.5438, Val Acc: 0.8467


In [None]:
# Report the accuracy of `model` on the held-out test batches (printed by test_model).
# NOTE(review): test_loader is defined outside this cell — confirm it is built
# with the same vocabulary mapping as the training data.
test_model(model, test_loader, device)

Test Accuracy: 0.8391


In [None]:
# Shell escape: list the working directory (long format, hidden entries, inode
# numbers, trailing "/" on directories) to check which checkpoint files exist.
!ls -plia

total 45772
   786438 drwxr-xr-x 5 root root     4096 Mar 21 16:48 ./
271237110 drwxr-xr-x 5 root root     4096 Mar 21 13:34 ../
   786439 drwxr-xr-x 2 root root     4096 Mar 21 13:34 .virtual_documents/
   786520 -rw-r--r-- 1 root root 23421864 Mar 21 16:48 best_model.pt
   786519 -rw-r--r-- 1 root root 23421864 Mar 21 16:31 best_model.pth
   786485 drwxr-xr-x 3 root root     4096 Mar 21 13:36 corpora/
   786440 drwxr-xr-x 3 root root     4096 Mar 21 13:36 tokenizers/


**Incorporate stemming/lemmatization when doing text preprocessing using the NLTK library. What changes do you observe?**

Incorporating stemming or lemmatization using NLTK has the following effects (items 1–3 and 5 are benefits; item 4 is a trade-off):  

1. Normalization: Reducing words to their base forms, reducing vocabulary size.
2. Generalization: Treating different word forms as equivalent, improving model generalization.
3. Reduced Sparsity: Collapsing similar words, making training more efficient.
4. Loss of Information: Stemming or lemmatizing may lose some distinctions between words.
5. Computational Efficiency: Fewer unique words mean faster processing during training and inference.

Overall, stemming/lemmatization can improve text representations, reduce complexity, and enhance model performance, but may lose some word distinctions.

**Observation:** In the Model class, when only the last output of the RNN layer is used, the model may not capture the sequence effectively, especially for long reviews. The final hidden state has to summarize everything the network has read, so information from early tokens can fade; and if shorter reviews are padded at the end (worth confirming against the batching code), the last time step may correspond to padding rather than review text.  

On the other hand, when taking the mean of all outputs in the RNN layer, the model can capture the overall representation of the entire sequence. By considering the average representation of the entire sequence, the model might better capture the sequential patterns and dependencies present in the data.  

Therefore, by considering the mean of all outputs in the RNN layer, the model might achieve slightly better performance as it can leverage the sequential information more effectively compared to only picking the last output. This difference in performance is reflected in the slightly higher test accuracy obtained when taking the mean of all outputs compared to only picking the last output.







**Reason:** The main advantage of LSTM over traditional RNN is its ability to effectively handle the vanishing gradient problem, which occurs when training RNNs on long sequences. This problem arises because traditional RNNs have difficulty retaining information over many time steps due to the repeated multiplication of gradients during backpropagation. LSTMs address this issue by introducing a memory cell that can store information for long periods, allowing the network to learn dependencies over longer sequences without suffering from vanishing gradients.

Based on the provided results, we can analyze the performance of different hyperparameter configurations:

1. **Embedding Size**:
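   - Increasing the embedding size from 100 to 200 or 300 generally improves performance across different batch sizes and hidden sizes, although the single best configuration found in this search used the smallest embedding size (100).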
   - Higher embedding sizes allow the model to capture more complex relationships between words, which can lead to better performance.

2. **Hidden Size**:
   - Increasing the hidden size from 128 to 256 or 512 also generally improves performance, although the single best configuration found in this search used a hidden size of 128.
   - Larger hidden sizes allow the model to learn more complex patterns in the data, potentially leading to better performance.
   - However, very large hidden sizes may lead to overfitting, as seen in some cases where validation accuracy decreases.

3. **Batch Size**:
   - The effect of batch size is less consistent across different configurations.
   - In some cases, smaller batch sizes (e.g., 32) lead to better performance, while in others, larger batch sizes (e.g., 128) perform better.
   - Smaller batch sizes may let the model converge in fewer epochs because the weights are updated more often, but each epoch takes longer in wall-clock time because there are more update steps.
   - Larger batch sizes may provide more stable gradients and faster training, but they may also lead to convergence to suboptimal solutions.

4. **Learning Rate**:
   - The learning rate was not varied in this search; it was fixed at 0.005 (`learning_rates = [0.005]`), so its effect cannot be assessed from these results.
   - It's important to note that the learning rate interacts with other hyperparameters and can significantly impact training dynamics and final performance.
   - A learning rate that's too high can lead to unstable training or divergence, while a learning rate that's too low can lead to slow convergence or getting stuck in local minima.

5. **Number of Layers**:
   - The number of layers is kept constant in the provided results (1 layer).
   - Experimenting with the number of layers could provide further insights into the model's capacity to learn complex representations, but it might also increase training time and risk overfitting.

6. **Validation Loss and Accuracy**:
   - Validation loss and accuracy are used as metrics to evaluate model performance.
   - Lower validation loss and higher accuracy indicate better model performance on unseen data.