In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import time
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torch import nn
from torch.nn import functional as F
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


The code is the same as in the first file and we change the data loading to fit the new dataset:

```
imdb_data = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')

imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})

train_data, test_data = train_test_split(imdb_data, test_size=0.2, random_state=42)

target_classes = ["Negative", "Positive"]

tokenizer = get_tokenizer("basic_english")

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]
```



In [3]:
# -*- coding: utf-8 -*-
"""

A 1-direction RNN classifier applied to IMDB dataset

Download dataset:
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

"""


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


imdb_data = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')

imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})

train_data, test_data = train_test_split(imdb_data, test_size=0.2, random_state=42)

target_classes = ["Negative", "Positive"]

tokenizer = get_tokenizer("basic_english")

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

######################################################################
# Data processing 
# -----------------------------





# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) 
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)


def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.rnn(embeddings)
        logits = self.linear(output[:,-1])  # The last output of RNN is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds1, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds1.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds1 = torch.cat(Y_preds1)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds1.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds1 = model(X)

            loss = loss_fn(Y_preds1, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds1 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds1)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds1, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds1))

parameters1 = count_parameters(classifier)
average_time_per_epoch1 = (end_time - start_time)/EPOCHS
accuracy1 = accuracy_score(Y_actual, Y_preds1)



Model:
model(
  (embedding_layer): Embedding(29065, 100)
  (rnn): RNN(100, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=2, bias=True)
)
Total parameters:  2917254



Epoch: 1


100%|██████████| 40/40 [00:07<00:00,  5.67it/s]


Train Loss : 0.695
Epoch: 2


100%|██████████| 40/40 [00:07<00:00,  5.46it/s]


Train Loss : 0.687
Epoch: 3


100%|██████████| 40/40 [00:05<00:00,  6.88it/s]


Train Loss : 0.657
Epoch: 4


100%|██████████| 40/40 [00:12<00:00,  3.27it/s]


Train Loss : 0.628
Epoch: 5


100%|██████████| 40/40 [00:07<00:00,  5.39it/s]


Train Loss : 0.600
Epoch: 6


100%|██████████| 40/40 [00:05<00:00,  6.90it/s]


Train Loss : 0.575
Epoch: 7


100%|██████████| 40/40 [00:07<00:00,  5.29it/s]


Train Loss : 0.560
Epoch: 8


100%|██████████| 40/40 [00:05<00:00,  6.85it/s]


Train Loss : 0.548
Epoch: 9


100%|██████████| 40/40 [00:07<00:00,  5.55it/s]


Train Loss : 0.529
Epoch: 10


100%|██████████| 40/40 [00:06<00:00,  6.54it/s]


Train Loss : 0.518
Epoch: 11


100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


Train Loss : 0.504
Epoch: 12


100%|██████████| 40/40 [00:06<00:00,  6.10it/s]


Train Loss : 0.494
Epoch: 13


100%|██████████| 40/40 [00:08<00:00,  4.61it/s]


Train Loss : 0.486
Epoch: 14


100%|██████████| 40/40 [00:08<00:00,  4.45it/s]


Train Loss : 0.481
Epoch: 15


100%|██████████| 40/40 [00:06<00:00,  6.36it/s]


Train Loss : 0.470

Test Accuracy : 0.708

Classification Report : 
              precision    recall  f1-score   support

    Negative       0.69      0.75      0.72      4961
    Positive       0.73      0.66      0.70      5039

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.71     10000
weighted avg       0.71      0.71      0.71     10000


Confusion Matrix : 
[[3733 1228]
 [1691 3348]]


In [4]:
# -*- coding: utf-8 -*-
"""

A bi-direction RNN classifier applied to IMDB dataset

Download dataset:
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

"""


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


imdb_data = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')

imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})

train_data, test_data = train_test_split(imdb_data, test_size=0.2, random_state=42)

target_classes = ["Negative", "Positive"]

tokenizer = get_tokenizer("basic_english")

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

######################################################################
# Data processing 
# -----------------------------


tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) 
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)



def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True) # Change in code bidirectional = True
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.rnn(embeddings)
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward RNN
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward RNN (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds2, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds2.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds2 = torch.cat(Y_preds2)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds2.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds2 = model(X)

            loss = loss_fn(Y_preds2, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds2 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds2)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds2, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds2))

parameters2 = count_parameters(classifier)
average_time_per_epoch2 = (end_time - start_time)/EPOCHS
accuracy2 = accuracy_score(Y_actual, Y_preds2)



Model:
model(
  (embedding_layer): Embedding(29065, 100)
  (rnn): RNN(100, 64, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=2, bias=True)
)
Total parameters:  2928006



Epoch: 1


100%|██████████| 40/40 [00:10<00:00,  3.81it/s]


Train Loss : 0.689
Epoch: 2


100%|██████████| 40/40 [00:06<00:00,  5.75it/s]


Train Loss : 0.664
Epoch: 3


100%|██████████| 40/40 [00:07<00:00,  5.30it/s]


Train Loss : 0.634
Epoch: 4


100%|██████████| 40/40 [00:06<00:00,  5.73it/s]


Train Loss : 0.607
Epoch: 5


100%|██████████| 40/40 [00:07<00:00,  5.33it/s]


Train Loss : 0.583
Epoch: 6


100%|██████████| 40/40 [00:06<00:00,  6.59it/s]


Train Loss : 0.563
Epoch: 7


100%|██████████| 40/40 [00:07<00:00,  5.25it/s]


Train Loss : 0.542
Epoch: 8


100%|██████████| 40/40 [00:06<00:00,  6.57it/s]


Train Loss : 0.523
Epoch: 9


100%|██████████| 40/40 [00:07<00:00,  5.28it/s]


Train Loss : 0.511
Epoch: 10


100%|██████████| 40/40 [00:06<00:00,  6.37it/s]


Train Loss : 0.494
Epoch: 11


100%|██████████| 40/40 [00:07<00:00,  5.04it/s]


Train Loss : 0.482
Epoch: 12


100%|██████████| 40/40 [00:06<00:00,  6.51it/s]


Train Loss : 0.470
Epoch: 13


100%|██████████| 40/40 [00:07<00:00,  5.25it/s]


Train Loss : 0.459
Epoch: 14


100%|██████████| 40/40 [00:06<00:00,  6.60it/s]


Train Loss : 0.449
Epoch: 15


100%|██████████| 40/40 [00:07<00:00,  5.03it/s]


Train Loss : 0.443

Test Accuracy : 0.703

Classification Report : 
              precision    recall  f1-score   support

    Negative       0.69      0.74      0.71      4961
    Positive       0.72      0.67      0.69      5039

    accuracy                           0.70     10000
   macro avg       0.70      0.70      0.70     10000
weighted avg       0.70      0.70      0.70     10000


Confusion Matrix : 
[[3667 1294]
 [1680 3359]]


In [5]:
# -*- coding: utf-8 -*-
"""

A bi-direction RNN classifier with 2 layers applied to IMDB dataset

Download dataset:
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

"""

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


imdb_data = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')

imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})

train_data, test_data = train_test_split(imdb_data, test_size=0.2, random_state=42)

target_classes = ["Negative", "Positive"]

tokenizer = get_tokenizer("basic_english")

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

######################################################################
# Data processing 
# -----------------------------


tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) 
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)



def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)  # Change in code bidirectional = True, num_layers = 2
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.rnn(embeddings)
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward RNN
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward RNN (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds3, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds3.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds3 = torch.cat(Y_preds3)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds3.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds3 = model(X)

            loss = loss_fn(Y_preds3, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds3 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds3)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds3, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds3))

parameters3 = count_parameters(classifier)
average_time_per_epoch3 = (end_time - start_time)/EPOCHS
accuracy3 = accuracy_score(Y_actual, Y_preds3)




Model:
model(
  (embedding_layer): Embedding(29065, 100)
  (rnn): RNN(100, 64, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=2, bias=True)
)
Total parameters:  2952838



Epoch: 1


100%|██████████| 40/40 [00:06<00:00,  6.30it/s]


Train Loss : 0.688
Epoch: 2


100%|██████████| 40/40 [00:07<00:00,  5.16it/s]


Train Loss : 0.660
Epoch: 3


100%|██████████| 40/40 [00:06<00:00,  6.37it/s]


Train Loss : 0.628
Epoch: 4


100%|██████████| 40/40 [00:08<00:00,  4.93it/s]


Train Loss : 0.601
Epoch: 5


100%|██████████| 40/40 [00:06<00:00,  6.39it/s]


Train Loss : 0.573
Epoch: 6


100%|██████████| 40/40 [00:07<00:00,  5.17it/s]


Train Loss : 0.553
Epoch: 7


100%|██████████| 40/40 [00:06<00:00,  6.36it/s]


Train Loss : 0.533
Epoch: 8


100%|██████████| 40/40 [00:07<00:00,  5.06it/s]


Train Loss : 0.519
Epoch: 9


100%|██████████| 40/40 [00:06<00:00,  6.31it/s]


Train Loss : 0.505
Epoch: 10


100%|██████████| 40/40 [00:08<00:00,  4.68it/s]


Train Loss : 0.487
Epoch: 11


100%|██████████| 40/40 [00:06<00:00,  5.75it/s]


Train Loss : 0.477
Epoch: 12


100%|██████████| 40/40 [00:06<00:00,  5.72it/s]


Train Loss : 0.463
Epoch: 13


100%|██████████| 40/40 [00:07<00:00,  5.43it/s]


Train Loss : 0.451
Epoch: 14


100%|██████████| 40/40 [00:06<00:00,  5.95it/s]


Train Loss : 0.447
Epoch: 15


100%|██████████| 40/40 [00:07<00:00,  5.34it/s]


Train Loss : 0.435

Test Accuracy : 0.707

Classification Report : 
              precision    recall  f1-score   support

    Negative       0.71      0.70      0.70      4961
    Positive       0.71      0.72      0.71      5039

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.71     10000
weighted avg       0.71      0.71      0.71     10000


Confusion Matrix : 
[[3449 1512]
 [1415 3624]]


In [6]:
# -*- coding: utf-8 -*-
"""

A 1 direction LSTM classifier applied to IMDB dataset

Download dataset:
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

"""

import torch
import time 
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torch import nn
from torch.nn import functional as F
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------

imdb_data = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')

imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})

train_data, test_data = train_test_split(imdb_data, test_size=0.2, random_state=42)

target_classes = ["Negative", "Positive"]

tokenizer = get_tokenizer("basic_english")

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

######################################################################
# Data processing 
# -----------------------------


tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) 
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)


def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True) # Change in code - RNN to LSTM
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.lstm(embeddings)
        logits = self.linear(output[:,-1])  # The last output of LSTM is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds4, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds4.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds4 = torch.cat(Y_preds4)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds4.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds4 = model(X)

            loss = loss_fn(Y_preds4, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds4 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds4)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds4, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds4))

parameters4 = count_parameters(classifier)
average_time_per_epoch4 = (end_time - start_time)/EPOCHS
accuracy4 = accuracy_score(Y_actual, Y_preds4)



Model:
model(
  (embedding_layer): Embedding(29065, 100)
  (lstm): LSTM(100, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=2, bias=True)
)
Total parameters:  2949126



Epoch: 1


100%|██████████| 40/40 [00:07<00:00,  5.20it/s]


Train Loss : 0.692
Epoch: 2


100%|██████████| 40/40 [00:06<00:00,  6.40it/s]


Train Loss : 0.676
Epoch: 3


100%|██████████| 40/40 [00:07<00:00,  5.23it/s]


Train Loss : 0.627
Epoch: 4


100%|██████████| 40/40 [00:06<00:00,  6.46it/s]


Train Loss : 0.584
Epoch: 5


100%|██████████| 40/40 [00:07<00:00,  5.28it/s]


Train Loss : 0.556
Epoch: 6


100%|██████████| 40/40 [00:06<00:00,  6.26it/s]


Train Loss : 0.537
Epoch: 7


100%|██████████| 40/40 [00:09<00:00,  4.01it/s]


Train Loss : 0.522
Epoch: 8


100%|██████████| 40/40 [00:10<00:00,  3.77it/s]


Train Loss : 0.502
Epoch: 9


100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


Train Loss : 0.489
Epoch: 10


100%|██████████| 40/40 [00:07<00:00,  5.01it/s]


Train Loss : 0.475
Epoch: 11


100%|██████████| 40/40 [00:09<00:00,  4.25it/s]


Train Loss : 0.465
Epoch: 12


100%|██████████| 40/40 [00:07<00:00,  5.01it/s]


Train Loss : 0.457
Epoch: 13


100%|██████████| 40/40 [00:06<00:00,  5.83it/s]


Train Loss : 0.442
Epoch: 14


100%|██████████| 40/40 [00:07<00:00,  5.49it/s]


Train Loss : 0.439
Epoch: 15


100%|██████████| 40/40 [00:06<00:00,  6.08it/s]


Train Loss : 0.430

Test Accuracy : 0.717

Classification Report : 
              precision    recall  f1-score   support

    Negative       0.70      0.74      0.72      4961
    Positive       0.73      0.69      0.71      5039

    accuracy                           0.72     10000
   macro avg       0.72      0.72      0.72     10000
weighted avg       0.72      0.72      0.72     10000


Confusion Matrix : 
[[3675 1286]
 [1547 3492]]


In [7]:
# -*- coding: utf-8 -*-
"""

A bi-direction LSTM classifier applied to IMDB dataset

Download dataset:
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

"""

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------

imdb_data = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')

imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})

train_data, test_data = train_test_split(imdb_data, test_size=0.2, random_state=42)

target_classes = ["Negative", "Positive"]

tokenizer = get_tokenizer("basic_english")

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

######################################################################
# Data processing 
# -----------------------------


tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) 
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)



def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True) # Change in code bidirectional = True, RNN to LSTM
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.lstm(embeddings) # Change in code - rnn to lstm
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward LSTM
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward LSTM (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds5, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds5.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds5 = torch.cat(Y_preds5)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds5.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds5 = model(X)

            loss = loss_fn(Y_preds5, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds5 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds5)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds5, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds5))


parameters5 = count_parameters(classifier)
average_time_per_epoch5 = (end_time - start_time)/EPOCHS
accuracy5 = accuracy_score(Y_actual, Y_preds5)



Model:
model(
  (embedding_layer): Embedding(29065, 100)
  (lstm): LSTM(100, 64, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=2, bias=True)
)
Total parameters:  2991750



Epoch: 1


100%|██████████| 40/40 [00:06<00:00,  5.97it/s]


Train Loss : 0.686
Epoch: 2


100%|██████████| 40/40 [00:09<00:00,  4.33it/s]


Train Loss : 0.647
Epoch: 3


100%|██████████| 40/40 [00:07<00:00,  5.22it/s]


Train Loss : 0.606
Epoch: 4


100%|██████████| 40/40 [00:06<00:00,  6.13it/s]


Train Loss : 0.574
Epoch: 5


100%|██████████| 40/40 [00:07<00:00,  5.08it/s]


Train Loss : 0.549
Epoch: 6


100%|██████████| 40/40 [00:06<00:00,  6.23it/s]


Train Loss : 0.527
Epoch: 7


100%|██████████| 40/40 [00:07<00:00,  5.04it/s]


Train Loss : 0.510
Epoch: 8


100%|██████████| 40/40 [00:06<00:00,  6.24it/s]


Train Loss : 0.491
Epoch: 9


100%|██████████| 40/40 [00:07<00:00,  5.11it/s]


Train Loss : 0.475
Epoch: 10


100%|██████████| 40/40 [00:06<00:00,  6.22it/s]


Train Loss : 0.458
Epoch: 11


100%|██████████| 40/40 [00:07<00:00,  5.07it/s]


Train Loss : 0.444
Epoch: 12


100%|██████████| 40/40 [00:06<00:00,  6.16it/s]


Train Loss : 0.436
Epoch: 13


100%|██████████| 40/40 [00:11<00:00,  3.63it/s]


Train Loss : 0.422
Epoch: 14


100%|██████████| 40/40 [00:08<00:00,  4.67it/s]


Train Loss : 0.413
Epoch: 15


100%|██████████| 40/40 [00:06<00:00,  6.15it/s]


Train Loss : 0.407

Test Accuracy : 0.718

Classification Report : 
              precision    recall  f1-score   support

    Negative       0.71      0.73      0.72      4961
    Positive       0.73      0.71      0.72      5039

    accuracy                           0.72     10000
   macro avg       0.72      0.72      0.72     10000
weighted avg       0.72      0.72      0.72     10000


Confusion Matrix : 
[[3621 1340]
 [1477 3562]]


In [8]:
# -*- coding: utf-8 -*-
"""

A bi-direction LSTM classifier with 2 layers applied to IMDB dataset

Download dataset:
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

"""

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


imdb_data = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')

imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})

train_data, test_data = train_test_split(imdb_data, test_size=0.2, random_state=42)

target_classes = ["Negative", "Positive"]

tokenizer = get_tokenizer("basic_english")

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

######################################################################
# Data processing 
# -----------------------------


tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) 
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label, text) for text, label in zip(train_data['review'], train_data['sentiment'])]
test_dataset = [(label, text) for text, label in zip(test_data['review'], test_data['sentiment'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)



def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)  # Change in code bidirectional = True, num_layers = 2, RNN to LSTM
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.lstm(embeddings) # Change in code - rnn to lstm
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward LSTM
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward LSTM (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds6, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds6.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds6 = torch.cat(Y_preds6)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds6.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds6 = model(X)

            loss = loss_fn(Y_preds6, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()

######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds6 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds6)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds6, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds6))

parameters6 = count_parameters(classifier)
average_time_per_epoch6 = (end_time - start_time)/EPOCHS
accuracy6 = accuracy_score(Y_actual, Y_preds6)



Model:
model(
  (embedding_layer): Embedding(29065, 100)
  (lstm): LSTM(100, 64, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=2, bias=True)
)
Total parameters:  3091078



Epoch: 1


100%|██████████| 40/40 [00:08<00:00,  4.88it/s]


Train Loss : 0.679
Epoch: 2


100%|██████████| 40/40 [00:07<00:00,  5.71it/s]


Train Loss : 0.628
Epoch: 3


100%|██████████| 40/40 [00:08<00:00,  4.77it/s]


Train Loss : 0.591
Epoch: 4


100%|██████████| 40/40 [00:06<00:00,  5.76it/s]


Train Loss : 0.561
Epoch: 5


100%|██████████| 40/40 [00:08<00:00,  4.83it/s]


Train Loss : 0.538
Epoch: 6


100%|██████████| 40/40 [00:07<00:00,  5.44it/s]


Train Loss : 0.521
Epoch: 7


100%|██████████| 40/40 [00:08<00:00,  4.93it/s]


Train Loss : 0.497
Epoch: 8


100%|██████████| 40/40 [00:08<00:00,  4.76it/s]


Train Loss : 0.479
Epoch: 9


100%|██████████| 40/40 [00:07<00:00,  5.48it/s]


Train Loss : 0.467
Epoch: 10


100%|██████████| 40/40 [00:08<00:00,  4.67it/s]


Train Loss : 0.455
Epoch: 11


100%|██████████| 40/40 [00:10<00:00,  3.90it/s]


Train Loss : 0.444
Epoch: 12


100%|██████████| 40/40 [00:07<00:00,  5.27it/s]


Train Loss : 0.434
Epoch: 13


100%|██████████| 40/40 [00:08<00:00,  4.77it/s]


Train Loss : 0.429
Epoch: 14


100%|██████████| 40/40 [00:07<00:00,  5.68it/s]


Train Loss : 0.422
Epoch: 15


100%|██████████| 40/40 [00:08<00:00,  4.83it/s]


Train Loss : 0.425

Test Accuracy : 0.722

Classification Report : 
              precision    recall  f1-score   support

    Negative       0.72      0.73      0.72      4961
    Positive       0.73      0.72      0.72      5039

    accuracy                           0.72     10000
   macro avg       0.72      0.72      0.72     10000
weighted avg       0.72      0.72      0.72     10000


Confusion Matrix : 
[[3601 1360]
 [1416 3623]]


The results of text that were missclasified by all models and the most frequent pair of correct category and wrong prediction. The matrix of the accuracy, parameters and time cost per epoch of all the above models is below this block.

In [9]:
import numpy as np

predictions = list(zip(Y_preds1, Y_preds2, Y_preds3, Y_preds4, Y_preds5, Y_preds6))


misclassified_indices = []
for i, (pred1, pred2, pred3, pred4, pred5, pred6) in enumerate(predictions):
    if (pred1 != Y_actual[i]) and (pred2 != Y_actual[i]) and (pred3 != Y_actual[i]) and (pred4 != Y_actual[i]) and (pred5 != Y_actual[i]) and (pred6 != Y_actual[i]):
        misclassified_indices.append(i)


misclassified_counts = {category: 0 for category in np.unique(Y_actual)}


pair_counts = {}

for idx in misclassified_indices:
    correct_category = Y_actual[idx]
    misclassified_counts[correct_category] += 1

    pair = (correct_category, Y_preds1[idx])  

    if pair in pair_counts:
        pair_counts[pair] += 1
    else:
        pair_counts[pair] = 1

label_to_category = {
    "0": "0:Positive",
    "1": "1:Sports",
}


misclassified_index = misclassified_indices[0]


misclassified_text = test_dataset[misclassified_index][1]


correct_category = Y_actual[misclassified_index]
predicted_category = Y_preds1[misclassified_index]


print(f"Misclassified Text (Index: {misclassified_index}):")
print(misclassified_text)
print(f"\nShould have been classified as: {label_to_category[str(correct_category)]}")
print(f"\nWas classified as: {label_to_category[str(predicted_category)]}\n")

for category in sorted(misclassified_counts.keys()):
    category_name = label_to_category[str(category)]
    count = misclassified_counts[category]
    print(f"{category_name}: {count} samples")


most_frequent_pair = max(pair_counts, key=pair_counts.get)
print("\nThe most frequent pair of correct category and wrong prediction:")
print(f"Correct category: {label_to_category[str(most_frequent_pair[0])]}, Wrong prediction: {label_to_category[str(most_frequent_pair[1])]}, Occurrences: {pair_counts[most_frequent_pair]}")



Misclassified Text (Index: 7):
Okay, I didn't get the Purgatory thing the first time I watched this episode. It seemed like something significant was going on that I couldn't put my finger on. This time those Costa Mesa fires on TV really caught my attention- and it helped that I was just writing an essay on Inferno! But let me see what HASN'T been discussed yet...<br /><br />A TWOP review mentioned that Tony had 7 flights of stairs to go down because of the broken elevator. Yeah, 7 is a significant number for lots of reasons, especially religious, but here's one more for ya. On a hunch I consulted wikipedia, and guess what Dante divided into 7 levels? Purgatorio. Excluding ante-Purgatory and Paradise. (The stuff at the bottom of the stairs and... what Tony can't get to.) <br /><br />On to the allegedly "random" monk-slap scene. As soon as the monks appeared, it fit perfectly in place with Tony trying to get out of Purgatory. You can tell he got worried when that Christian commercial (

From the results on the new IMDB dataset, we can observe that the 1-layer RNN has an accuracy of 0.708, while the 1-layer bidirectional RNN has a slightly lower accuracy of 0.703. The highest accuracy is achieved by the 2-layer bidirectional LSTM, with an accuracy of 0.722. Similar to the previous dataset, bidirectional models generally outperform their unidirectional counterparts, indicating the advantage of leveraging bidirectional architectures to capture information from both past and future contexts.

In terms of model complexity, as we transition from simpler architectures like RNNs to more advanced ones like bidirectional LSTMs, the number of parameters increases. The 1-layer RNN model has the lowest time cost per epoch at 7.625 seconds, while the 2-layer bidirectional LSTM model has the highest time cost per epoch of 8.014 seconds.

In [10]:
import pandas as pd

def format_values(value):
    if isinstance(value, float):
        if value.is_integer():
            return f"{int(value):,}"
        else:
            return f"{value:.3f}"
    return value



data = {
    "1RNN": [accuracy1, parameters1, average_time_per_epoch1],
    "1-Bi-RNN": [accuracy2, parameters2, average_time_per_epoch2],
    "2-Bi-RNN": [accuracy3, parameters3, average_time_per_epoch3],
    "1LSTM": [accuracy4, parameters4, average_time_per_epoch4],
    "1Bi-LSTM": [accuracy5, parameters5, average_time_per_epoch5],
    "2Bi-LSTM": [accuracy6, parameters6, average_time_per_epoch6],
}

index = ["Accuracy (%)", "Parameters", "Time cost per epoch (s)"]

df = pd.DataFrame(data, index=index)

df = df.applymap(format_values)

df = df.style \
    .set_properties(**{'font-weight': 'bold', 'border': '2px solid black'}) \
    .set_table_styles([dict(selector='th', props=[('font-weight', 'bold'), ('border', '1px solid black')])])


display(df)

Unnamed: 0,1RNN,1-Bi-RNN,2-Bi-RNN,1LSTM,1Bi-LSTM,2Bi-LSTM
Accuracy (%),0.708,0.703,0.707,0.717,0.718,0.722
Parameters,2917254.0,2928006.0,2952838.0,2949126.0,2991750.0,3091078.0
Time cost per epoch (s),7.625,7.263,7.148,7.995,7.575,8.014
