In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import time
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torch import nn
from torch.nn import functional as F
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np


This file is basically the same as the Question3_a-b with the only difference being the MAX_WORDS which is set to 50 in all the models respectively:

This is the main block of an 1-direction RNN classifier.This code was provided for all the other blocks to be based on:

In [None]:
# -*- coding: utf-8 -*-
"""

A 1-direction RNN classifier applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 50
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------


tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.rnn(embeddings)
        logits = self.linear(output[:,-1])  # The last output of RNN is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds1, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds1.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds1 = torch.cat(Y_preds1)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds1.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds1 = model(X)

            loss = loss_fn(Y_preds1, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds1 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds1)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds1, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds1))

parameters1 = count_parameters(classifier)
average_time_per_epoch1 = (end_time - start_time)/EPOCHS
accuracy1 = accuracy_score(Y_actual, Y_preds1)



Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (rnn): RNN(100, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=4, bias=True)
)
Total parameters:  2136284



Epoch: 1


100%|██████████| 118/118 [00:05<00:00, 20.79it/s]


Train Loss : 1.375
Epoch: 2


100%|██████████| 118/118 [00:05<00:00, 21.00it/s]


Train Loss : 1.323
Epoch: 3


100%|██████████| 118/118 [00:04<00:00, 24.13it/s]


Train Loss : 1.323
Epoch: 4


100%|██████████| 118/118 [00:04<00:00, 25.19it/s]


Train Loss : 1.311
Epoch: 5


100%|██████████| 118/118 [00:06<00:00, 19.37it/s]


Train Loss : 1.320
Epoch: 6


100%|██████████| 118/118 [00:04<00:00, 26.58it/s]


Train Loss : 1.291
Epoch: 7


100%|██████████| 118/118 [00:04<00:00, 26.56it/s]


Train Loss : 1.275
Epoch: 8


100%|██████████| 118/118 [00:05<00:00, 19.94it/s]


Train Loss : 1.265
Epoch: 9


100%|██████████| 118/118 [00:04<00:00, 26.15it/s]


Train Loss : 1.271
Epoch: 10


100%|██████████| 118/118 [00:05<00:00, 22.36it/s]


Train Loss : 1.268
Epoch: 11


100%|██████████| 118/118 [00:05<00:00, 22.93it/s]


Train Loss : 1.262
Epoch: 12


100%|██████████| 118/118 [00:04<00:00, 25.58it/s]


Train Loss : 1.247
Epoch: 13


100%|██████████| 118/118 [00:06<00:00, 17.21it/s]


Train Loss : 1.270
Epoch: 14


100%|██████████| 118/118 [00:04<00:00, 25.48it/s]


Train Loss : 1.316
Epoch: 15


100%|██████████| 118/118 [00:04<00:00, 25.92it/s]


Train Loss : 1.328

Test Accuracy : 0.377

Classification Report : 
              precision    recall  f1-score   support

       World       0.63      0.47      0.54      1900
      Sports       0.31      0.84      0.45      1900
    Business       0.36      0.10      0.16      1900
    Sci/Tech       0.35      0.10      0.15      1900

    accuracy                           0.38      7600
   macro avg       0.41      0.38      0.32      7600
weighted avg       0.41      0.38      0.32      7600


Confusion Matrix : 
[[ 892  798  113   97]
 [ 107 1599   91  103]
 [ 277 1298  190  135]
 [ 130 1456  133  181]]


This is a bi-direction RNN classifier. The changes in the code were the following in the "model" class:



```
class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True) # Change in code bidirectional = True
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.rnn(embeddings)
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward RNN
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward RNN (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
```



In [None]:
# -*- coding: utf-8 -*-
"""

A bi-direction RNN classifier applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 50
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------


tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True) # Change in code bidirectional = True
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.rnn(embeddings)
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward RNN
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward RNN (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds2, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds2.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds2 = torch.cat(Y_preds2)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds2.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds2 = model(X)

            loss = loss_fn(Y_preds2, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds2 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds2)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds2, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds2))

parameters2 = count_parameters(classifier)
average_time_per_epoch2 = (end_time - start_time)/EPOCHS
accuracy2 = accuracy_score(Y_actual, Y_preds2)



Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (rnn): RNN(100, 64, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=4, bias=True)
)
Total parameters:  2147164



Epoch: 1


100%|██████████| 118/118 [00:05<00:00, 23.19it/s]


Train Loss : 1.263
Epoch: 2


100%|██████████| 118/118 [00:06<00:00, 18.25it/s]


Train Loss : 1.011
Epoch: 3


100%|██████████| 118/118 [00:05<00:00, 23.34it/s]


Train Loss : 0.934
Epoch: 4


100%|██████████| 118/118 [00:05<00:00, 19.90it/s]


Train Loss : 0.902
Epoch: 5


100%|██████████| 118/118 [00:05<00:00, 20.99it/s]


Train Loss : 0.884
Epoch: 6


100%|██████████| 118/118 [00:05<00:00, 23.54it/s]


Train Loss : 0.868
Epoch: 7


100%|██████████| 118/118 [00:06<00:00, 18.35it/s]


Train Loss : 0.864
Epoch: 8


100%|██████████| 118/118 [00:05<00:00, 23.45it/s]


Train Loss : 0.854
Epoch: 9


100%|██████████| 118/118 [00:06<00:00, 18.57it/s]


Train Loss : 0.846
Epoch: 10


100%|██████████| 118/118 [00:05<00:00, 22.66it/s]


Train Loss : 0.844
Epoch: 11


100%|██████████| 118/118 [00:05<00:00, 23.26it/s]


Train Loss : 0.836
Epoch: 12


100%|██████████| 118/118 [00:06<00:00, 18.56it/s]


Train Loss : 0.835
Epoch: 13


100%|██████████| 118/118 [00:04<00:00, 23.95it/s]


Train Loss : 0.837
Epoch: 14


100%|██████████| 118/118 [00:06<00:00, 18.10it/s]


Train Loss : 0.830
Epoch: 15


100%|██████████| 118/118 [00:05<00:00, 23.15it/s]


Train Loss : 0.827

Test Accuracy : 0.882

Classification Report : 
              precision    recall  f1-score   support

       World       0.89      0.89      0.89      1900
      Sports       0.94      0.94      0.94      1900
    Business       0.85      0.83      0.84      1900
    Sci/Tech       0.85      0.86      0.86      1900

    accuracy                           0.88      7600
   macro avg       0.88      0.88      0.88      7600
weighted avg       0.88      0.88      0.88      7600


Confusion Matrix : 
[[1690   65   76   69]
 [  28 1795   49   28]
 [ 108   26 1573  193]
 [  81   28  149 1642]]


This is a bi-direction RNN classifier with 2 layers.The changes in the code were the following in the "model" class:


```
class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)  # Change in code bidirectional = True, num_layers = 2
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.rnn(embeddings)
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward RNN
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward RNN (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
```



In [None]:
# -*- coding: utf-8 -*-
"""

A bi-direction RNN classifier with 2 layers applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""

# HYPER-PARAMETERS
MAX_WORDS = 50
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------


tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)  # Change in code bidirectional = True, num_layers = 2
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.rnn(embeddings)
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward RNN
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward RNN (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds3, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds3.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds3 = torch.cat(Y_preds3)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds3.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds3 = model(X)

            loss = loss_fn(Y_preds3, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds3 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds3)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds3, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds3))

parameters3 = count_parameters(classifier)
average_time_per_epoch3 = (end_time - start_time)/EPOCHS
accuracy3 = accuracy_score(Y_actual, Y_preds3)




Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (rnn): RNN(100, 64, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=4, bias=True)
)
Total parameters:  2171996



Epoch: 1


100%|██████████| 118/118 [00:06<00:00, 18.41it/s]


Train Loss : 1.235
Epoch: 2


100%|██████████| 118/118 [00:05<00:00, 19.77it/s]


Train Loss : 1.026
Epoch: 3


100%|██████████| 118/118 [00:06<00:00, 17.12it/s]


Train Loss : 0.959
Epoch: 4


100%|██████████| 118/118 [00:05<00:00, 20.09it/s]


Train Loss : 0.928
Epoch: 5


100%|██████████| 118/118 [00:07<00:00, 16.62it/s]


Train Loss : 0.905
Epoch: 6


100%|██████████| 118/118 [00:05<00:00, 20.26it/s]


Train Loss : 0.891
Epoch: 7


100%|██████████| 118/118 [00:07<00:00, 16.45it/s]


Train Loss : 0.892
Epoch: 8


100%|██████████| 118/118 [00:05<00:00, 20.34it/s]


Train Loss : 0.874
Epoch: 9


100%|██████████| 118/118 [00:06<00:00, 17.01it/s]


Train Loss : 0.869
Epoch: 10


100%|██████████| 118/118 [00:05<00:00, 20.62it/s]


Train Loss : 0.865
Epoch: 11


100%|██████████| 118/118 [00:06<00:00, 16.96it/s]


Train Loss : 0.868
Epoch: 12


100%|██████████| 118/118 [00:05<00:00, 20.17it/s]


Train Loss : 0.880
Epoch: 13


100%|██████████| 118/118 [00:07<00:00, 16.44it/s]


Train Loss : 0.862
Epoch: 14


100%|██████████| 118/118 [00:05<00:00, 20.25it/s]


Train Loss : 0.848
Epoch: 15


100%|██████████| 118/118 [00:06<00:00, 16.86it/s]


Train Loss : 0.846

Test Accuracy : 0.881

Classification Report : 
              precision    recall  f1-score   support

       World       0.88      0.88      0.88      1900
      Sports       0.93      0.96      0.94      1900
    Business       0.86      0.84      0.85      1900
    Sci/Tech       0.86      0.84      0.85      1900

    accuracy                           0.88      7600
   macro avg       0.88      0.88      0.88      7600
weighted avg       0.88      0.88      0.88      7600


Confusion Matrix : 
[[1680   68  100   52]
 [  41 1825   21   13]
 [  73   32 1594  201]
 [ 111   41  148 1600]]


This is a one-direction LSTM classifier.The changes in the code were the following in the "model" class:



```
class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True) # Change in code - RNN to LSTM
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.lstm(embeddings)
        logits = self.linear(output[:,-1])  # The last output of LSTM is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
```



In [None]:
# -*- coding: utf-8 -*-
"""

A 1 direction LSTM classifier applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""

import torch
import time 
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torch import nn
from torch.nn import functional as F
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 50
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------


tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True) # Change in code - RNN to LSTM
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.lstm(embeddings)
        logits = self.linear(output[:,-1])  # The last output of LSTM is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds4, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds4.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds4 = torch.cat(Y_preds4)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds4.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds4 = model(X)

            loss = loss_fn(Y_preds4, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds4 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds4)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds4, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds4))

parameters4 = count_parameters(classifier)
average_time_per_epoch4 = (end_time - start_time)/EPOCHS
accuracy4 = accuracy_score(Y_actual, Y_preds4)



Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (lstm): LSTM(100, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=4, bias=True)
)
Total parameters:  2168156



Epoch: 1


100%|██████████| 118/118 [00:05<00:00, 20.67it/s]


Train Loss : 1.339
Epoch: 2


100%|██████████| 118/118 [00:06<00:00, 18.10it/s]


Train Loss : 1.072
Epoch: 3


100%|██████████| 118/118 [00:05<00:00, 21.73it/s]


Train Loss : 0.948
Epoch: 4


100%|██████████| 118/118 [00:06<00:00, 17.57it/s]


Train Loss : 0.908
Epoch: 5


100%|██████████| 118/118 [00:05<00:00, 21.18it/s]


Train Loss : 0.889
Epoch: 6


100%|██████████| 118/118 [00:06<00:00, 17.37it/s]


Train Loss : 0.877
Epoch: 7


100%|██████████| 118/118 [00:05<00:00, 21.92it/s]


Train Loss : 0.873
Epoch: 8


100%|██████████| 118/118 [00:06<00:00, 18.43it/s]


Train Loss : 0.861
Epoch: 9


100%|██████████| 118/118 [00:05<00:00, 20.18it/s]


Train Loss : 0.854
Epoch: 10


100%|██████████| 118/118 [00:05<00:00, 21.08it/s]


Train Loss : 0.851
Epoch: 11


100%|██████████| 118/118 [00:06<00:00, 17.70it/s]


Train Loss : 0.848
Epoch: 12


100%|██████████| 118/118 [00:05<00:00, 21.98it/s]


Train Loss : 0.857
Epoch: 13


100%|██████████| 118/118 [00:06<00:00, 17.11it/s]


Train Loss : 0.846
Epoch: 14


100%|██████████| 118/118 [00:05<00:00, 21.94it/s]


Train Loss : 0.835
Epoch: 15


100%|██████████| 118/118 [00:06<00:00, 17.62it/s]


Train Loss : 0.831

Test Accuracy : 0.885

Classification Report : 
              precision    recall  f1-score   support

       World       0.92      0.87      0.89      1900
      Sports       0.93      0.95      0.94      1900
    Business       0.84      0.85      0.85      1900
    Sci/Tech       0.85      0.87      0.86      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600


Confusion Matrix : 
[[1653   71  116   60]
 [  33 1806   16   45]
 [  71   24 1614  191]
 [  42   34  171 1653]]


This is a bi-direction LSTM classifier. The changes in the code were the following in the "model" class:



```
class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True) # Change in code bidirectional = True, RNN to LSTM
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.lstm(embeddings) # Change in code - rnn to lstm
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward LSTM
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward LSTM (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
```



In [None]:
# -*- coding: utf-8 -*-
"""

A bi-direction LSTM classifier applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 50
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------


tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True) # Change in code bidirectional = True, RNN to LSTM
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.lstm(embeddings) # Change in code - rnn to lstm
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward LSTM
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward LSTM (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds5, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds5.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds5 = torch.cat(Y_preds5)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds5.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds5 = model(X)

            loss = loss_fn(Y_preds5, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds5 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds5)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds5, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds5))


parameters5 = count_parameters(classifier)
average_time_per_epoch5 = (end_time - start_time)/EPOCHS
accuracy5 = accuracy_score(Y_actual, Y_preds5)



Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (lstm): LSTM(100, 64, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=4, bias=True)
)
Total parameters:  2210908



Epoch: 1


100%|██████████| 118/118 [00:07<00:00, 15.24it/s]


Train Loss : 1.220
Epoch: 2


100%|██████████| 118/118 [00:06<00:00, 18.28it/s]


Train Loss : 0.933
Epoch: 3


100%|██████████| 118/118 [00:07<00:00, 15.38it/s]


Train Loss : 0.878
Epoch: 4


100%|██████████| 118/118 [00:06<00:00, 18.68it/s]


Train Loss : 0.855
Epoch: 5


100%|██████████| 118/118 [00:07<00:00, 15.43it/s]


Train Loss : 0.841
Epoch: 6


100%|██████████| 118/118 [00:06<00:00, 18.49it/s]


Train Loss : 0.831
Epoch: 7


100%|██████████| 118/118 [00:07<00:00, 15.28it/s]


Train Loss : 0.824
Epoch: 8


100%|██████████| 118/118 [00:06<00:00, 18.48it/s]


Train Loss : 0.818
Epoch: 9


100%|██████████| 118/118 [00:07<00:00, 15.43it/s]


Train Loss : 0.813
Epoch: 10


100%|██████████| 118/118 [00:06<00:00, 18.28it/s]


Train Loss : 0.811
Epoch: 11


100%|██████████| 118/118 [00:07<00:00, 15.28it/s]


Train Loss : 0.806
Epoch: 12


100%|██████████| 118/118 [00:06<00:00, 18.64it/s]


Train Loss : 0.803
Epoch: 13


100%|██████████| 118/118 [00:07<00:00, 15.60it/s]


Train Loss : 0.800
Epoch: 14


100%|██████████| 118/118 [00:06<00:00, 18.82it/s]


Train Loss : 0.800
Epoch: 15


100%|██████████| 118/118 [00:07<00:00, 15.25it/s]


Train Loss : 0.797

Test Accuracy : 0.899

Classification Report : 
              precision    recall  f1-score   support

       World       0.93      0.89      0.91      1900
      Sports       0.94      0.97      0.95      1900
    Business       0.88      0.84      0.86      1900
    Sci/Tech       0.85      0.89      0.87      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600


Confusion Matrix : 
[[1691   61   92   56]
 [  19 1851   12   18]
 [  55   30 1595  220]
 [  49   36  123 1692]]


This is a bi-direction LSTM classifier with 2 layers. The changes in the code were the following in the "model" class:


```
class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)  # Change in code bidirectional = True, num_layers = 2, RNN to LSTM
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.lstm(embeddings) ## Change in code - rnn to lstm
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward LSTM
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward LSTM (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
```



In [None]:
# -*- coding: utf-8 -*-
"""

A bi-direction LSTM classifier with 2 layers applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 50
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------


tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)  # Change in code bidirectional = True, num_layers = 2, RNN to LSTM
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.lstm(embeddings) ## Change in code - rnn to lstm
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward LSTM
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward LSTM (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds6, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds6.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds6 = torch.cat(Y_preds6)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds6.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds6 = model(X)

            loss = loss_fn(Y_preds6, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()

######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds6 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds6)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds6, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds6))

parameters6 = count_parameters(classifier)
average_time_per_epoch6 = (end_time - start_time)/EPOCHS
accuracy6 = accuracy_score(Y_actual, Y_preds6)



Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (lstm): LSTM(100, 64, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=4, bias=True)
)
Total parameters:  2310236



Epoch: 1


100%|██████████| 118/118 [00:07<00:00, 14.78it/s]


Train Loss : 1.170
Epoch: 2


100%|██████████| 118/118 [00:06<00:00, 17.48it/s]


Train Loss : 0.924
Epoch: 3


100%|██████████| 118/118 [00:07<00:00, 14.83it/s]


Train Loss : 0.876
Epoch: 4


100%|██████████| 118/118 [00:06<00:00, 17.76it/s]


Train Loss : 0.857
Epoch: 5


100%|██████████| 118/118 [00:07<00:00, 14.85it/s]


Train Loss : 0.843
Epoch: 6


100%|██████████| 118/118 [00:07<00:00, 16.72it/s]


Train Loss : 0.833
Epoch: 7


100%|██████████| 118/118 [00:07<00:00, 15.93it/s]


Train Loss : 0.828
Epoch: 8


100%|██████████| 118/118 [00:07<00:00, 15.41it/s]


Train Loss : 0.822
Epoch: 9


100%|██████████| 118/118 [00:06<00:00, 17.15it/s]


Train Loss : 0.816
Epoch: 10


100%|██████████| 118/118 [00:07<00:00, 14.95it/s]


Train Loss : 0.813
Epoch: 11


100%|██████████| 118/118 [00:06<00:00, 18.24it/s]


Train Loss : 0.809
Epoch: 12


100%|██████████| 118/118 [00:07<00:00, 15.27it/s]


Train Loss : 0.807
Epoch: 13


100%|██████████| 118/118 [00:06<00:00, 18.19it/s]


Train Loss : 0.805
Epoch: 14


100%|██████████| 118/118 [00:07<00:00, 14.83it/s]


Train Loss : 0.805
Epoch: 15


100%|██████████| 118/118 [00:06<00:00, 18.07it/s]


Train Loss : 0.802

Test Accuracy : 0.895

Classification Report : 
              precision    recall  f1-score   support

       World       0.93      0.87      0.90      1900
      Sports       0.96      0.94      0.95      1900
    Business       0.87      0.86      0.86      1900
    Sci/Tech       0.84      0.91      0.87      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600


Confusion Matrix : 
[[1649   57   99   95]
 [  43 1788   30   39]
 [  47   16 1635  202]
 [  36   11  123 1730]]


The results of text that were missclasified by all models and the most frequent pair of correct category and wrong prediction . The matrix of the accuracy, parameters and time cost per epoch of all the above models is below this block.

In [None]:
import numpy as np

predictions = list(zip(Y_preds1, Y_preds2, Y_preds3, Y_preds4, Y_preds5, Y_preds6))


misclassified_indices = []
for i, (pred1, pred2, pred3, pred4, pred5, pred6) in enumerate(predictions):
    if (pred1 != Y_actual[i]) and (pred2 != Y_actual[i]) and (pred3 != Y_actual[i]) and (pred4 != Y_actual[i]) and (pred5 != Y_actual[i]) and (pred6 != Y_actual[i]):
        misclassified_indices.append(i)


misclassified_counts = {category: 0 for category in np.unique(Y_actual)}


pair_counts = {}

for idx in misclassified_indices:
    correct_category = Y_actual[idx]
    misclassified_counts[correct_category] += 1

    pair = (correct_category, Y_preds1[idx])  

    if pair in pair_counts:
        pair_counts[pair] += 1
    else:
        pair_counts[pair] = 1

label_to_category = {
    "0": "0:World",
    "1": "1:Sports",
    "2": "2:Business",
    "3": "3:Sci/Tech",
}


misclassified_index = misclassified_indices[0]


misclassified_text = test_dataset[misclassified_index][1]


correct_category = Y_actual[misclassified_index]
predicted_category = Y_preds1[misclassified_index]


print(f"Misclassified Text (Index: {misclassified_index}):")
print(misclassified_text)
print(f"\nShould have been classified as: {label_to_category[str(correct_category)]}")
print(f"\nWas classified as: {label_to_category[str(predicted_category)]}\n")

for category in sorted(misclassified_counts.keys()):
    category_name = label_to_category[str(category)]
    count = misclassified_counts[category]
    print(f"{category_name}: {count} samples")


most_frequent_pair = max(pair_counts, key=pair_counts.get)
print("\nThe most frequent pair of correct category and wrong prediction:")
print(f"Correct category: {label_to_category[str(most_frequent_pair[0])]}, Wrong prediction: {label_to_category[str(most_frequent_pair[1])]}, Occurrences: {pair_counts[most_frequent_pair]}")



Misclassified Text (Index: 9):
Card fraud unit nets 36,000 cards In its first two years, the UK's dedicated card fraud unit, has recovered 36,000 stolen cards and 171 arrests - and estimates it saved 65m.

Should have been classified as: 3:Sci/Tech

Was classified as: 1:Sports

0:World: 94 samples
1:Sports: 1 samples
2:Business: 107 samples
3:Sci/Tech: 66 samples

The most frequent pair of correct category and wrong prediction:
Correct category: 2:Business, Wrong prediction: 1:Sports, Occurrences: 85


The matrix of the accuracy, parameters and time cost per epoch of all the above models with MAX_WORDS set to 50. As we can see in the results there is a minor increase in accuracy for all models except from the 1 directional RNN which has its accuracy severely decreased( 0.868 to 0.377 ). The increase in accuracy is most significant in the the bi-directional LSTM with 1 layer and the highest accuracy is also achieved in the bi-directional LSTM with 1 layer( 0.890 to 0.899 ). There is a significant increase in time ~0.5-1.2 sec per epoch for each model from the 25 MAX_WORDS counterpart. Like before, the time cost per epoch increases as we progress from simpler to more complex architectures.

In [None]:
import pandas as pd

def format_values(value):
    if isinstance(value, float):
        if value.is_integer():
            return f"{int(value):,}"
        else:
            return f"{value:.3f}"
    return value



data = {
    "1RNN": [accuracy1, parameters1, average_time_per_epoch1],
    "1-Bi-RNN": [accuracy2, parameters2, average_time_per_epoch2],
    "2-Bi-RNN": [accuracy3, parameters3, average_time_per_epoch3],
    "1LSTM": [accuracy4, parameters4, average_time_per_epoch4],
    "1Bi-LSTM": [accuracy5, parameters5, average_time_per_epoch5],
    "2Bi-LSTM": [accuracy6, parameters6, average_time_per_epoch6],
}

index = ["Accuracy (%)", "Parameters", "Time cost per epoch (s)"]

df = pd.DataFrame(data, index=index)

df = df.applymap(format_values)

df = df.style \
    .set_properties(**{'font-weight': 'bold', 'border': '2px solid black'}) \
    .set_table_styles([dict(selector='th', props=[('font-weight', 'bold'), ('border', '1px solid black')])])


display(df)

Unnamed: 0,1RNN,1-Bi-RNN,2-Bi-RNN,1LSTM,1Bi-LSTM,2Bi-LSTM
Accuracy (%),0.377,0.882,0.881,0.885,0.899,0.895
Parameters,2136284.0,2147164.0,2171996.0,2168156.0,2210908.0,2310236.0
Time cost per epoch (s),5.167,5.621,6.445,6.075,7.08,7.299
