This is the file whose main focus is to use the pre-trained GloVe embeddings without freezing them.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import time
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torch import nn
from torch.nn import functional as F
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
from torchtext.vocab import GloVe


The initialization of the embedding layer using pre-trained GloVe embeddings for the words in the vocabulary.  For words not found in the GloVe embeddings, it assigns them random embeddings within a specified range. The function also prints the number of words for which pre-trained embeddings were found out of the total words in the vocabulary. From the previous files, there is only the bellow differences. If one wants to see the differences in the construction of the different models, one should consult the previous files. 

In [3]:
def initialize_embedding_layer(embedding_layer, glove_embeddings, vocab):
    pretrained_weights = torch.zeros((len(vocab), EMBEDDING_DIM))
    found_words = 0
    for idx, word in enumerate(vocab.get_itos()):
        if word in glove_embeddings.stoi:
            pretrained_weights[idx] = glove_embeddings[word]
            found_words += 1
        else:
            pretrained_weights[idx] = torch.FloatTensor(EMBEDDING_DIM).uniform_(-0.6, 0.6)

    embedding_layer.weight.data.copy_(pretrained_weights)
    print(f'Found {found_words} words with pre-trained embeddings out of {len(vocab)} total words.')


In every model the following changes were made:



```

.
.
.
glove = GloVe(name='6B', dim=EMBEDDING_DIM) # Change in code - Creates a glove object using the GloVe pre-trained word embeddings.
.
.
.
classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
initialize_embedding_layer(classifier.embedding_layer, glove, vocab) # Change in code - Call of the initialize_embedding_layer function withe the glove object.
.
.
.

```



In [4]:
# -*- coding: utf-8 -*-
"""

A 1-direction RNN classifier applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------

glove = GloVe(name='6B', dim=EMBEDDING_DIM)
tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.rnn(embeddings)
        logits = self.linear(output[:,-1])  # The last output of RNN is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
initialize_embedding_layer(classifier.embedding_layer, glove, vocab)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds1, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds1.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds1 = torch.cat(Y_preds1)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds1.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds1 = model(X)

            loss = loss_fn(Y_preds1, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds1 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds1)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds1, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds1))

parameters1 = count_parameters(classifier)
average_time_per_epoch1 = (end_time - start_time)/EPOCHS
accuracy1 = accuracy_score(Y_actual, Y_preds1)


.vector_cache/glove.6B.zip: 862MB [02:41, 5.33MB/s]                           
100%|█████████▉| 399999/400000 [00:18<00:00, 21607.37it/s]


Found 20435 words with pre-trained embeddings out of 21254 total words.

Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (rnn): RNN(100, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=4, bias=True)
)
Total parameters:  2136284



Epoch: 1


100%|██████████| 118/118 [00:06<00:00, 17.39it/s]


Train Loss : 1.033
Epoch: 2


100%|██████████| 118/118 [00:04<00:00, 27.62it/s]


Train Loss : 0.883
Epoch: 3


100%|██████████| 118/118 [00:05<00:00, 22.80it/s]


Train Loss : 0.867
Epoch: 4


100%|██████████| 118/118 [00:05<00:00, 22.70it/s]


Train Loss : 0.856
Epoch: 5


100%|██████████| 118/118 [00:04<00:00, 27.59it/s]


Train Loss : 0.850
Epoch: 6


100%|██████████| 118/118 [00:06<00:00, 19.60it/s]


Train Loss : 0.846
Epoch: 7


100%|██████████| 118/118 [00:04<00:00, 26.80it/s]


Train Loss : 0.844
Epoch: 8


100%|██████████| 118/118 [00:04<00:00, 27.12it/s]


Train Loss : 0.839
Epoch: 9


100%|██████████| 118/118 [00:05<00:00, 20.08it/s]


Train Loss : 0.837
Epoch: 10


100%|██████████| 118/118 [00:04<00:00, 27.59it/s]


Train Loss : 0.838
Epoch: 11


100%|██████████| 118/118 [00:04<00:00, 27.27it/s]


Train Loss : 0.837
Epoch: 12


100%|██████████| 118/118 [00:05<00:00, 20.12it/s]


Train Loss : 0.832
Epoch: 13


100%|██████████| 118/118 [00:04<00:00, 27.13it/s]


Train Loss : 0.833
Epoch: 14


100%|██████████| 118/118 [00:04<00:00, 25.01it/s]


Train Loss : 0.835
Epoch: 15


100%|██████████| 118/118 [00:05<00:00, 21.63it/s]


Train Loss : 0.828

Test Accuracy : 0.891

Classification Report : 
              precision    recall  f1-score   support

       World       0.92      0.89      0.90      1900
      Sports       0.94      0.97      0.96      1900
    Business       0.89      0.80      0.84      1900
    Sci/Tech       0.82      0.91      0.86      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600


Confusion Matrix : 
[[1685   67   79   69]
 [  14 1849    9   28]
 [  73   33 1516  278]
 [  63   19   93 1725]]


In [5]:
# -*- coding: utf-8 -*-
"""

A bi-direction RNN classifier applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------

glove = GloVe(name='6B', dim=EMBEDDING_DIM)
tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True) # Change in code bidirectional = True
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.rnn(embeddings)
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward RNN
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward RNN (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
initialize_embedding_layer(classifier.embedding_layer, glove, vocab)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds2, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds2.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds2 = torch.cat(Y_preds2)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds2.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds2 = model(X)

            loss = loss_fn(Y_preds2, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds2 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds2)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds2, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds2))

parameters2 = count_parameters(classifier)
average_time_per_epoch2 = (end_time - start_time)/EPOCHS
accuracy2 = accuracy_score(Y_actual, Y_preds2)


Found 20435 words with pre-trained embeddings out of 21254 total words.

Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (rnn): RNN(100, 64, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=4, bias=True)
)
Total parameters:  2147164



Epoch: 1


100%|██████████| 118/118 [00:06<00:00, 19.24it/s]


Train Loss : 0.990
Epoch: 2


100%|██████████| 118/118 [00:04<00:00, 25.64it/s]


Train Loss : 0.858
Epoch: 3


100%|██████████| 118/118 [00:05<00:00, 23.10it/s]


Train Loss : 0.843
Epoch: 4


100%|██████████| 118/118 [00:05<00:00, 19.89it/s]


Train Loss : 0.834
Epoch: 5


100%|██████████| 118/118 [00:04<00:00, 25.20it/s]


Train Loss : 0.827
Epoch: 6


100%|██████████| 118/118 [00:06<00:00, 19.12it/s]


Train Loss : 0.821
Epoch: 7


100%|██████████| 118/118 [00:04<00:00, 24.78it/s]


Train Loss : 0.817
Epoch: 8


100%|██████████| 118/118 [00:04<00:00, 25.29it/s]


Train Loss : 0.812
Epoch: 9


100%|██████████| 118/118 [00:06<00:00, 19.42it/s]


Train Loss : 0.809
Epoch: 10


100%|██████████| 118/118 [00:04<00:00, 25.54it/s]


Train Loss : 0.806
Epoch: 11


100%|██████████| 118/118 [00:05<00:00, 21.64it/s]


Train Loss : 0.804
Epoch: 12


100%|██████████| 118/118 [00:05<00:00, 22.03it/s]


Train Loss : 0.803
Epoch: 13


100%|██████████| 118/118 [00:04<00:00, 26.09it/s]


Train Loss : 0.801
Epoch: 14


100%|██████████| 118/118 [00:06<00:00, 19.30it/s]


Train Loss : 0.800
Epoch: 15


100%|██████████| 118/118 [00:04<00:00, 25.38it/s]


Train Loss : 0.798

Test Accuracy : 0.904

Classification Report : 
              precision    recall  f1-score   support

       World       0.89      0.91      0.90      1900
      Sports       0.95      0.97      0.96      1900
    Business       0.88      0.86      0.87      1900
    Sci/Tech       0.89      0.88      0.88      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600


Confusion Matrix : 
[[1730   53   77   40]
 [  26 1844   15   15]
 [  89   24 1627  160]
 [  88   19  122 1671]]


In [6]:
# -*- coding: utf-8 -*-
"""

A bi-direction RNN classifier with 2 layers applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------

glove = GloVe(name='6B', dim=EMBEDDING_DIM)
tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)  # Change in code bidirectional = True, num_layers = 2
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.rnn(embeddings)
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward RNN
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward RNN (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
initialize_embedding_layer(classifier.embedding_layer, glove, vocab)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds3, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds3.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds3 = torch.cat(Y_preds3)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds3.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds3 = model(X)

            loss = loss_fn(Y_preds3, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds3 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds3)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds3, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds3))

parameters3 = count_parameters(classifier)
average_time_per_epoch3 = (end_time - start_time)/EPOCHS
accuracy3 = accuracy_score(Y_actual, Y_preds3)



Found 20435 words with pre-trained embeddings out of 21254 total words.

Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (rnn): RNN(100, 64, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=4, bias=True)
)
Total parameters:  2171996



Epoch: 1


100%|██████████| 118/118 [00:05<00:00, 19.83it/s]


Train Loss : 0.947
Epoch: 2


100%|██████████| 118/118 [00:05<00:00, 23.06it/s]


Train Loss : 0.858
Epoch: 3


100%|██████████| 118/118 [00:06<00:00, 18.00it/s]


Train Loss : 0.844
Epoch: 4


100%|██████████| 118/118 [00:05<00:00, 22.26it/s]


Train Loss : 0.835
Epoch: 5


100%|██████████| 118/118 [00:06<00:00, 18.04it/s]


Train Loss : 0.831
Epoch: 6


100%|██████████| 118/118 [00:05<00:00, 22.74it/s]


Train Loss : 0.826
Epoch: 7


100%|██████████| 118/118 [00:05<00:00, 22.60it/s]


Train Loss : 0.821
Epoch: 8


100%|██████████| 118/118 [00:06<00:00, 18.32it/s]


Train Loss : 0.818
Epoch: 9


100%|██████████| 118/118 [00:05<00:00, 22.85it/s]


Train Loss : 0.815
Epoch: 10


100%|██████████| 118/118 [00:06<00:00, 18.18it/s]


Train Loss : 0.813
Epoch: 11


100%|██████████| 118/118 [00:05<00:00, 23.57it/s]


Train Loss : 0.832
Epoch: 12


100%|██████████| 118/118 [00:05<00:00, 20.85it/s]


Train Loss : 0.830
Epoch: 13


100%|██████████| 118/118 [00:05<00:00, 19.87it/s]


Train Loss : 0.819
Epoch: 14


100%|██████████| 118/118 [00:05<00:00, 22.75it/s]


Train Loss : 0.814
Epoch: 15


100%|██████████| 118/118 [00:06<00:00, 18.02it/s]


Train Loss : 0.813

Test Accuracy : 0.905

Classification Report : 
              precision    recall  f1-score   support

       World       0.92      0.90      0.91      1900
      Sports       0.93      0.98      0.95      1900
    Business       0.87      0.88      0.87      1900
    Sci/Tech       0.90      0.87      0.88      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600


Confusion Matrix : 
[[1705   67   84   44]
 [  14 1856   21    9]
 [  69   30 1668  133]
 [  72   34  146 1648]]


In [7]:
# -*- coding: utf-8 -*-
"""

A 1 direction LSTM classifier applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""

import torch
import time 
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torch import nn
from torch.nn import functional as F
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------

glove = GloVe(name='6B', dim=EMBEDDING_DIM)
tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True) # Change in code - RNN to LSTM
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, hidden = self.lstm(embeddings)
        logits = self.linear(output[:,-1])  # The last output of RNN is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
initialize_embedding_layer(classifier.embedding_layer, glove, vocab)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds4, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds4.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds4 = torch.cat(Y_preds4)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds4.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds4 = model(X)

            loss = loss_fn(Y_preds4, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds4 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds4)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds4, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds4))

parameters4 = count_parameters(classifier)
average_time_per_epoch4 = (end_time - start_time)/EPOCHS
accuracy4 = accuracy_score(Y_actual, Y_preds4)


Found 20435 words with pre-trained embeddings out of 21254 total words.

Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (lstm): LSTM(100, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=4, bias=True)
)
Total parameters:  2168156



Epoch: 1


100%|██████████| 118/118 [00:06<00:00, 18.24it/s]


Train Loss : 1.033
Epoch: 2


100%|██████████| 118/118 [00:04<00:00, 23.85it/s]


Train Loss : 0.858
Epoch: 3


100%|██████████| 118/118 [00:05<00:00, 21.95it/s]


Train Loss : 0.841
Epoch: 4


100%|██████████| 118/118 [00:05<00:00, 19.82it/s]


Train Loss : 0.831
Epoch: 5


100%|██████████| 118/118 [00:04<00:00, 24.37it/s]


Train Loss : 0.824
Epoch: 6


100%|██████████| 118/118 [00:06<00:00, 18.70it/s]


Train Loss : 0.818
Epoch: 7


100%|██████████| 118/118 [00:04<00:00, 24.57it/s]


Train Loss : 0.813
Epoch: 8


100%|██████████| 118/118 [00:04<00:00, 23.88it/s]


Train Loss : 0.811
Epoch: 9


100%|██████████| 118/118 [00:06<00:00, 18.40it/s]


Train Loss : 0.809
Epoch: 10


100%|██████████| 118/118 [00:04<00:00, 24.40it/s]


Train Loss : 0.806
Epoch: 11


100%|██████████| 118/118 [00:06<00:00, 18.70it/s]


Train Loss : 0.804
Epoch: 12


100%|██████████| 118/118 [00:04<00:00, 24.53it/s]


Train Loss : 0.802
Epoch: 13


100%|██████████| 118/118 [00:04<00:00, 24.12it/s]


Train Loss : 0.800
Epoch: 14


100%|██████████| 118/118 [00:06<00:00, 19.04it/s]


Train Loss : 0.799
Epoch: 15


100%|██████████| 118/118 [00:05<00:00, 23.45it/s]


Train Loss : 0.798

Test Accuracy : 0.908

Classification Report : 
              precision    recall  f1-score   support

       World       0.93      0.90      0.91      1900
      Sports       0.95      0.97      0.96      1900
    Business       0.88      0.86      0.87      1900
    Sci/Tech       0.87      0.90      0.89      1900

    accuracy                           0.91      7600
   macro avg       0.91      0.91      0.91      7600
weighted avg       0.91      0.91      0.91      7600


Confusion Matrix : 
[[1707   62   78   53]
 [  19 1843   15   23]
 [  60   23 1639  178]
 [  47   11  130 1712]]


In [8]:
# -*- coding: utf-8 -*-
"""

A bi-direction LSTM classifier applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------

glove = GloVe(name='6B', dim=EMBEDDING_DIM)
tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True) # Change in code bidirectional = True, RNN to LSTM
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.lstm(embeddings) # Change in code - rnn to lstm
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward LSTM
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward LSTM (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
initialize_embedding_layer(classifier.embedding_layer, glove, vocab)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds5, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds5.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds5 = torch.cat(Y_preds5)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds5.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds5 = model(X)

            loss = loss_fn(Y_preds5, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()
######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds5 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds5)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds5, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds5))


parameters5 = count_parameters(classifier)
average_time_per_epoch5 = (end_time - start_time)/EPOCHS
accuracy5 = accuracy_score(Y_actual, Y_preds5)


Found 20435 words with pre-trained embeddings out of 21254 total words.

Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (lstm): LSTM(100, 64, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=4, bias=True)
)
Total parameters:  2210908



Epoch: 1


100%|██████████| 118/118 [00:05<00:00, 20.87it/s]


Train Loss : 0.986
Epoch: 2


100%|██████████| 118/118 [00:06<00:00, 16.99it/s]


Train Loss : 0.849
Epoch: 3


100%|██████████| 118/118 [00:05<00:00, 20.26it/s]


Train Loss : 0.835
Epoch: 4


100%|██████████| 118/118 [00:06<00:00, 17.78it/s]


Train Loss : 0.826
Epoch: 5


100%|██████████| 118/118 [00:06<00:00, 19.62it/s]


Train Loss : 0.820
Epoch: 6


100%|██████████| 118/118 [00:06<00:00, 19.40it/s]


Train Loss : 0.813
Epoch: 7


100%|██████████| 118/118 [00:06<00:00, 18.28it/s]


Train Loss : 0.809
Epoch: 8


100%|██████████| 118/118 [00:05<00:00, 20.04it/s]


Train Loss : 0.806
Epoch: 9


100%|██████████| 118/118 [00:06<00:00, 17.07it/s]


Train Loss : 0.803
Epoch: 10


100%|██████████| 118/118 [00:05<00:00, 21.05it/s]


Train Loss : 0.801
Epoch: 11


100%|██████████| 118/118 [00:07<00:00, 16.73it/s]


Train Loss : 0.799
Epoch: 12


100%|██████████| 118/118 [00:05<00:00, 20.80it/s]


Train Loss : 0.797
Epoch: 13


100%|██████████| 118/118 [00:07<00:00, 16.75it/s]


Train Loss : 0.796
Epoch: 14


100%|██████████| 118/118 [00:05<00:00, 21.11it/s]


Train Loss : 0.794
Epoch: 15


100%|██████████| 118/118 [00:06<00:00, 16.86it/s]


Train Loss : 0.794

Test Accuracy : 0.912

Classification Report : 
              precision    recall  f1-score   support

       World       0.93      0.90      0.91      1900
      Sports       0.96      0.98      0.97      1900
    Business       0.89      0.87      0.88      1900
    Sci/Tech       0.88      0.90      0.89      1900

    accuracy                           0.91      7600
   macro avg       0.91      0.91      0.91      7600
weighted avg       0.91      0.91      0.91      7600


Confusion Matrix : 
[[1712   53   79   56]
 [  16 1858   14   12]
 [  63   20 1645  172]
 [  55   13  115 1717]]


In [9]:
# -*- coding: utf-8 -*-
"""

A bi-direction LSTM classifier with 2 layers applied to AG_NEWS dataset

Download dataset:
https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset

"""

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HYPER-PARAMETERS
MAX_WORDS = 25
EPOCHS = 15
LEARNING_RATE = 1e-3
BATCH_SIZE = 1024
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

######################################################################
# Read dataset files 
# ------------------


train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

######################################################################
# Data processing 
# -----------------------------

glove = GloVe(name='6B', dim=EMBEDDING_DIM)
tokenizer = get_tokenizer("basic_english")

# All texts are truncated and padded to MAX_WORDS tokens
def collate_batch(batch):
    Y, X = list(zip(*batch))
    Y = torch.tensor(Y) - 1 # Target names in range [0,1,2,3] instead of [1,2,3,4]
    X = [vocab(tokenizer(text)) for text in X]
    # Bringing all samples to MAX_WORDS length. Shorter texts are padded with <PAD> sequences, longer texts are truncated.
    X = [tokens+([vocab['<PAD>']]* (MAX_WORDS-len(tokens))) if len(tokens)<MAX_WORDS else tokens[:MAX_WORDS] for tokens in X]
    return torch.tensor(X, dtype=torch.int32).to(device), Y.to(device) 

train_dataset = [(label,train_data['Title'][i] + ' ' + train_data['Description'][i]) for i,label in enumerate(train_data['Class Index'])]
test_dataset = [(label,test_data['Title'][i] + ' ' + test_data['Description'][i]) for i,label in enumerate(test_data['Class Index'])]

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

target_classes = ["World", "Sports", "Business", "Sci/Tech"]

def build_vocabulary(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

# Vocabulary includes all tokens with at least 10 occurrences in the texts
# Special tokens <PAD> and <UNK> are used for padding sequences and unknown words respectively
vocab = build_vocab_from_iterator(build_vocabulary([train_dataset, test_dataset]), min_freq=10, specials=["<PAD>","<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

######################################################################
# Define the model
# ----------------


class model(nn.Module):
    def __init__(self,input_dim, embedding_dim, hidden_dim, output_dim):
        super(model, self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, bidirectional=True)  # Change in code bidirectional = True, num_layers = 2, RNN to LSTM
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Change in code - multiply by 2 since it's bidirectional
        self.hidden_dim = hidden_dim  # Change in code - set the hidden_dim attribute

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.lstm(embeddings) ## Change in code - rnn to lstm
        forward_output = output[:, -1, :self.hidden_dim]  # Change in code - last output of the forward LSTM
        backward_output = output[:, 0, self.hidden_dim:]  # Change in code - first output of the backward LSTM (since it starts from the end)
        concatenated_output = torch.cat((forward_output, backward_output), dim=1) # Change in code - concatenate the output
        logits = self.linear(concatenated_output) # Change in code - The last output of the forward and the first output of the backward is used for sequence classification
        probs = F.softmax(logits, dim=1)
        return probs
    
    
######################################################################
# Initiate an instance of the model
# ---------------------------------


classifier = model(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, len(target_classes)).to(device)
initialize_embedding_layer(classifier.embedding_layer, glove, vocab)
# Define loss function and opimization algorithm
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([param for param in classifier.parameters() if param.requires_grad == True],lr=LEARNING_RATE)

# Count model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('\nModel:')
print(classifier)
print('Total parameters: ',count_parameters(classifier))
print('\n\n')

######################################################################
# Define functions to train and evaluate the model
# ------------------------------------------------


def EvaluateModel(model, loss_fn, val_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds6, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_actual.append(Y)
            Y_preds6.append(preds.argmax(dim=-1))

        Y_actual = torch.cat(Y_actual)
        Y_preds6 = torch.cat(Y_preds6)
    
    # Returns mean loss, actual labels, predicted labels 
    return torch.tensor(losses).mean(), Y_actual.detach().cpu().numpy(), Y_preds6.detach().cpu().numpy()


def TrainModel(model, loss_fn, optimizer, train_loader, epochs):
    for i in range(1, epochs+1):
        model.train()
        print('Epoch:',i)
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds6 = model(X)

            loss = loss_fn(Y_preds6, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        
start_time = time.time()
TrainModel(classifier, loss_fn, optimizer, train_loader, EPOCHS)
end_time = time.time()

######################################################################
# Evaluate the model with test dataset
# ------------------------------------


_, Y_actual, Y_preds6 = EvaluateModel(classifier, loss_fn, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds6)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds6, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds6))

parameters6 = count_parameters(classifier)
average_time_per_epoch6 = (end_time - start_time)/EPOCHS
accuracy6 = accuracy_score(Y_actual, Y_preds6)


Found 20435 words with pre-trained embeddings out of 21254 total words.

Model:
model(
  (embedding_layer): Embedding(21254, 100)
  (lstm): LSTM(100, 64, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=128, out_features=4, bias=True)
)
Total parameters:  2310236



Epoch: 1


100%|██████████| 118/118 [00:08<00:00, 14.70it/s]


Train Loss : 0.967
Epoch: 2


100%|██████████| 118/118 [00:06<00:00, 17.90it/s]


Train Loss : 0.853
Epoch: 3


100%|██████████| 118/118 [00:07<00:00, 14.85it/s]


Train Loss : 0.839
Epoch: 4


100%|██████████| 118/118 [00:06<00:00, 17.81it/s]


Train Loss : 0.832
Epoch: 5


100%|██████████| 118/118 [00:07<00:00, 14.93it/s]


Train Loss : 0.829
Epoch: 6


100%|██████████| 118/118 [00:06<00:00, 17.55it/s]


Train Loss : 0.821
Epoch: 7


100%|██████████| 118/118 [00:07<00:00, 14.93it/s]


Train Loss : 0.817
Epoch: 8


100%|██████████| 118/118 [00:06<00:00, 17.76it/s]


Train Loss : 0.816
Epoch: 9


100%|██████████| 118/118 [00:07<00:00, 14.76it/s]


Train Loss : 0.812
Epoch: 10


100%|██████████| 118/118 [00:07<00:00, 16.31it/s]


Train Loss : 0.809
Epoch: 11


100%|██████████| 118/118 [00:07<00:00, 15.80it/s]


Train Loss : 0.807
Epoch: 12


100%|██████████| 118/118 [00:07<00:00, 15.83it/s]


Train Loss : 0.806
Epoch: 13


100%|██████████| 118/118 [00:07<00:00, 16.41it/s]


Train Loss : 0.803
Epoch: 14


100%|██████████| 118/118 [00:07<00:00, 15.02it/s]


Train Loss : 0.802
Epoch: 15


100%|██████████| 118/118 [00:06<00:00, 17.28it/s]


Train Loss : 0.801

Test Accuracy : 0.905

Classification Report : 
              precision    recall  f1-score   support

       World       0.89      0.91      0.90      1900
      Sports       0.96      0.96      0.96      1900
    Business       0.88      0.87      0.87      1900
    Sci/Tech       0.89      0.88      0.88      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600


Confusion Matrix : 
[[1726   53   79   42]
 [  42 1825   19   14]
 [  80   13 1650  157]
 [  81    7  136 1676]]


The results of text that were missclasified by all models and the most frequent pair of correct category and wrong prediction . The matrix of the accuracy, parameters and time cost per epoch of all the above models is below this block.

In [10]:
import numpy as np

predictions = list(zip(Y_preds1, Y_preds2, Y_preds3, Y_preds4, Y_preds5, Y_preds6))


misclassified_indices = []
for i, (pred1, pred2, pred3, pred4, pred5, pred6) in enumerate(predictions):
    if (pred1 != Y_actual[i]) and (pred2 != Y_actual[i]) and (pred3 != Y_actual[i]) and (pred4 != Y_actual[i]) and (pred5 != Y_actual[i]) and (pred6 != Y_actual[i]):
        misclassified_indices.append(i)


misclassified_counts = {category: 0 for category in np.unique(Y_actual)}


pair_counts = {}

for idx in misclassified_indices:
    correct_category = Y_actual[idx]
    misclassified_counts[correct_category] += 1

    pair = (correct_category, Y_preds1[idx]) 

    if pair in pair_counts:
        pair_counts[pair] += 1
    else:
        pair_counts[pair] = 1

label_to_category = {
    "0": "0:World",
    "1": "1:Sports",
    "2": "2:Business",
    "3": "3:Sci/Tech",
}


misclassified_index = misclassified_indices[0]


misclassified_text = test_dataset[misclassified_index][1]

correct_category = Y_actual[misclassified_index]
predicted_category = Y_preds1[misclassified_index]

print(f"Misclassified Text (Index: {misclassified_index}):")
print(misclassified_text)
print(f"\nShould have been classified as: {label_to_category[str(correct_category)]}")
print(f"\nWas classified as: {label_to_category[str(predicted_category)]}\n")

for category in sorted(misclassified_counts.keys()):
    category_name = label_to_category[str(category)]
    count = misclassified_counts[category]
    print(f"{category_name}: {count} samples")


most_frequent_pair = max(pair_counts, key=pair_counts.get)
print("\nThe most frequent pair of correct category and wrong prediction:")
print(f"Correct category: {label_to_category[str(most_frequent_pair[0])]}, Wrong prediction: {label_to_category[str(most_frequent_pair[1])]}, Occurrences: {pair_counts[most_frequent_pair]}")



Misclassified Text (Index: 56):
India's Tata expands regional footprint via NatSteel buyout (AFP) AFP - India's Tata Iron and Steel Company Ltd. took a strategic step to expand its Asian footprint with the announcement it will buy the Asia-Pacific steel operations of Singapore's NatSteel Ltd.

Should have been classified as: 0:World

Was classified as: 2:Business

0:World: 132 samples
1:Sports: 21 samples
2:Business: 158 samples
3:Sci/Tech: 101 samples

The most frequent pair of correct category and wrong prediction:
Correct category: 2:Business, Wrong prediction: 3:Sci/Tech, Occurrences: 108


In terms of accuracy, 1Bi-LSTM achieves the highest score at 0.912, closely followed by 2-Bi-RNN at 0.905 and 2Bi-LSTM at 0.905. The 1RNN has the lowest accuracy at 0.891, which is still relatively close to the other models.

When using GloVe embeddings, it is evident that the models without pre-trained embeddings generally achieve lower accuracy. This comparison highlights the benefits of incorporating pre-trained embeddings like GloVe in natural language processing tasks. Pre-trained embeddings capture semantic and syntactic information, which can aid the models in making better predictions and result in higher accuracy.

In [12]:
import pandas as pd

def format_values(value):
    if isinstance(value, float):
        if value.is_integer():
            return f"{int(value):,}"
        else:
            return f"{value:.3f}"
    return value



data = {
    "1RNN": [accuracy1, parameters1, average_time_per_epoch1],
    "1-Bi-RNN": [accuracy2, parameters2, average_time_per_epoch2],
    "2-Bi-RNN": [accuracy3, parameters3, average_time_per_epoch3],
    "1LSTM": [accuracy4, parameters4, average_time_per_epoch4],
    "1Bi-LSTM": [accuracy5, parameters5, average_time_per_epoch5],
    "2Bi-LSTM": [accuracy6, parameters6, average_time_per_epoch6],
}

index = ["Accuracy (%)", "Parameters", "Time cost per epoch (s)"]

df = pd.DataFrame(data, index=index)

df = df.applymap(format_values)

df = df.style \
    .set_properties(**{'font-weight': 'bold', 'border': '2px solid black'}) \
    .set_table_styles([dict(selector='th', props=[('font-weight', 'bold'), ('border', '1px solid black')])])


display(df)

Unnamed: 0,1RNN,1-Bi-RNN,2-Bi-RNN,1LSTM,1Bi-LSTM,2Bi-LSTM
Accuracy (%),0.891,0.904,0.905,0.908,0.912,0.905
Parameters,2136284.0,2147164.0,2171996.0,2168156.0,2210908.0,2310236.0
Time cost per epoch (s),5.033,5.267,5.763,5.485,6.301,7.37
