In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import load_dataset
from collections import Counter
from tqdm import tqdm
from sklearn.metrics import classification_report
from torchtext.vocab import vocab

In [None]:
class EmotionDataset(Dataset):
    """
    Multi-label text dataset that tokenizes, indexes and pads raw texts.

    Every text is tokenized, mapped to vocabulary indices and right-padded so
    that all sequences in the dataset share one length. Every label list is
    turned into a multi-hot float vector.

    Args:
        texts: Iterable of raw strings.
        labels: Sized iterable whose items are iterables of integer class ids
            for the text at the same position.
        tokenizer: Callable that maps a string to a list of tokens.
        vocabulary: Optional token -> index mapping to reuse (for example the
            one built by the training dataset). When omitted, a torchtext
            vocabulary is built from ``texts``.
        num_classes: Width of the multi-hot label vectors (default 28).

    Attributes:
        vocabulary: The token -> index mapping in use.
        pad_index: Index used to pad short sequences.
        inputs: Long tensor of shape (num_texts, longest_sequence).
        labels: Float tensor of shape (num_texts, num_classes).
    """
    PAD_TOKEN = "<pad>"
    UNK_TOKEN = "<unk>"

    def __init__(self, texts, labels, tokenizer, vocabulary=None, num_classes=28):
        self.tokenizer = tokenizer
        self.num_classes = num_classes
        self.tokenized_texts = [self.tokenizer(text) for text in texts]
        self.labels = self._preprocess_labels(labels)

        # Build a vocabulary when none was supplied
        if vocabulary is None:
            counter = Counter(word for tokens in self.tokenized_texts for word in tokens)
            # "<pad>" is listed first so it receives index 0; padding therefore
            # no longer shares an index with the unknown-word token.
            self.vocabulary = vocab(counter, min_freq=1, specials=[self.PAD_TOKEN, self.UNK_TOKEN])
            self.vocabulary.set_default_index(self.vocabulary[self.UNK_TOKEN])
        else:
            self.vocabulary = vocabulary

        # A supplied vocabulary without a "<pad>" entry keeps the previous padding value of 0
        if self.PAD_TOKEN in self.vocabulary:
            self.pad_index = self.vocabulary[self.PAD_TOKEN]
        else:
            self.pad_index = 0

        # Convert tokens to vocabulary indices
        self.numerical_texts = [
            [self.vocabulary[token] for token in tokens]
            for tokens in self.tokenized_texts
        ]

        if self.numerical_texts:
            # An explicit integer dtype is required: torch.tensor([]) would
            # otherwise be a float tensor for texts that tokenize to nothing.
            self.inputs = pad_sequence(
                [torch.tensor(seq, dtype=torch.long) for seq in self.numerical_texts],
                batch_first=True,
                padding_value=self.pad_index
            )
        else:
            # pad_sequence rejects an empty list, so represent "no texts" explicitly
            self.inputs = torch.empty((0, 0), dtype=torch.long)

    def _preprocess_labels(self, labels):
        """Return a (len(labels), num_classes) float tensor with 1.0 at every listed class id."""
        num_classes = getattr(self, "num_classes", 28)
        encoded_labels = torch.zeros((len(labels), num_classes), dtype=torch.float)

        for row, label_list in enumerate(labels):
            for label in label_list:
                encoded_labels[row, int(label)] = 1.0

        return encoded_labels

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the (padded index tensor, multi-hot label tensor) pair at ``idx``."""
        return self.inputs[idx], self.labels[idx]

In [None]:
class CNN(nn.Module):
    """
    A Convolutional Neural Network for multi-label text classification.

    Token indices are embedded, passed through one convolution per kernel
    size, max-pooled over the sequence axis, concatenated, regularised with
    dropout and projected to one sigmoid output per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        num_classes: Number of output probabilities per example.
        kernel_sizes: Iterable of convolution heights (tokens per window).
        num_filters: Number of filters for each kernel size.
        dropout: Dropout probability applied before the final layer (default 0.5).
        padding_idx: Optional index forwarded to ``nn.Embedding`` as ``padding_idx``.
            It is also the index used to lengthen too-short inputs (0 when omitted).

    Raises:
        ValueError: If ``kernel_sizes`` is empty.
    """
    def __init__(self, vocab_size, embed_dim, num_classes, kernel_sizes, num_filters,
                 dropout=0.5, padding_idx=None):
        super().__init__()
        kernel_sizes = list(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel size")

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim)) for k in kernel_sizes
        ])
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

        # A convolution cannot run on fewer rows than its kernel height, so
        # forward() pads any shorter input up to the tallest kernel.
        self.min_seq_len = max(kernel_sizes)
        # nn.Embedding normalises a negative padding_idx, so read it back from the layer
        self.pad_index = 0 if self.embedding.padding_idx is None else self.embedding.padding_idx

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token indices to (batch, num_classes) probabilities."""
        missing = self.min_seq_len - x.size(1)
        if missing > 0:
            # Right-pad short batches so every kernel fits at least once
            x = F.pad(x, (0, missing), value=self.pad_index)

        x = self.embedding(x)
        x = x.unsqueeze(1)  # add the single input channel expected by Conv2d

        # Convolution + ReLU; the width axis collapses to 1 and is squeezed away
        conv_outputs = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]

        # Max-pool over the sequence axis, then concatenate the pooled features
        pooled_outputs = [output.max(dim=2).values for output in conv_outputs]
        x = torch.cat(pooled_outputs, dim=1)

        x = self.dropout(x)
        x = self.fc(x)

        return self.sigmoid(x)  # per-class probabilities


In [None]:
def train_model(model, train_loader, device, num_epochs=5, lr=0.001):
    """
    Train ``model`` with binary cross-entropy loss and the Adam optimizer.

    Args:
        model: Module whose outputs are probabilities (it is fed to ``nn.BCELoss``).
        train_loader: Iterable yielding ``(inputs, labels)`` tensor batches.
        device: Device the batches are moved to before the forward pass.
        num_epochs: Number of passes over ``train_loader`` (default 5).
        lr: Adam learning rate (default 0.001).

    Returns:
        list[float]: Mean batch loss of every epoch, in order.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    epoch_losses = []

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0  # counted while iterating so loaders without __len__ also work
        for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Fail clearly instead of dividing by zero below
            raise ValueError("train_loader yielded no batches; nothing to train on")

        mean_loss = running_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")

    return epoch_losses

In [None]:
def evaluate(model, dataloader, device=None, threshold=0.5, class_names=None):
    """
    Run ``model`` over ``dataloader`` and return a classification report.

    Args:
        model: Module that returns per-class probabilities.
        dataloader: Iterable yielding ``(inputs, labels)`` tensor batches.
        device: Device the inputs are moved to. When omitted, the device of the
            model's first parameter is used (CPU if the model has no parameters),
            so the function no longer depends on a notebook-level ``device`` variable.
        threshold: A probability strictly above this value counts as a positive
            prediction (default 0.5).
        class_names: Label names, in class-index order, passed to the report as
            ``target_names``. Defaults to the 28 emotion names listed below.

    Returns:
        The value returned by ``classification_report`` for the collected
        labels and thresholded predictions.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    if class_names is None:
        class_names = [
            "admiration", "amusement", "anger", "annoyance", "approval", "caring",
            "confusion", "curiosity", "desire", "disappointment", "disapproval", "disgust",
            "embarrassment", "excitement", "fear", "gratitude", "grief", "joy",
            "love", "nervousness", "optimism", "pride", "realization", "relief",
            "remorse", "sadness", "surprise", "neutral"
        ]

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(dataloader, desc="Evaluating"):
            outputs = model(inputs.to(device))
            predictions = (outputs > threshold).int()  # threshold the probabilities
            all_predictions.append(predictions.cpu())
            all_labels.append(labels.cpu())

    if not all_predictions:
        # torch.cat on an empty list raises an opaque error; fail with a clear message
        raise ValueError("dataloader yielded no batches; nothing to evaluate")

    # Concatenate all batch predictions and labels into single arrays
    all_predictions = torch.cat(all_predictions, dim=0).numpy()
    all_labels = torch.cat(all_labels, dim=0).numpy()

    return classification_report(all_labels, all_predictions, target_names=class_names, zero_division=0)

In [None]:
# Load the GoEmotions dataset and display its splits
dataset = load_dataset("google-research-datasets/go_emotions")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/350k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

In [None]:
# Materialise the train and test splits as pandas DataFrames
train_df, test_df = (pd.DataFrame(dataset[split]) for split in ("train", "test"))

In [None]:
# Cast every label id to a plain int to prevent problems during preprocessing
train_df['labels'] = train_df['labels'].apply(lambda ids: list(map(int, ids)))
test_df['labels'] = test_df['labels'].apply(lambda ids: list(map(int, ids)))

In [None]:
# Build the "basic_english" tokenizer used for both the train and test texts
tokenizer = get_tokenizer("basic_english")

In [None]:
# Build the datasets; the test split reuses the vocabulary built from the training split
train_dataset = EmotionDataset(
    texts=train_df["text"],
    labels=train_df["labels"],
    tokenizer=tokenizer,
)
test_dataset = EmotionDataset(
    texts=test_df["text"],
    labels=test_df["labels"],
    tokenizer=tokenizer,
    vocabulary=train_dataset.vocabulary,
)

In [None]:
# Wrap the datasets in DataLoaders; only the training batches are shuffled
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=2,
    shuffle=False,
)

In [None]:
# Hyperparameters for the model (these values gave the best results)
SEED = 42
# Seed PyTorch's global RNG before the model is created so that weight
# initialisation is repeatable when the notebook is re-run from a fresh kernel.
torch.manual_seed(SEED)

vocab_size = len(train_dataset.vocabulary)
embed_dim = 50
num_classes = 28
kernel_sizes = [3, 4, 5]
num_filters = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 5

model = CNN(vocab_size, embed_dim, num_classes, kernel_sizes, num_filters).to(device)

In [None]:
# Train the model; pass the configured epoch count explicitly so that changing
# `num_epochs` in the hyperparameter cell actually takes effect.
train_model(model, train_dataloader, device, num_epochs=num_epochs)

Epoch 1/5: 100%|██████████| 21705/21705 [01:04<00:00, 336.75it/s]


Epoch 1, Loss: 0.1376


Epoch 2/5: 100%|██████████| 21705/21705 [00:57<00:00, 379.50it/s]


Epoch 2, Loss: 0.1264


Epoch 3/5: 100%|██████████| 21705/21705 [00:57<00:00, 377.38it/s]


Epoch 3, Loss: 0.1241


Epoch 4/5: 100%|██████████| 21705/21705 [00:57<00:00, 377.15it/s]


Epoch 4, Loss: 0.1226


Epoch 5/5: 100%|██████████| 21705/21705 [00:58<00:00, 373.11it/s]

Epoch 5, Loss: 0.1223





In [None]:
# Evaluate on the test split and print the per-class classification report
report = evaluate(model, test_dataloader)
print("Classification Report:", report, sep="\n")

Evaluating: 100%|██████████| 2714/2714 [00:02<00:00, 1272.48it/s]


Classification Report:
                precision    recall  f1-score   support

    admiration       0.72      0.34      0.46       504
     amusement       0.82      0.54      0.65       264
         anger       0.71      0.05      0.09       198
     annoyance       0.00      0.00      0.00       320
      approval       0.84      0.05      0.09       351
        caring       0.00      0.00      0.00       135
     confusion       1.00      0.03      0.05       153
     curiosity       0.49      0.18      0.26       284
        desire       0.55      0.07      0.13        83
disappointment       0.00      0.00      0.00       151
   disapproval       0.00      0.00      0.00       267
       disgust       0.94      0.14      0.24       123
 embarrassment       0.00      0.00      0.00        37
    excitement       0.93      0.13      0.22       103
          fear       0.79      0.19      0.31        78
     gratitude       0.94      0.87      0.90       352
         grief       0.0