## Emotion Classification with an RNN

Small parts of this were taken from the Notebook [here](https://colab.research.google.com/drive/1nwCE6b9PXIKhv2hvbqf1oZKIGkXMTi1X#scrollTo=t23zHggkEpc-), see roberta_emotion_class.ipynb for further info.

## Setup

In [1]:
import torch, fasttext, os
import numpy as np
from torch import nn
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence
import torch.nn.functional as F
from torch.utils.data import DataLoader
from typing import List
from transformers import AdamW, get_linear_schedule_with_warmup
from huggingface_hub import hf_hub_download
from torchtext.data import get_tokenizer
import pytorch_lightning as pl
from argparse import Namespace
from sklearn.metrics import classification_report
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
os.environ["HF_DATASETS_CACHE"] = "./data"
os.environ["HF_MODELS_CACHE"] = "./model"
os.environ['TRANSFORMERS_CACHE'] = "./model"


## Load and create dataset

In [3]:
## emotion labels
label2int = {
  "sadness": 0,
  "joy": 1,
  "love": 2,
  "anger": 3,
  "fear": 4,
  "surprise": 5
}

emotions = [ "sadness", "joy", "love", "anger", "fear", "surprise"]

In [4]:
model_path = hf_hub_download(repo_id="facebook/fasttext-en-vectors", filename="model.bin", cache_dir="./ft")
print(model_path)
embedding_model = fasttext.load_model(model_path)

./ft/models--facebook--fasttext-en-vectors/snapshots/a80392390daaee1a91000da45c376d512e1dc555/model.bin


In [5]:
class TokenizersCollateFn:
    def __init__(self, max_tokens=512):
        #self.tokenizer = tokenize.NLTKWordTokenizer()
        self.tokenizer = get_tokenizer("basic_english")

    def __call__(self, batch):
        sequences = [self.tokenizer(x[0]) for x in batch]
        embedded_sequences = [torch.tensor(np.array([embedding_model[word] for word in s])) for s in sequences]
        packed_sequences = pack_sequence(embedded_sequences, enforce_sorted=False)
        labels = torch.tensor([x[1] for x in batch])
        #print("packed sequence in collate", packed_sequences.batch_sizes[0])

        return packed_sequences, labels

In [6]:
batch_size = 32

class EmotionDataModule(pl.LightningDataModule):
    def setup(self, stage):
        train = load_dataset("dair-ai/emotion")["train"]
        self.train_dataset = [(ex['text'], ex['label']) for ex in train]
        val = load_dataset("dair-ai/emotion")["validation"]
        self.val_dataset = [(ex['text'], ex['label']) for ex in val]
        test = load_dataset("dair-ai/emotion")["test"]
        self.test_dataset = [(ex['text'], ex['label']) for ex in test]
        
    def train_dataloader(self):    
        return DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True,
                    collate_fn=TokenizersCollateFn(), pin_memory=True, num_workers=4)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=batch_size, shuffle=False,
                    collate_fn=TokenizersCollateFn())
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=batch_size, shuffle=False,
                    collate_fn=TokenizersCollateFn())

## Building the Model

In [7]:
# from https://github.com/digantamisra98/Mish/blob/b5f006660ac0b4c46e2c6958ad0301d7f9c59651/Mish/Torch/mish.py
@torch.jit.script
def mish(input):
    return input * torch.tanh(F.softplus(input))

class Mish(nn.Module):
    def forward(self, input):
        return mish(input)

In [8]:
class RNNEmotionModel(nn.Module):
    def __init__(self, n_classes, embedding_size=300, hidden_size=128, dropout=0.05):
        super().__init__()
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 64),
            Mish(),
            nn.Linear(64, n_classes)
        )

        # for layer in self.classifier:
        #     if isinstance(layer, nn.Linear):
        #         layer.weight.data.normal_(mean=0.0, std=0.02)
        #         if layer.bias is not None:
        #             layer.bias.data.zero_()

    def forward(self, input_, *args):
        X = input_
        y, _ = self.rnn(X)
        # output is shape (N,L,D∗H_out)
        y, lengths = pad_packed_sequence(y, batch_first=True)
        #print("after padding: ", y.size())
        #y = y[:, 0, :] # last hidden state
        y = torch.stack([y[i, lengths[i]-1, :] for i in range(len(lengths))])
        #print("after taking last: ", y.size())
        # for loop
        y = self.classifier(y)
        #print("after linear: ", y.size())
        return y


### Putting the model together

In [9]:
class RNNEmotionClassifier(pl.LightningModule):
    def __init__(self, hparams):
        super().__init__()
        self.model = RNNEmotionModel(len(emotions))
        self.loss = nn.CrossEntropyLoss() ## combines LogSoftmax() and NLLLoss()
        #self.hparams = hparams
        self.hparams.update(vars(hparams))
        self.training_step_outputs = []
        self.max_val_acc = 0

    def step(self, batch, step_name="train"):
        self.train()
        X, y = batch
        loss = self.loss(self.forward(X), y)
        #print(loss)
        self.training_step_outputs.append(loss)
        loss_key = f"{step_name}_loss"
        tensorboard_logs = {loss_key: loss}
        #self.log(loss_key, loss, batch_size=batch_size)

        return { ("loss" if step_name == "train" else loss_key): loss, 'log': tensorboard_logs,
               "progress_bar": {loss_key: loss}}

    def forward(self, X, *args):
        return self.model(X, *args)

    def training_step(self, batch, batch_idx):
        #print(batch[0])
        return self.step(batch, "train")

    def validation_step(self, batch, batch_idx):
        return self.step(batch, "val")

    def validation_end(self, outputs: List[dict]):
        loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        return {"val_loss": loss}

    def test_step(self, batch, batch_idx):
        return self.step(batch, "test")

    #@lru_cache()
    def total_steps(self):
        return self.hparams.data_size // self.hparams.accumulate_grad_batches * self.hparams.epochs

    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=self.hparams.lr)
        lr_scheduler = get_linear_schedule_with_warmup(
                    optimizer,
                    num_warmup_steps=self.hparams.warmup_steps,
                    num_training_steps=self.total_steps(),
        )
        return [optimizer], [{"scheduler": lr_scheduler, "interval": "step"}]

In [10]:
class LossCallback(pl.Callback):
    def on_train_epoch_end(self, trainer, model):
        if trainer.current_epoch % 4 == 0 and trainer.current_epoch > 14:
            epoch_mean = torch.stack(model.training_step_outputs).mean()
            print(epoch_mean.item(), "\n")
            with torch.no_grad():
                model.eval()
                model.cuda()
                
                true_y, pred_y = [], []
                for i, batch_ in enumerate(trainer.datamodule.train_dataloader()):
                    X, y = batch_
                    batch = X.cuda()
                    y_pred = torch.argmax(model(batch), dim=1)
                    true_y.extend(y.cpu())
                    pred_y.extend(y_pred.cpu())
                correct = (torch.tensor(true_y) == torch.tensor(pred_y)).float().sum() / len(true_y)
                print("on train: ", correct.item())
                
                true_y, pred_y = [], []
                for i, batch_ in enumerate(trainer.datamodule.val_dataloader()):
                    X, y = batch_
                    batch = X.cuda()
                    y_pred = torch.argmax(model(batch), dim=1)
                    true_y.extend(y.cpu())
                    pred_y.extend(y_pred.cpu())
                correct = (torch.tensor(true_y) == torch.tensor(pred_y)).float().sum() / len(true_y)
                print("on dev: ", correct.item())
                if correct.item() > model.max_val_acc and correct.item() > 0.66:
                    torch.save(model.state_dict(), "model/rnn")
                    model.max_val_acc = correct.item()
                    print("model saved at ", correct.item())
        model.training_step_outputs.clear()

## Training the Emotion Classifier

In [11]:
hparams = Namespace(
    batch_size=batch_size,
    warmup_steps=100,
    epochs=100,
    lr=2e-4,
    accumulate_grad_batches=1,
    data_size=16000
)

In [12]:
## garbage collection
import gc; gc.collect()
torch.cuda.empty_cache()

In [14]:
model = RNNEmotionClassifier(hparams)
data_module = EmotionDataModule()
trainer = pl.Trainer(max_epochs=hparams.epochs, accumulate_grad_batches=hparams.accumulate_grad_batches, enable_progress_bar=True,
                     callbacks=[LossCallback()])

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/users1/zabereus/zabereus/corpora_env/lib64/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [18]:
trainer.fit(model, data_module)

/home/users1/zabereus/zabereus/corpora_env/lib64/python3.12/site-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


## Evaluation

In [12]:
from sklearn.metrics import confusion_matrix
def print_eval(dataloader, save_misclass=False):
    with torch.no_grad():
        progress = ["/", "-", "\\", "|", "/", "-", "\\", "|"]
        model = RNNEmotionClassifier(hparams)
        model.load_state_dict(torch.load("model/rnn"))
        model.eval()
        model.cuda()
        true_y, pred_y = [], []
        for i, batch_ in enumerate(dataloader):
            X, y = batch_
            batch = X.cuda()
            print(progress[i % len(progress)], end="\r")
            y_pred = torch.argmax(model(batch), dim=1)
            true_y.extend(y.cpu())
            pred_y.extend(y_pred.cpu())
    print("\n" + "_" * 80)
    print(classification_report(true_y, pred_y, target_names=label2int.keys(), digits=len(emotions)))
    print(confusion_matrix(true_y, pred_y, labels=label2int.keys()))

    if save_misclass:
        with open("misclass_rnn.txt", "w") as f:
            for i in range(len(y_pred)):
                if true_y[i] != pred_y[i]:  
                    line = "{} true label: {}, predicted label: {}".format(data_module.test_dataset[i][0], true_y[i], pred_y[i])
                    f.write(line)
        f.close()

In [None]:
print_eval(data_module.test_dataloader())

In [23]:
import re
dataset = load_dataset("go_emotions", "simplified")
class GoEmotionDataModule(pl.LightningDataModule):
    def __init__(self) -> None:
        super().__init__()
        self.setup()

    def setup(self, stage=None):
        data = load_dataset("go_emotions", "simplified")
        self.go_dataset = []
        # surprise: 26, sadness 25, joy 17, love 18, anger 2, fear 14
        mapping_dict = {25:0, 17:1, 18:2, 2:3, 14:4, 26:5}
        for ex in data['train']:#.extend(data['test']).extend(data['validation']):
            labels = set(ex['labels']).intersection(mapping_dict.keys())
            if labels:
                text = re.sub(r'[^\w\s]', '', ex['text'].lower()) # remove punctuation, make lowercase
                self.go_dataset.append((text, mapping_dict[list(labels)[0]]))
        print(f"Dataset loaded: {len(self.go_dataset)} entries")
        
    def go_dataloader(self):    
        return DataLoader(self.go_dataset, batch_size=batch_size, shuffle=True,
                    collate_fn=TokenizersCollateFn(), pin_memory=True, num_workers=4)

go_dm = GoEmotionDataModule()
    

print_eval(go_dm.go_dataloader())

Dataset loaded: 7894 entries
\
________________________________________________________________________________
              precision    recall  f1-score   support

     sadness   0.464012  0.457014  0.460486      1326
         joy   0.362676  0.715775  0.481421      1439
        love   0.739884  0.063777  0.117431      2007
       anger   0.360179  0.418454  0.387136      1539
        fear   0.192257  0.524150  0.281325       559
    surprise   0.410646  0.105469  0.167832      1024

    accuracy                       0.355840      7894
   macro avg   0.421609  0.380773  0.315939      7894
weighted avg   0.469269  0.355840  0.312133      7894

