# NLP Project Demo 

**Title: Multi-task learning for Text-based Emotion Detection across disparate label spaces**

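### What's in this notebook

- Loads the cleaned test splits of the three datasets (Fairy Tale, EmoBank, SemEval 2018).
- Builds the multi-task model: a frozen BERT encoder followed by a shared layer and one prediction head per label space (Ekman classes, VAD scores, SemEval labels).
- Loads the pretrained predictor weights and evaluates them on all three test sets (F1, Pearson correlation, Jaccard).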

### How to run this notebook
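This notebook is self-contained and runs in a Google Colab environment: it installs its own dependencies and clones the repository that holds the data and the saved model. File paths assume Colab's `/content` directory.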


## Install & import

In [1]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m109.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch 
import torch.nn as nn
import transformers
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import ones_like, zeros_like

import os
import math
import copy
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm

from transformers import AutoModel, BertTokenizerFast

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
  DEVICE = torch.device("cuda:0")
else:
  DEVICE = torch.device("cpu")
print(DEVICE)

# Fix the numpy and torch RNG seeds so runs are repeatable.
np.random.seed(42)
torch.manual_seed(42)

cuda:0


<torch._C.Generator at 0x7fce8c0c60f0>

In [3]:
!git clone https://github.com/LeonY117/EmotionAnalysis.git -q

In [4]:
# Cleaned test splits shipped with the cloned github repository
CLEAN_DATA_DIR = "/content/EmotionAnalysis/data/clean/"
CHILDREN_filename = "children_test.csv"
EMOBANK_filename = "emobank_test.csv"
SEM_filename = "SemEval2018_test.csv"

# read each csv from the clean-data folder into its own dataframe
df_children, df_emobank, df_sem = (
    pd.read_csv(os.path.join(CLEAN_DATA_DIR, csv_name))
    for csv_name in (CHILDREN_filename, EMOBANK_filename, SEM_filename)
)

print(f'Fairy Tale: {len(df_children)}, EmoBank: {len(df_emobank)}, SemEval: {len(df_sem)}')

Fairy Tale: 122, EmoBank: 982, SemEval: 3259


### Define some global variables

In [5]:
# Label names of the three label spaces
EKMAN_EMOTIONS = ['anger-disgust', 'fear', 'happy', 'sad', 'surprise']
SEM_EMOTIONS = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
VAD_EMOTIONS = ['V', 'A', 'D']

# Width of each prediction head = number of labels in its label space (5 / 11 / 3)
NUM_CLASSES_EKMAN, NUM_CLASSES_SEM, NUM_CLASSES_VAD = (
    len(label_names) for label_names in (EKMAN_EMOTIONS, SEM_EMOTIONS, VAD_EMOTIONS)
)

OUT_DIMS = dict(ekman=NUM_CLASSES_EKMAN, vad=NUM_CLASSES_VAD, sem=NUM_CLASSES_SEM)

# Number of slots each ground-truth label occupies:
# ekman is stored as a single class index, vad and sem use one slot per label
Y_DIM_EKMAN = 1
Y_DIM_VAD = NUM_CLASSES_VAD
Y_DIM_SEM = NUM_CLASSES_SEM

Y_DIMS = dict(ekman=Y_DIM_EKMAN, vad=Y_DIM_VAD, sem=Y_DIM_SEM)

### Process dataframe into tensors

In [6]:
# Pull the text column and the label column(s) out of each dataframe
x_ekman = df_children['sentence'].tolist()
y_ekman = df_children['label']

x_vad = df_emobank['text'].tolist()
y_vad = df_emobank[VAD_EMOTIONS].to_numpy()

x_sem = df_sem['text'].tolist()
y_sem = df_sem[SEM_EMOTIONS].to_numpy()

# Example: first sentence and first label of every dataset
for sentences, labels in ((x_ekman, y_ekman), (x_vad, y_vad), (x_sem, y_sem)):
  print(sentences[0])
  print(labels[0])

"My sweet child, my golden treasure!" cried the mother, and she wept; but the Fire-drum sang, not out loud, but inwardly.
3
Here are four packages with our most popular titles for you at incredibly low prices.
[3.6 3.1 3.3]
@Adnan__786__ @AsYouNotWish Dont worry Indian army is on its ways to dispatch all Terrorists to Hell
[1 1 0 0 0 0 1 0 0 0 1]


In [7]:
# generate task labels
# One-hot task flag per sample, columns ordered [ekman, vad, sem]; the model heads
# and the metrics select their rows through these columns, so each dataset must
# switch on its own column.
task_ekman = torch.tensor([1, 0, 0]).unsqueeze(0).repeat((len(x_ekman), 1))
task_vad = torch.tensor([0, 1, 0]).unsqueeze(0).repeat((len(x_vad), 1))
task_sem = torch.tensor([0, 0, 1]).unsqueeze(0).repeat((len(x_sem), 1))

In [8]:
# preprocess ys
# Every label row shares one layout, [ekman | vad | sem]; the slots that belong
# to the other two tasks are filled with 0s so all rows have the same length.

# one-row zero templates, tiled to the dataset size below
ekman_zeros = torch.zeros((1, Y_DIMS['ekman']))
vad_zeros = torch.zeros((1, Y_DIMS['vad']))
sem_zeros = torch.zeros((1, Y_DIMS['sem']))

# EKMAN: the class index fills the first slot
y = torch.tensor(y_ekman, dtype=torch.float).unsqueeze(-1)
n = y.shape[0]
y_ekman = torch.cat([y, vad_zeros.repeat(n, 1), sem_zeros.repeat(n, 1)], dim=-1)

# VAD: the three scores sit after the ekman slot
y = torch.tensor(y_vad, dtype=torch.float)
n = y.shape[0]
y_vad = torch.cat([ekman_zeros.repeat(n, 1), y, sem_zeros.repeat(n, 1)], dim=-1)

# sem: the binary labels fill the trailing slots
y = torch.tensor(y_sem, dtype=torch.float)
n = y.shape[0]
y_sem = torch.cat([ekman_zeros.repeat(n, 1), vad_zeros.repeat(n, 1), y], dim=-1)

### Create dataset and dataloader

In [9]:
class Emotion_dataset(Dataset):
  '''Map-style dataset that serves (sentence, label, task flag) triples by index.'''

  def __init__(self, X, y, task):
    # the three containers are stored as given and indexed in parallel
    self.X, self.y, self.task = X, y, task

  def __len__(self):
    # the dataset size is the number of sentences
    return len(self.X)

  def __getitem__(self, idx):
    # same position taken from each of the three containers
    return self.X[idx], self.y[idx], self.task[idx]

In [10]:
# pool the three test sets into one combined split
x_all = x_ekman + x_vad + x_sem
y_all = torch.cat((y_ekman, y_vad, y_sem), dim=0)
task_all = torch.cat((task_ekman, task_vad, task_sem), dim=0)

# one dataset per task, plus the pooled 'all' split
datasets = {
    'ekman': Emotion_dataset(x_ekman, y_ekman, task_ekman),
    'vad': Emotion_dataset(x_vad, y_vad, task_vad),
    'sem': Emotion_dataset(x_sem, y_sem, task_sem),
    'all': Emotion_dataset(x_all, y_all, task_all),
}

# a shuffled loader with batches of 16 for every dataset, under the same key
dataloaders = {
    split_name: DataLoader(split_dataset, batch_size=16, shuffle=True)
    for split_name, split_dataset in datasets.items()
}

## Model Definition

### Download tokenizer & Bert

In [11]:
# Checkpoint shared by the tokenizer and the encoder
pretrained_checkpoint = 'bert-base-uncased'

# Load the BERT tokenizer
TOKENIZER = BertTokenizerFast.from_pretrained(pretrained_checkpoint)

# Load the pretrained BERT-base encoder and place it on DEVICE
BERT = AutoModel.from_pretrained(pretrained_checkpoint).to(DEVICE)

# Freeze bert: none of its parameters receive gradients
BERT.requires_grad_(False)
print(f'moved bert to {DEVICE}')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


moved bert to cuda:0


### Multihead 

`MultiheadNetwork` holds the shared base layer and one predictor head per task (Ekman, VAD, SemEval).

In [12]:
class MultiheadNetwork(nn.Module):
  '''
  Shared hidden layer followed by three task heads (ekman, vad, sem).

  Each head's raw output is multiplied by that task's flag before the head's
  activation is applied, and the three activated outputs are joined column-wise.
  '''
  def __init__(self, h_size=256, dropout=0):
    super().__init__()

    # 768-d input feature -> shared hidden representation
    self.shared_base = nn.Linear(768, h_size)

    # one linear head per label space
    self.ekman_predictor = nn.Linear(h_size, OUT_DIMS['ekman'])
    self.vad_predictor = nn.Linear(h_size, OUT_DIMS['vad'])
    self.sem_predictor = nn.Linear(h_size, OUT_DIMS['sem'])

    self.dropout = nn.Dropout(p=dropout, inplace=False)
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    # note: despite the attribute name this is a log-softmax over the class dimension
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, X, task):
    '''
    Args:
    -----
    X: (n, 768) input features
    task: (n, 3) task flags, columns ordered ekman / vad / sem

    Returns:
    -----
    y: (n, sum of OUT_DIMS) tensor laid out as [ekman | vad | sem]
    '''
    hidden = self.dropout(self.relu(self.shared_base(X)))

    # one (n, 1) gate per head, taken from the matching task column
    ekman_gate, vad_gate, sem_gate = (task[:, col].unsqueeze(-1) for col in range(3))

    ekman_out = self.softmax(ekman_gate * self.ekman_predictor(hidden))
    vad_out = self.relu(vad_gate * self.vad_predictor(hidden))
    sem_out = self.sigmoid(sem_gate * self.sem_predictor(hidden))

    return torch.cat((ekman_out, vad_out, sem_out), dim=1)

### Full model

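The full model wraps the preprocessing steps as well: it tokenizes the raw sentences, runs them through the frozen BERT encoder, and feeds the pooled output to the multi-head predictor.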

In [13]:
class MTL_network(nn.Module):
  '''
  End-to-end model: tokenizer -> BERT encoder -> task-gated predictor.

  The tokenizer and the encoder are taken from the notebook-level TOKENIZER and
  BERT objects; only the predictor is passed in.
  '''
  def __init__(self, predictor):
    super().__init__()
    self.tokenizer = TOKENIZER # should be a global variable
    self.bert = BERT # should be a global variable
    self.predictor = predictor
  
  def forward(self, sent, task):
    '''
    Args:
    -----
    sent: (n, ) array of sentences
    task: (n, 3) tensor of binary flags indicating if each task is on / off

    Returns:
    -----
    out: the predictor's output for the batch
    '''
    # tokenize sentence
    encoded_input = self.tokenizer(sent, padding=True, truncation=True, return_tensors="pt")

    # extract tokenized data and move to device
    X_input = encoded_input['input_ids'].to(DEVICE)
    X_mask = encoded_input['attention_mask'].to(DEVICE)

    # bert forward pass
    feature = self.bert(X_input, attention_mask=X_mask)['pooler_output']

    # the predictor multiplies the flags with values computed from `feature`,
    # so both must live on the same device
    task = task.to(feature.device)

    out = self.predictor(feature, task)
    
    return out

## Metrics

In [14]:
def compute_F1(y_pred, y_gt, mask, detailed=False, num_classes=None):
  '''
  Micro-averaged F1 over the rows selected by mask.

  Args
  -----
  y_pred: (n x C) per-class scores; the predicted class is the argmax of each row
  y_gt: (n x 1) ground-truth class index
  mask: (n x 1) 1 for rows to score, anything else for rows to ignore
  detailed: if True, return F1 for every class
  num_classes: number of classes to score; defaults to C, the width of y_pred

  Returns
  -----
  F1: micro-averaged F1 (0-dim tensor);
      if detailed, the tuple (F1, list of per-class F1 tensors)
  '''
  if num_classes is None:
    num_classes = y_pred.shape[-1]

  gt_class = y_gt.to(int).squeeze(dim=1)
  pred_class = torch.argmax(y_pred, dim=-1).to(int)
  active = mask.squeeze(dim=1).to(int) == 1

  F1s = []
  TPs, FPs, FNs = 0, 0, 0
  for c in range(num_classes):
    is_gt = gt_class == c
    is_pred = pred_class == c
    TP = (is_gt & is_pred & active).sum()
    FP = (is_pred & ~is_gt & active).sum()
    FN = (~is_pred & is_gt & active).sum()
    if detailed:
      # the small epsilon keeps classes with no support at 0 instead of NaN
      F1s.append(TP / (TP + 0.5 * (FP + FN) + 1e-16))
    TPs += TP
    FPs += FP
    FNs += FN

  # micro average: pool the counts over classes before forming the ratio
  F1 = TPs / (TPs + 0.5 * (FPs + FNs) + 1e-16)

  if detailed:
    return F1, F1s
  return F1

def compute_Jaccard(y_pred, y_gt, mask):
  '''
  Mean per-sample Jaccard index over the rows selected by mask.

  Args
  -----
  y_pred: (n x 11) scores, a label counts as predicted when its score is > 0.5
  y_gt: (n x 11) ground truth, a label counts as present when it equals 1
  mask: (n x 1) 1 for rows to score, 0 for rows to ignore

  Returns
  -----
  Jaccard_accuracy: float
  '''
  predicted_on = y_pred > 0.5
  truth_on = y_gt == 1

  # per-label overlap / coverage, zeroed on the masked-out rows
  overlap = (predicted_on & truth_on) * mask
  coverage = (predicted_on | truth_on) * mask

  # per-sample ratio; the epsilon guards rows whose union is empty
  per_sample = overlap.sum(dim=-1) / (coverage.sum(dim=-1) + 1e-16)

  # average over the number of active rows only
  n_active = mask.sum() + 1e-16
  return 1 / n_active * per_sample.sum()

  
def compute_corr(y_pred, y_gt, mask):
  '''
  Pearson correlation between prediction and ground truth, column by column,
  using only the rows selected by mask.

  Args
  -----
  y_pred: (n x 3)
  y_gt: (n x 3)
  mask: (n x 1)

  Returns
  -----
  Pearson Correlation Coefficient: list of 3 floats, one per column
  '''
  row_weight = mask.squeeze()
  n_active = mask.sum() + 1e-16

  # column means over the active rows
  pred_mean = (y_pred * mask).sum(dim=0) / n_active
  gt_mean = (y_gt * mask).sum(dim=0) / n_active

  rs = []
  for col in range(3):
    # centred values, with masked-out rows forced to 0 so they drop out of the sums
    pred_dev = (y_pred[:, col] - pred_mean[col]) * row_weight
    gt_dev = (y_gt[:, col] - gt_mean[col]) * row_weight

    covariance = (pred_dev * gt_dev).sum()
    norm = torch.sqrt((pred_dev * pred_dev).sum() * (gt_dev * gt_dev).sum()) + 1e-16
    rs.append((covariance / norm).item())

  return rs


class MultiTaskMetric(object):
  '''
  Callable that scores one stacked prediction tensor on all three tasks.

  Predictions are laid out as [ekman | vad | sem] with the widths in OUT_DIMS,
  ground truths use the same order with the widths in Y_DIMS.
  '''
  def __init__(self):
    pass

  def __call__(self, y_pred, y_gt, task):
    '''
    Args
    -----
    y_pred: (n x sum of OUT_DIMS) stacked head outputs
    y_gt: (n x sum of Y_DIMS) stacked, zero-padded labels
    task: (n x 3) task flags, columns ordered ekman / vad / sem

    Returns
    -----
    (F1 on ekman rows, list of per-column correlations on vad rows, Jaccard on sem rows)
    '''
    # F1: ekman columns come first in both tensors
    pred_lo, pred_hi = 0, OUT_DIMS['ekman']
    gt_lo, gt_hi = 0, Y_DIMS['ekman']
    F1 = compute_F1(y_pred[:, pred_lo:pred_hi], y_gt[:, gt_lo:gt_hi], task[:, 0:1])

    # Regression: the vad columns follow directly after the ekman ones
    pred_lo, pred_hi = pred_hi, pred_hi + OUT_DIMS['vad']
    gt_lo, gt_hi = gt_hi, gt_hi + Y_DIMS['vad']
    corr = compute_corr(y_pred[:, pred_lo:pred_hi], y_gt[:, gt_lo:gt_hi], task[:, 1:2])

    # Jaccard: the sem columns take the remaining slots
    pred_lo, pred_hi = pred_hi, pred_hi + OUT_DIMS['sem']
    gt_lo, gt_hi = gt_hi, gt_hi + Y_DIMS['sem']
    Jaccard = compute_Jaccard(y_pred[:, pred_lo:pred_hi], y_gt[:, gt_lo:gt_hi], task[:, 2:])

    return F1, corr, Jaccard



## Load pretrained predictor

In [16]:
MODEL_FOLDER = '/content/EmotionAnalysis/saved_models/'
MODEL_NAME = 'MHMTL.pt'

# Fail early with a message that shows what is actually in the folder
ckpt_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
if not os.path.isfile(ckpt_path):
  found = os.listdir(MODEL_FOLDER) if os.path.isdir(MODEL_FOLDER) else 'folder does not exist'
  raise FileNotFoundError(f'No checkpoint at {ckpt_path} (contents of {MODEL_FOLDER}: {found})')

predictor_net = MultiheadNetwork()
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source
ckpt = torch.load(ckpt_path, map_location=DEVICE)
# accept either a bare state_dict or a checkpoint dict that wraps it under 'model_state_dict'
state_dict = ckpt['model_state_dict'] if 'model_state_dict' in ckpt else ckpt
predictor_net.load_state_dict(state_dict)

# load_state_dict copies values into the existing parameters, so the model still has to be
# moved to DEVICE explicitly; eval() switches dropout off for inference
net = MTL_network(predictor_net).to(DEVICE).eval()

FileNotFoundError: ignored

## Quantitative Evaluation over all test sets

In [None]:
# # Helper function to turn y_pred into predictions

# def get_prediction(y_pred, task):
#   '''
#   Args:
#   ----------
#   y_pred: (n x 19) tensor 
#   task: (n x 3) tensor 

#   Returns:
#   ----------
#   out: (n x 15), 0: ekman class, 1 to 3: vad, 4 to 15: SEM labels
#   '''
#   n = y_pred.shape[0]
#   y_out = torch.zeros(n, sum(Y_DIMS.values()))

#   ekman_filter = task[:, 0]
#   ekman_pred = torch.argmax(y_pred[:, :OUT_DIMS['ekman']])
#   y_pred[:, :Y_DIMS['ekman']] = ekman_pred

In [None]:
metric = MultiTaskMetric()

# inference only: model on DEVICE, dropout switched off
net.to(DEVICE).eval()

n = len(dataloaders['all'].dataset)
y_preds = torch.empty(n, sum(OUT_DIMS.values()), dtype=torch.float) # n x 19
y_gts = torch.empty(n, sum(Y_DIMS.values()), dtype=torch.float) # n x 15
tasks = torch.empty(n, 3) # n x 3
i = 0
# no_grad: nothing is trained here, so no autograd graph needs to be kept per batch
with torch.no_grad():
  for X, y, task in tqdm(dataloaders['all']):
    b = len(X)
    # the task flags come off the loader on the CPU; the model runs on DEVICE
    y_pred = net(X, task.to(DEVICE))
    # results are collected on the CPU
    y_preds[i:i+b] = y_pred.cpu()
    y_gts[i:i+b] = y
    tasks[i:i+b] = task
    i += b

f1, r, jaccard = metric(y_preds, y_gts, tasks)

print(f'F1 = {f1:.4f}')
# r is a plain list of three floats, so average it with numpy
print(f'Correlation = V: {r[0]:.4f}, A: {r[1]:.4f}, D: {r[2]:.4f}, r: {np.mean(r):.4f}')
print(f'Jaccard = {jaccard:.4f}')