# Multi-Task Learning Notebook 

`I hope this is the last iteration`

We are going to import three datasets:
1. Children’s fairy tales (1000-100-100, ekman categorical)
2. Emobank (9000-300-300, vad regression)
3. SemEval-2018 (~7k - 1k - 3k, multi-label)

# Install & imports

In [1]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch 
import torch.nn as nn
import transformers
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import os
import math
import copy
import numpy as np 
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from transformers import AutoModel, BertTokenizerFast

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

# set manual seed 
np.random.seed(42)
torch.manual_seed(42)

cuda:0


<torch._C.Generator at 0x7fb244bb6ed0>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import dataset from github

In [4]:
!git clone https://github.com/LeonY117/EmotionAnalysis.git -q

# Read and process data

### Fairy Tale dataset & Emobank

In [5]:
# load the cleaned up dataset from github
CLEAN_DATA_DIR = "/content/EmotionAnalysis/data/clean/"
CHILDREN_filename = "children_highAgree.csv"
EMOBANK_filename = "emobank.csv"

df_children = pd.read_csv(os.path.join(CLEAN_DATA_DIR, CHILDREN_filename))
df_emobank = pd.read_csv(os.path.join(CLEAN_DATA_DIR, EMOBANK_filename))

print(f'Children story dataset: {len(df_children)}')
print(f'Emobank dataset: {len(df_emobank)}')
print(df_children.head())
print(df_emobank.head())

Children story dataset: 1207
Emobank dataset: 9814
   Unnamed: 0                                           sentence  label
0           0  He looked around on every side and exclaimed, ...      4
1           1  Then he got up and clambered out of the cave, ...      3
2           2                    "Alas, thou canst not help me."      3
3           3  They leapt nimbly upstairs and downstairs, and...      2
4           4  Then she opened the door of the small house, a...      2
                    id  split     V     A     D  \
0  110CYL068_1036_1079  train  3.00  3.00  3.20   
1  110CYL068_1079_1110   test  2.80  3.10  2.80   
2  110CYL068_1127_1130  train  3.00  3.00  3.00   
3  110CYL068_1137_1188  train  3.44  3.00  3.22   
4  110CYL068_1189_1328  train  3.55  3.27  3.46   

                                                text  
0        Remember what she said in my last letter? "  
1                          If I wasn't working here.  
2                                            

### Sem-Eval Dataset

Which already has train-val-test split

In [6]:
sem_train_filename = "SemEval2018_train.csv"
sem_val_filename = "SemEval2018_val.csv"
sem_test_filename = "SemEval2018_test.csv"

df_sem_train = pd.read_csv(os.path.join(CLEAN_DATA_DIR, sem_train_filename))
df_sem_val = pd.read_csv(os.path.join(CLEAN_DATA_DIR, sem_val_filename))
df_sem_test = pd.read_csv(os.path.join(CLEAN_DATA_DIR, sem_test_filename))

df_sem_train.head()

Unnamed: 0,text,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0


## Define Global Variables

In [13]:
EKMAN_EMOTIONS = ['anger-disgust', 'fear', 'happy', 'sad', 'surprise']
SEM_EMOTIONS = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
VAD_EMOTIONS = ['V', 'A', 'D']

# outputs heads (prediction heads)
NUM_CLASSES_EKMAN = len(EKMAN_EMOTIONS) # 5
NUM_CLASSES_SEM = len(SEM_EMOTIONS) # 11
NUM_CLASSES_VAD = len(VAD_EMOTIONS) # 3

OUT_DIMS = {
    'ekman': NUM_CLASSES_EKMAN, 'vad': NUM_CLASSES_VAD, 'sem': NUM_CLASSES_SEM
}

# label lengths (this is how many slots it takes to store the labels)
Y_DIM_EKMAN = 1
Y_DIM_VAD = NUM_CLASSES_VAD
Y_DIM_SEM = NUM_CLASSES_SEM

Y_DIMS = {
    'ekman': Y_DIM_EKMAN, 'vad': Y_DIM_VAD, 'sem': Y_DIM_SEM
}

## Load data from table to numpy array

In [8]:
# Load data into numpy
x_ekman_raw = list(df_children['sentence'])
y_ekman_raw = df_children['label'].to_numpy()

x_vad_raw = list(df_emobank['text'])
y_vad_raw = df_emobank[VAD_EMOTIONS].to_numpy()

x_sem_train_raw = list(df_sem_train['text'])
y_sem_train_raw = df_sem_train[SEM_EMOTIONS].to_numpy()

x_sem_val_raw = list(df_sem_val['text'])
y_sem_val_raw = df_sem_val[SEM_EMOTIONS].to_numpy()

x_sem_test_raw = list(df_sem_test['text'])
y_sem_test_raw = df_sem_test[SEM_EMOTIONS].to_numpy()

# Example:
print(x_ekman_raw[0])
print(y_ekman_raw[0])

print(x_vad_raw[0])
print(y_vad_raw[0])

print(x_sem_train_raw[0])
print(y_sem_train_raw[0])

He looked around on every side and exclaimed, "Oh, heavens, where am I?"
4
Remember what she said in my last letter? "
[3.  3.  3.2]
“Worry is a down payment on a problem you may never have'.  Joyce Meyer.  #motivation #leadership #worry
[0 1 0 0 0 0 1 0 0 0 1]


## Download Bert and tokenizer

In [9]:
# Load the BERT tokenizer
pretrained_checkpoint = 'bert-base-uncased' 

tokenizer = BertTokenizerFast.from_pretrained(pretrained_checkpoint)

# import BERT-base pretrained model
bert = AutoModel.from_pretrained(pretrained_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Move bert to GPU
for param in bert.parameters():
  param.requires_grad = False
bert.to(DEVICE)
print(f'moved bert to {DEVICE}')

moved bert to cuda:0


### Preprocess with Tokenizer

In [19]:
tokenized_xs = {}

encoded_input = tokenizer(x_ekman_raw, padding=True, truncation=True, return_tensors="pt")
X_ekman_input = encoded_input['input_ids']
X_ekman_mask = encoded_input['attention_mask']
tokenized_xs['ekman'] = [X_ekman_input, X_ekman_mask]

encoded_input = tokenizer(x_vad_raw, padding=True, truncation=True, return_tensors="pt")
X_vad_input = encoded_input['input_ids']
X_vad_mask = encoded_input['attention_mask']
tokenized_xs['vad'] = [X_vad_input, X_vad_mask]

encoded_input = tokenizer(x_sem_train_raw, padding=True, truncation=True, return_tensors="pt")
X_input = encoded_input['input_ids']
X_mask = encoded_input['attention_mask']
tokenized_xs['sem_train'] = [X_input, X_mask]

encoded_input = tokenizer(x_sem_val_raw, padding=True, truncation=True, return_tensors="pt")
X_input = encoded_input['input_ids']
X_mask = encoded_input['attention_mask']
tokenized_xs['sem_val'] = [X_input, X_mask]

encoded_input = tokenizer(x_sem_test_raw, padding=True, truncation=True, return_tensors="pt")
X_input = encoded_input['input_ids']
X_mask = encoded_input['attention_mask']
tokenized_xs['sem_test'] = [X_input, X_mask]

### Preprocess with BERT

In [None]:
BERT_OUT_SIZE = 768
preprocessed_xs = {}

for name, X in tokenized_xs.items():
  # allocate memory
  n = X[0].shape[0]
  preprocessed_xs[name] = torch.empty(size=(n, BERT_OUT_SIZE), dtype=torch.float)

  # use minibatch to process data
  b = 100
  num_batches = math.ceil(n // b)

  print(f'Preprocessing {name}...')
  for i in tqdm(range(num_batches)):
    x_ids = X[0][i*b: (i+1)*b].to(DEVICE)
    x_masks = X[1][i*b: (i+1)*b].to(DEVICE)

    output = bert(x_ids, attention_mask=x_masks)

    hidden_state = output['last_hidden_state']
    pooler_output = output['pooler_output']
    
    preprocessed_xs[name][i*b: (i+1)*b] = pooler_output

Preprocessing ekman...


  0%|          | 0/12 [00:00<?, ?it/s]

Preprocessing vad...


  0%|          | 0/98 [00:00<?, ?it/s]

Preprocessing sem_train...


  0%|          | 0/68 [00:00<?, ?it/s]

Preprocessing sem_val...


  0%|          | 0/8 [00:00<?, ?it/s]

Preprocessing sem_test...


  0%|          | 0/32 [00:00<?, ?it/s]

### Preprocess labels 
All labels need to be the same length, and we pad with 0s

In [None]:
preprocessed_ys = {}

# create placeholder tensors
ekman_zeros = torch.zeros((1, Y_DIMS['ekman']), )
vad_zeros = torch.zeros((1, Y_DIMS['vad']), )
sem_zeros = torch.zeros((1, Y_DIMS['sem']), )

# EKMAN
y = torch.tensor(y_ekman_raw, dtype=torch.float).unsqueeze(-1)
n = y.shape[0]
y = torch.concatenate((y, vad_zeros.repeat(n, 1), sem_zeros.repeat(n, 1)), dim=-1)
preprocessed_ys['ekman'] = y

# VAD
y = torch.tensor(y_vad_raw, dtype=torch.float)
n = y.shape[0]
y = torch.concatenate((ekman_zeros.repeat(n, 1), y, sem_zeros.repeat(n, 1)), dim=-1)

# normalize vad
y = F.normalize(y, dim=-1)
preprocessed_ys['vad'] = y

# sem
y = torch.tensor(y_sem_train_raw, dtype=torch.float)
n = y.shape[0]
y = torch.concatenate((ekman_zeros.repeat(n, 1), vad_zeros.repeat(n, 1), y), dim=-1)
preprocessed_ys['sem_train'] = y

y = torch.tensor(y_sem_val_raw, dtype=torch.float)
n = y.shape[0]
y = torch.concatenate((ekman_zeros.repeat(n, 1), vad_zeros.repeat(n, 1), y), dim=-1)
preprocessed_ys['sem_val'] = y

y = torch.tensor(y_sem_test_raw, dtype=torch.float)
n = y.shape[0]
y = torch.concatenate((ekman_zeros.repeat(n, 1), vad_zeros.repeat(n, 1), y), dim=-1)
preprocessed_ys['sem_test'] = y

In [None]:
# check that everything is alright so far:

for key in preprocessed_ys.keys():
  x = preprocessed_xs[key]
  y = preprocessed_ys[key]

  print(f'{key}: X: {x.shape}, y: {y.shape}')
  # print(f'Example: {x[0]}')
  print(f'label: {y[0]}')
  print('--------------------------------')