# Workflow Outline
This notebook outlines the workflow for fine-tuning an emotion detection model. It should be run in Google Colab with a GPU runtime.

## Install & Import modules

In [None]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch 
import torch.nn as nn
import transformers
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import os
import numpy as np 
import matplotlib.pyplot as plt
import tqdm.notebook as tqdm

from transformers import AutoModel, BertTokenizerFast

# Seed the RNGs so the random train/test split and the DataLoader shuffling are repeatable between runs
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Import data

In [None]:
!git clone https://github.com/LeonY117/EmotionAnalysis.git -q

## Set up data pipeline

In [None]:
# Read the cleaned-up CSV from the cloned GitHub repository
CLEAN_DATA_DIR = "/content/EmotionAnalysis/data/clean/"
EBAT_filename = "EmoBank_AffectiveText.csv"

ebat_csv_path = os.path.join(CLEAN_DATA_DIR, EBAT_filename)
df = pd.read_csv(ebat_csv_path)

# preview the first rows
df.head()

Unnamed: 0,id,split,V,A,D,text,anger,disgust,fear,joy,sadness,surprise
0,1,train,2.29,3.29,2.86,Mortar assault leaves at least 18 dead,22,2,60,0,64,0
1,10,train,3.5,2.88,3.0,Alonso would be happy to retire with three titles,0,0,0,61,24,0
2,100,train,2.88,3.0,3.0,Report criticises US press freedoms,25,24,6,21,13,13
3,1000,train,2.0,3.62,2.75,Terror officials see Al Qaeda chiefs regaining...,13,11,86,0,16,3
4,1001,train,2.8,3.0,3.0,"Ivrea journal: In Italian town, a civics lesso...",0,5,0,3,0,25


In [None]:
# Column names of the two labelling schemes
EKMAN_EMOTIONS = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
VAD_EMOTIONS = ['V', 'A', 'D']

EMOTION_NAMES = ['Ekman', 'VAD']
# number of label columns per scheme, in the same order as EMOTION_NAMES
EMOTION_DIMS = [len(EKMAN_EMOTIONS), len(VAD_EMOTIONS)]

# Pull the text into a Python list and both label groups into numpy arrays
x_text_raw = df['text'].tolist()
y_ekman_raw = df.loc[:, EKMAN_EMOTIONS].to_numpy()
y_vad_raw = df.loc[:, VAD_EMOTIONS].to_numpy()

# Example: first sample of each array, followed by the number of samples
for example in (x_text_raw[0], y_ekman_raw[0], y_vad_raw[0], len(x_text_raw)):
    print(example)

Mortar assault leaves at least 18 dead
[22  2 60  0 64  0]
[2.29 3.29 2.86]
1149


In [None]:
# Name of the pretrained checkpoint; the tokenizer and the AutoModel must both be loaded
# from this same name so that the vocabulary matches the weights
pretrained_checkpoint = 'bert-base-uncased'

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_checkpoint)

# padding=True pads every row to the longest sequence in this call;
# truncation=True caps each row at the model's maximum input length (512 tokens for bert-base-uncased)
encoded_input = tokenizer(
    x_text_raw,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# The keys we want are input_ids and attention_mask;
# token_type_ids is for tasks that take two sequences as input (e.g. Q&A)
print(encoded_input.keys())

# The width is the token length of the longest sentence in our data
print(encoded_input['input_ids'].shape)

# Example: the ids can be decoded back into text
print(tokenizer.decode(encoded_input["input_ids"][0]))

X_input, X_mask = encoded_input['input_ids'], encoded_input['attention_mask']

# float copies of the raw label arrays
y_ekman_tensor, y_vad_tensor = (
    torch.tensor(raw_labels, dtype=torch.float) for raw_labels in (y_ekman_raw, y_vad_raw)
)

# optional: delete df and x_text if the dataset is large

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
torch.Size([1149, 21])
[CLS] mortar assault leaves at least 18 dead [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [None]:
# Normalizing y (subject to change)
# Both targets are rebuilt from the raw label arrays, so re-running this cell always gives the
# same result (subtracting from y_vad_tensor in place of itself would shift it again on every run).

# Ekman scores: scale each row to unit L2 norm
y_ekman_tensor = F.normalize(torch.tensor(y_ekman_raw, dtype=torch.float), dim=-1)

# VAD scores: shift by 3 (presumably the midpoint of the rating scale -- TODO confirm)
VAD_OFFSET = 3
y_vad_tensor = torch.tensor(y_vad_raw, dtype=torch.float) - VAD_OFFSET
# TODO: VAD could additionally be rescaled into a fixed range if needed

# Combined target: Ekman columns first, then the VAD columns
y = torch.concat((y_ekman_tensor, y_vad_tensor), dim=-1)


### Train-val-test split

* split the data set 80-20 into train and test
* split the training set 50-50 between the two labelling schemes (Ekman and VAD)


In [None]:
def train_val_split(X_ids, X_mask, y, ratio, generator=None):
  """Randomly split three row-aligned tensors into a train part and a validation part.

  Args:
    X_ids: tensor whose first dimension indexes the samples.
    X_mask: tensor with the same number of rows as X_ids.
    y: tensor with the same number of rows as X_ids.
    ratio: fraction of the rows (between 0 and 1) that goes into the train part;
      the train size is int(ratio * n).
    generator: optional torch.Generator used to draw the permutation. Pass a seeded
      generator to get a reproducible split; None uses the global torch RNG.

  Returns:
    (X_ids_train, X_mask_train, y_train, X_ids_val, X_mask_val, y_val)

  Raises:
    ValueError: if the inputs do not have the same number of rows, or ratio is outside [0, 1].
  """
  n = X_ids.shape[0]

  if X_mask.shape[0] != n or y.shape[0] != n:
    raise ValueError(
        f"X_ids, X_mask and y must have the same number of rows, got {n}, {X_mask.shape[0]}, {y.shape[0]}")
  if not 0.0 <= ratio <= 1.0:
    raise ValueError(f"ratio must be between 0 and 1, got {ratio}")

  # one shared permutation keeps the rows of the three tensors aligned
  perm = torch.randperm(n, generator=generator)

  train_size = int(ratio * n)
  train_idx, val_idx = perm[:train_size], perm[train_size:]

  return (X_ids[train_idx], X_mask[train_idx], y[train_idx],
          X_ids[val_idx], X_mask[val_idx], y[val_idx])

In [None]:
# hold out 20% of all samples as the test set
(X_ids_train, X_mask_train, y_train,
 X_ids_test, X_mask_test, y_test) = train_val_split(X_input, X_mask, y, ratio=0.8)

# halve the training set into one part per task (redundant, but it makes the workflow clearer)
(X_ids_ekman, X_mask_ekman, y_ekman,
 X_ids_vad, X_mask_vad, y_vad) = train_val_split(X_ids_train, X_mask_train, y_train, ratio=0.5)

# blank out the labels of the other task: columns 0-5 hold Ekman, columns 6 onwards hold VAD
y_ekman[:, 6:] = 0
y_vad[:, :6] = 0

In [None]:
# attach task labels to the two tasks: one [ekman, vad] flag pair per sample
task_ekman = torch.tensor([1, 0]).unsqueeze(0).repeat((len(X_ids_ekman), 1))
task_vad = torch.tensor([0, 1]).unsqueeze(0).repeat((len(X_ids_vad), 1))
# test samples carry both flags
task_test = torch.tensor([1, 1]).unsqueeze(0).repeat((len(X_ids_test), 1))

# merge ekman and vad back into one train set (ekman rows first, then vad rows)
X_ids_train = torch.concat((X_ids_ekman, X_ids_vad), dim=0)
X_mask_train = torch.concat((X_mask_ekman, X_mask_vad), dim=0)
y_train = torch.concat((y_ekman, y_vad), dim=0)
task_train = torch.concat((task_ekman, task_vad), dim=0)

# Example
# (named example_idx rather than `id` so the builtin id() is not shadowed;
#  clamped so the lookup also works on a smaller training set)
example_idx = min(600, len(y_train) - 1)
# print(X_ids_train[example_idx])
# print(X_mask_train[example_idx])
print(y_train[example_idx])
print(task_train[example_idx])

tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1000, 0.0000, 0.1000])
tensor([0, 1])


### Dataset

In [None]:
class EBAT_dataset(Dataset):
    """Dataset over row-aligned token ids, attention masks, labels and task flags.

    Indexing returns the tuple (X_ids[idx], X_mask[idx], y[idx], task[idx]),
    passed through `transform` when one is given.
    """

    def __init__(self, X_ids, X_mask, y, task, transform=None):
        self.X_ids = X_ids
        self.X_mask = X_mask
        # label rows, e.g. [0.4, 0.4, 0.2, 0, 0, 0, 0, 0, 0] or [0, 0, 0, 0.1, 0.4, 0.1, 0.2, 0.1, 0.1]
        self.y = y
        # task flag rows, e.g. [1, 0] or [0, 1]
        self.task = task
        self.transform = transform

    def __len__(self):
        # the number of samples is the first dimension of the id tensor
        n_samples = self.X_ids.shape[0]
        return n_samples

    def __getitem__(self, idx):
        fields = (self.X_ids, self.X_mask, self.y, self.task)
        sample = tuple(field[idx] for field in fields)
        if not self.transform:
            return sample
        return self.transform(sample)


In [None]:
# Wrap the train and test tensors in Dataset objects
train_dataset = EBAT_dataset(X_ids=X_ids_train, X_mask=X_mask_train, y=y_train, task=task_train)
test_dataset = EBAT_dataset(X_ids=X_ids_test, X_mask=X_mask_test, y=y_test, task=task_test)

# Sanity check: the two set sizes should add up to the total
print(f'total data = {len(X_input)}')
print(f'train set size = {len(train_dataset)}')
print(f'test set size = {len(test_dataset)}')

total data = 1149
train set size = 919
test set size = 230


### DataLoader

In [None]:
BATCH_SIZE = 10

# Only the training loader is shuffled; the test loader keeps a fixed order
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f'number of batches in training set: {len(train_dataloader)}')
print(f'number of batches in testing set: {len(test_dataloader)}')

# example batch
# The batch is unpacked into *_batch names so that the full-dataset globals X_mask and y
# are not overwritten (the split cell needs them when it is re-run).
sample = next(iter(train_dataloader))
X_id_batch, X_mask_batch, y_batch, task_batch = sample

# example single data
print(X_id_batch[0])
# print(X_mask_batch[0])
print(y_batch[0])
print(task_batch[0])

number of batches in training set: 92
number of batches in testing set: 23
tensor([  101,  6646,  3282,  2386,  5175,  5034, 15878,  7389,  5555,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0])
tensor([0.2526, 0.3248, 0.3608, 0.5413, 0.4330, 0.4691, 0.0000, 0.0000, 0.0000])
tensor([1, 0])


## Model Definition

In [55]:
class MultiheadNetwork(nn.Module):
  """Frozen base encoder with two linear heads, one for Ekman and one for VAD.

  Args:
    baseModel: module called as baseModel(x_id, attention_mask=x_mask) whose output
      exposes 'pooler_output'. All of its parameters are frozen.
    ekman_dim: output size of the Ekman head; defaults to EMOTION_DIMS[0].
    vad_dim: output size of the VAD head; defaults to EMOTION_DIMS[1].
  """

  def __init__(self, baseModel, ekman_dim=None, vad_dim=None):
    super().__init__()
    self.baseModel = baseModel

    # freeze all the parameters in baseModel
    for param in self.baseModel.parameters():
      param.requires_grad = False

    # head sizes fall back to the notebook-level EMOTION_DIMS
    if ekman_dim is None:
      ekman_dim = EMOTION_DIMS[0]
    if vad_dim is None:
      vad_dim = EMOTION_DIMS[1]

    # take the encoder width from its config when it has one; 768 is the previous hard-coded value
    hidden_size = getattr(getattr(baseModel, "config", None), "hidden_size", 768)

    self.ekman_predictor = nn.Linear(hidden_size, ekman_dim)
    self.vad_predictor = nn.Linear(hidden_size, vad_dim)

    self.relu = nn.ReLU()

  def forward(self, x_id, x_mask, task):
    """Predict the concatenated (Ekman, VAD) outputs for one batch.

    Args:
      x_id: token ids passed to the base model.
      x_mask: attention mask passed to the base model.
      task: (batch, 2) flags; column 0 enables the Ekman head and column 1 the VAD head.

    Returns:
      Tensor of shape (batch, ekman_dim + vad_dim). Outputs of a head whose flag is 0
      are exactly zero for that row.
    """
    # pass through base model first
    output = self.baseModel(x_id, attention_mask=x_mask)
    pooler_output = output['pooler_output']

    # The mask is applied to the head OUTPUT. Masking the head input instead would still
    # let the bias through, so a disabled head would not produce zeros.
    task = task.to(pooler_output.dtype)
    ekman_mask = task[:, 0].unsqueeze(-1)
    vad_mask = task[:, 1].unsqueeze(-1)

    # Ekman outputs are kept non-negative
    y_ekman = self.relu(self.ekman_predictor(pooler_output)) * ekman_mask

    # No ReLU on VAD: the VAD targets are shifted by -3 during preprocessing and can be
    # negative, which a ReLU output could never produce.
    y_vad = self.vad_predictor(pooler_output) * vad_mask

    return torch.concat((y_ekman, y_vad), dim=1)

In [32]:
# Load the pretrained BERT-base encoder from the checkpoint name stored in pretrained_checkpoint.
# AutoModel loads the bare encoder; the warning printed below lists the checkpoint's
# pre-training head weights (cls.*), which are not used by it.
bert = AutoModel.from_pretrained(pretrained_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [56]:
# Wrap the pretrained encoder in the two-head network
net = MultiheadNetwork(baseModel=bert)

In [57]:
# Element count of every parameter tensor that is still trainable
trainable_sizes = [param.numel() for param in net.parameters() if param.requires_grad]
print(trainable_sizes)

[4608, 6, 2304, 3]


In [61]:
# Smoke test: push one training batch through the network without tracking gradients.
# The batch and the prediction get their own names so that the full-dataset globals
# X_mask and y are not overwritten.
with torch.no_grad():
  sample = next(iter(train_dataloader))
  X_id_batch, X_mask_batch, y_batch, task_batch = sample

  y_pred = net(X_id_batch, X_mask_batch, task_batch)

## Training Loop

### Loss Function

In [None]:
# MSE

# criterion = 

### Optimizer