# WASSA2023

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Dependencies

In [None]:
# Base URL of the raw GitHub content of the project repository; later cells append
# `branch` and a relative file path to it to build download URLs.
repo_path = "https://raw.githubusercontent.com/HLT-Ghisolfi-Leuzzi-Testa/WASSA-2023/"
branch = "main"

In [None]:
# Install the third-party packages used by this notebook.
# `%pip` (instead of `!pip`) makes sure the packages land in the environment of the
# running kernel rather than in whatever `pip` the shell happens to resolve.
%pip install -q transformers datasets torch-summary bertviz NRCLex textblob
%pip install -q -U accelerate
# Download the corpora textblob needs at runtime.
!python -m textblob.download_corpora -q

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [None]:
# URLs of the two helper modules kept in the project repository.
utils_url = f"{repo_path}{branch}/utils.py"
evaluation_url = f"{repo_path}{branch}/evaluation.py"

import os
# Remove copies left over from a previous run before downloading them again.
if os.path.exists("utils.py"):
  !rm "utils.py"
if os.path.exists("evaluation.py"):
  !rm "evaluation.py"

# Fetch both modules into the current working directory so they can be imported below.
!wget {utils_url}
!wget {evaluation_url}

--2023-07-20 09:34:10--  https://raw.githubusercontent.com/HLT-Ghisolfi-Leuzzi-Testa/WASSA-2023/main/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40064 (39K) [text/plain]
Saving to: ‘utils.py’


2023-07-20 09:34:10 (11.5 MB/s) - ‘utils.py’ saved [40064/40064]

--2023-07-20 09:34:10--  https://raw.githubusercontent.com/HLT-Ghisolfi-Leuzzi-Testa/WASSA-2023/main/evaluation.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10675 (10K) [text/plain]
Saving to: ‘evaluation.py’


2023-07-20 09:34:10 (70.3 MB/s) - ‘evaluation.

In [None]:
# URLs of the JSON lexicon files for the test split (per-word values, judging by their names).
EMO_json_path_test = f"{repo_path}{branch}/datasets/EMO23_lexicon_per_word_test.json"
EMP_json_path_test = f"{repo_path}{branch}/datasets/EMP23_lexicon_per_word_test.json"

# `-O` pins the output file name, so re-running this cell overwrites the previous download
# instead of keeping the stale file and saving the new one next to it as `*.json.1`.
!wget -O EMO23_lexicon_per_word_test.json {EMO_json_path_test}
!wget -O EMP23_lexicon_per_word_test.json {EMP_json_path_test}

In [None]:
import importlib
import json
import random
import string
import sys
from typing import Optional, Union, Tuple

import numpy as np
import pandas as pd
import torch
from nrclex import NRCLex
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import Dataset
from transformers import (
	TrainingArguments,
	Trainer,
	TrainerCallback,
	EarlyStoppingCallback,
	AutoTokenizer,
	BertModel,
	BertPreTrainedModel,
	RobertaModel,
	RobertaPreTrainedModel,
	BertForSequenceClassification,
	RobertaForSequenceClassification
	)
from transformers.modeling_outputs import SequenceClassifierOutput

from utils import *
importlib.reload(sys.modules['utils'])

<module 'utils' from '/content/utils.py'>

In [None]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(
    "======= CUDA Available ======="
    if device.type == 'cuda'
    else "======= CUDA NOT Available, run on CPU ======="
)



## Set up

In [None]:
# Lexicon categories for which WASSADataset reads per-word values; the order of this
# list is the column order of the emotion part of the per-token feature matrix.
EMOTIONS_LEX = [
    'fear',
    'anger',
    'anticipation',
    'trust',
    'surprise',
    'positive',
    'negative',
    'sadness',
    'disgust',
    'joy',
    'hope'
]

# Emotion label names; WASSADataset also uses them as the words to replace with the
# tokenizer's mask token inside the prompt placed after the separator.
EMOTIONS_TO_PREDICT = [
    'sadness',
    'neutral',
    'fear',
    'anger',
    'surprise',
    'joy',
    'hope',
    'disgust'
]

# Placeholder; overwritten with the content of the model's config.json in a later cell.
config = {}

In [None]:
#@title Task type
# Task selector read by the cells below: "EMO" or "EMP".
TASK = "EMO" #@param ["EMO", "EMP"]

In [None]:
# Folder of the exported model: it must contain `config.json` and a `checkpoint` entry,
# both of which are referenced below. Fill in the path before running this cell.
# NOTE(review): the original cell only carried the note "zip unzip" - presumably the model
# archive has to be unpacked into this folder first; confirm.
best_model_path = ""
checkpoint_path = f"{best_model_path}/checkpoint"
with open(f"{best_model_path}/config.json") as config_file:
  config = json.load(config_file)

# Echo every setting that was stored together with the model.
print("\nCONFIGURATION")
for setting_name, setting_value in config.items():
  print(f"{setting_name}: {setting_value}")

## Dataset

### WASSA dataset

In [None]:
class WASSADataset(Dataset):
    """Dataset that turns one essay (plus optional prompts and features) into model inputs.

    Each item is a dict with `input_ids`, `attention_mask` and `token_type_ids`, and,
    when available, `labels`, `global_features` and `local_features`.

    Args:
        tokenizer: tokenizer used to encode the text; `char_to_token` is called on its
            output, so it has to return an encoding that offers that method.
        essay: indexable collection of essays.
        essay_id: indexable collection of essay ids, used as (stringified) lexicon keys.
        targets: indexable collection of targets, or None when no labels exist.
        prompt_before_SEP: per-essay prompt pieces appended to the essay text, or None.
        prompt_after_SEP: per-essay prompt pieces encoded as the second segment, or None.
        prompt_inlcusion_prob: probability with which the label words contained in the
            after-SEP prompt are replaced by the tokenizer's mask token.
        EMP_lexicon: mapping essay id -> {'empathy': [...], 'distress': [...]} with one
            value per word (needed when `local_empathy` / `local_distress` is set).
        EMO_lexicon: mapping essay id -> {emotion: [...]} with one value per word
            (needed when `local_emotions` is set).
        global_features: indexable collection of essay-level feature vectors, or None.
        local_emotions, local_empathy, local_distress: switches selecting which
            per-token lexicon features are produced.
        max_len: maximum encoded length handed to the tokenizer.
    """

    def __init__(
        self,
        tokenizer,
        essay,
        essay_id,
        targets,
        prompt_before_SEP=None,
        prompt_after_SEP=None,
        prompt_inlcusion_prob=0.5,
        EMP_lexicon = None,
        EMO_lexicon = None,
        global_features = None,
        local_emotions = False,
        local_empathy = False,
        local_distress = False,
        max_len=None
        ):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.essay = essay
        self.essay_id = essay_id
        self.targets = targets
        self.EMP_lexicon = EMP_lexicon
        self.EMO_lexicon = EMO_lexicon
        self.prompt_before_SEP = prompt_before_SEP
        self.prompt_after_SEP = prompt_after_SEP
        self.prompt_inlcusion_prob = prompt_inlcusion_prob
        self.global_features = global_features
        self.local_emotions = local_emotions
        self.local_empathy = local_empathy
        self.local_distress = local_distress

    def __len__(self):
        """Number of essays in the dataset."""
        return len(self.essay)

    def __getitem__(self, index):
        """Encode the essay at `index` and attach labels / extra features when present."""
        essay = str(self.essay[index])
        essay_id = self.essay_id[index]
        mask = self.tokenizer.convert_ids_to_tokens(self.tokenizer.mask_token_id)
        words_to_mask = EMOTIONS_TO_PREDICT + ['low', 'high', 'medium', 'medium-low', 'medium-high']

        prompt_before_SEP = ""
        if self.prompt_before_SEP is not None:
          prompt_before_SEP = "".join(" " + str(p) for p in self.prompt_before_SEP[index])

        prompt_after_SEP = ""
        # drawn unconditionally so the random stream does not depend on the prompt settings
        random_float = random.random()
        if self.prompt_after_SEP is not None:
          prompt_after_SEP = "".join(" " + str(p) for p in self.prompt_after_SEP[index])
          if random_float < self.prompt_inlcusion_prob:
            for word in words_to_mask:
              prompt_after_SEP = prompt_after_SEP.replace(word, mask)

        # First segment: the essay followed by the before-SEP prompt (if any).
        text = essay
        text_pair = None
        if prompt_before_SEP != "":
          text += str(prompt_before_SEP)
        if prompt_after_SEP != "":
          text_pair = str(prompt_after_SEP)

        inputs = self.tokenizer.encode_plus(
            # `text`, not `essay`: otherwise the before-SEP prompt built above is never encoded
            text=text,
            text_pair=text_pair,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
            return_token_type_ids=True
        )

        item = {
          'input_ids': inputs['input_ids'].flatten(),
          'attention_mask': inputs['attention_mask'].flatten(),
          'token_type_ids': inputs["token_type_ids"].flatten()
        }

        if self.targets is not None:
          item['labels'] = torch.FloatTensor(self.targets[index])

        if self.global_features is not None:
          item['global_features'] = self.global_features[index]

        # Default feature value of a token that carries no lexicon information:
        # 0 for every emotion, 4 for empathy, 0 for distress.
        features_tokens_row = []
        if self.local_emotions:
          features_tokens_row.extend([0] * len(EMOTIONS_LEX))
        if self.local_empathy:
          features_tokens_row.append(4)
        if self.local_distress:
          features_tokens_row.append(0)
        n_local_features = len(features_tokens_row)

        if n_local_features == 0:
          # no per-token features requested: nothing left to compute
          return item

        # One row per encoded token. Taking the length from the encoding itself keeps the
        # matrix aligned with `input_ids` also when `max_len` differs from the
        # tokenizer's model_max_length.
        # NOTE(review): the matrix inherits an integer dtype from the default row, so
        # fractional lexicon values would be truncated on assignment - confirm intended.
        n_tokens = inputs['input_ids'].shape[-1]
        features_tokens = np.full((n_tokens, n_local_features), features_tokens_row)

        # Walk the essay character by character. Characters that map to no token are
        # treated as word separators; `word_count` is the index of the current word in
        # the per-word lexicon lists.
        word_count=0
        first_char=True
        last_char_is_space=False
        for char_idx, char in enumerate(essay):
          token_idx = inputs.char_to_token(char_idx)
          if token_idx is None:
            if first_char: last_char_is_space=True
            if not last_char_is_space and not first_char:
              word_count+=1
              last_char_is_space=True
            continue
          elif last_char_is_space:
            last_char_is_space=False
          first_char=False

          # punctuation keeps the default feature values
          if char in string.punctuation:
            continue

          j = 0  # column offset of the next feature group
          if self.local_emotions:
            for i, emo in enumerate(EMOTIONS_LEX):
              features_tokens[token_idx][i] = self.EMO_lexicon[str(essay_id)][emo][word_count]
            j += len(EMOTIONS_LEX)

          if self.local_empathy:
            features_tokens[token_idx][j] = self.EMP_lexicon[str(essay_id)]['empathy'][word_count]
            j += 1

          if self.local_distress:
            features_tokens[token_idx][j] = self.EMP_lexicon[str(essay_id)]['distress'][word_count]

        item['local_features'] = torch.FloatTensor(features_tokens)

        return item

Read dataframes

In [None]:
# Load the preprocessed test essays straight from the repository.
TEST_DATA = f"{repo_path}{branch}/datasets/WASSA23_essay_level_test_preproc.tsv"
test_df = pd.read_csv(TEST_DATA, sep='\t')

# NOTE(review): train_df, val_df and dev_df are not created in any cell visible in this
# notebook; they have to exist before the statements below (and the cells that fit the
# label encoder / scaler on train_df) can run.
if TASK=="EMP":
  # empathy / distress levels are added first, then the prompt columns
  train_df = add_prompt_truth(add_emp_dist_levels(train_df), TASK, "3")
  val_df = add_prompt_truth(add_emp_dist_levels(val_df), TASK, "3")
  dev_df = add_prompt_truth(add_emp_dist_levels(dev_df), TASK, "3")
if TASK=="EMO":
  train_df = add_prompt_truth(train_df, TASK)
  val_df = add_prompt_truth(val_df, TASK)
  dev_df = add_prompt_truth(dev_df, TASK)

Encode targets

In [None]:
# EMO: encode the `emotion` column of every split with an encoder fitted on the training
# split; a model with 7 labels uses the "Neutral" encoder variant.
if TASK == "EMO":
  label_encoder = (
      EmotionsLabelEncoderNeutral if config.get('num_labels') == 7 else EmotionsLabelEncoder
  )()
  label_encoder.fit(train_df.emotion)
  y_train, y_val, y_dev = (
      label_encoder.encode(frame.emotion) for frame in (train_df, val_df, dev_df)
  )

# EMP: the targets are the empathy and distress columns, in that order.
if TASK == "EMP":
  y_train, y_val, y_dev = (
      np.array(frame[['empathy', 'distress']]) for frame in (train_df, val_df, dev_df)
  )



Extra global features

In [None]:
# Essay-level ("global") features of every split; they stay None when the configuration
# names no feature columns.
global_features_train = None
global_features_val = None
global_features_dev = None

# `or []` keeps the cell working when the key is absent from config.json
# (len(None) would raise a TypeError).
global_features_names = config.get('global_features_names') or []

if len(global_features_names) > 0:
  # the scaler is fitted on the training split only and reused for all other splits
  scaler = StandardScaler()
  scaler.fit(np.array(train_df[global_features_names]))

  def standard_scalar_features(features):
    """Standardise `features` with the scaler fitted on the training split."""
    return scaler.transform(features)

  global_features_train =  standard_scalar_features(np.array(train_df[global_features_names]))
  global_features_val =  standard_scalar_features(np.array(val_df[global_features_names]))
  global_features_dev =  standard_scalar_features(np.array(dev_df[global_features_names]))

  if TASK == "EMP" and config.get('gold_emotions'):
    # append the encoded gold emotions as additional feature columns
    label_encoder = EmotionsLabelEncoder()
    label_encoder.fit(train_df.emotion)
    gold_emotions_train = label_encoder.encode(train_df.emotion)
    gold_emotions_val = label_encoder.encode(val_df.emotion)
    gold_emotions_dev = label_encoder.encode(dev_df.emotion)
    global_features_train = np.concatenate((global_features_train, gold_emotions_train), axis = 1)
    global_features_val = np.concatenate((global_features_val, gold_emotions_val), axis = 1)
    global_features_dev = np.concatenate((global_features_dev, gold_emotions_dev), axis = 1)

Prompt

In [None]:
# Prompt columns of every split; an entry stays None when the configuration names no
# columns for that position.
prompt_before_SEP_train = None
prompt_after_SEP_train = None
prompt_before_SEP_val = None
prompt_after_SEP_val = None
prompt_before_SEP_dev = None
prompt_after_SEP_dev = None

# `or []` keeps the cell working when a key is absent from config.json
# (len(None) would raise a TypeError).
prompt_names_before_SEP = config.get('prompt_names_before_SEP') or []
prompt_names_after_SEP = config.get('prompt_names_after_SEP') or []

if len(prompt_names_before_SEP) > 0:
  prompt_before_SEP_train = np.array(train_df[prompt_names_before_SEP])
  prompt_before_SEP_val = np.array(val_df[prompt_names_before_SEP])
  prompt_before_SEP_dev = np.array(dev_df[prompt_names_before_SEP])

if len(prompt_names_after_SEP) > 0:
  prompt_after_SEP_train = np.array(train_df[prompt_names_after_SEP])
  prompt_after_SEP_val = np.array(val_df[prompt_names_after_SEP])
  prompt_after_SEP_dev = np.array(dev_df[prompt_names_after_SEP])

Lexicons

In [None]:
def _read_json(path):
  """Return the parsed content of the JSON file stored at `path`."""
  with open(path) as json_file:
    return json.load(json_file)

# Lexicon dictionaries per split; an entry stays None unless the matching local feature
# is enabled in the configuration.
EMO_lexicon_train_dict = None
EMP_lexicon_train_dict = None
EMO_lexicon_dev_dict = None
EMP_lexicon_dev_dict = None
EMO_lexicon_test_dict = None
EMP_lexicon_test_dict = None

# NOTE(review): only the two `*_test.json` files are downloaded by this notebook; the
# `*_per_word_.json` files opened below must already be present in /content - confirm
# their names and origin. The dev dictionaries are never filled in.
local_features_names = config.get('local_features_names')
if local_features_names is not None:
  if 'emotions' in local_features_names:
    EMO_lexicon_train_dict = _read_json("/content/EMO23_lexicon_per_word_.json")
    EMO_lexicon_test_dict = _read_json("/content/EMO23_lexicon_per_word_test.json")

  if 'empathy' in local_features_names or 'distress' in local_features_names:
    EMP_lexicon_train_dict = _read_json("/content/EMP23_lexicon_per_word_.json")
    EMP_lexicon_test_dict = _read_json("/content/EMP23_lexicon_per_word_test.json")

## Model

In [None]:
def get_loss_weights(y, method):
  """Compute one loss weight per label column of `y`.

  Args:
    y: 2-D array (samples x labels); the column sums are used as per-label sample counts.
    method: 'balanced' gives n_samples / (n_labels * count); any other value gives
      the inverse counts normalised so that they sum to 1.

  Returns:
    A float32 tensor with one weight per label, placed on the GPU when CUDA is
    available and on the CPU otherwise.
  """
  samples_per_label = np.sum(y, axis=0)
  if method == 'balanced':
    weights_train = y.shape[0] / (y.shape[1] * samples_per_label)
  else:
    inverse_n_samples = 1 / samples_per_label
    weights_train = inverse_n_samples / inverse_n_samples.sum()
  # torch.cuda.FloatTensor(...) fails on machines without CUDA; build the tensor on
  # whichever device is actually present instead.
  target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  return torch.tensor(weights_train, dtype=torch.float32, device=target_device)

In [None]:
# Per-label weights handed to the model's loss; None means an unweighted loss.
loss_weights_train = None
# The settings loaded from config.json live in `config` (no `model_config` is defined in
# the visible cells). A missing key is treated like the explicit 'None' setting.
weighted_loss = config.get('weighted_loss')
if weighted_loss is not None and weighted_loss != 'None':
  loss_weights_train = get_loss_weights(y_train, weighted_loss)

### Custom model

In [None]:
class ClassificationHead(nn.Module):
    """Sentence-level head: dense layer -> tanh -> dropout -> output projection.

    The input width is the sum of
      * `config.hidden_size` once when `mean_last_cls` is set, otherwise once per
        concatenated hidden layer (`hidden_layers_to_concat`),
      * `dim_extra_features`,
      * one more `config.hidden_size` when `local_features` is truthy.
    The output width is `config.num_labels`.
    """

    def __init__(self, config, dim_extra_features, hidden_layers_to_concat, classifier_dropout, local_features, mean_last_cls):
        super().__init__()
        self.local_features = local_features

        # averaged CLS vectors need a single slot, concatenated ones a slot per layer
        cls_slots = 1 if mean_last_cls else hidden_layers_to_concat
        total_dims = config.hidden_size * cls_slots + dim_extra_features
        if self.local_features:
          total_dims += config.hidden_size

        self.dense = nn.Linear(total_dims, total_dims)
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(total_dims, config.num_labels)

    def forward(self, features, **kwargs):
        """Map `features` (cast to float32) to `config.num_labels` scores."""
        hidden = torch.tanh(self.dense(features.to(torch.float32)))
        return self.out_proj(self.dropout(hidden))

# `model_class` (the parent class used below) is not assigned in any cell visible in this
# notebook; when it is missing, derive it from the class name stored in config.json.
if "model_class" not in globals():
    model_class = {
        "BertPreTrainedModel": BertPreTrainedModel,
        "RobertaPreTrainedModel": RobertaPreTrainedModel,
    }[config.get('model_class_string')]


class CustomSequenceClassification(model_class):
    """BERT / RoBERTa encoder followed by a ClassificationHead fed with extra features.

    The head input is built from
      * the first-token (CLS) vectors of the last `n_last_cls` hidden states, averaged
        when `mean_last_cls` is set and concatenated otherwise,
      * optionally the mean of the last-layer token vectors selected through the
        per-token `local_features`,
      * optionally the essay-level `global_features`.

    Args:
        config: model configuration (hidden_size, num_labels, problem_type, dropouts).
        dim_extra_features: extra input width reserved in the classification head.
        model_class: "BertPreTrainedModel" or "RobertaPreTrainedModel"; selects which
            encoder is created and called.
        local_features_names: names of the per-token features ('emotions', 'empathy',
            'distress'), or None when no local features are used.
        loss_weights: optional weight tensor handed to the classification losses.
        n_last_cls: number of final hidden states whose CLS vector is used.
        mean_last_cls: average (True) or concatenate (False) those CLS vectors.
        concat_local_features: append the raw local feature values to every token
            vector before pooling.
    """

    def __init__(self, config, dim_extra_features=0, model_class = None, local_features_names=None, loss_weights=None, n_last_cls = 1, mean_last_cls = False, concat_local_features = False):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config
        self.local_features_names = local_features_names
        self.loss_weights = loss_weights
        self.n_last_cls = n_last_cls
        self.mean_last_cls = mean_last_cls
        self.concat_local_features = concat_local_features
        self.model_class = model_class
        if self.model_class == "BertPreTrainedModel":
          self.bert = BertModel(config)

        if self.model_class == "RobertaPreTrainedModel":
          self.roberta = RobertaModel(config)

        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = ClassificationHead(config,
                                            dim_extra_features,
                                            n_last_cls,
                                            classifier_dropout,
                                            local_features_names is not None,
                                            mean_last_cls)
        self.post_init()


    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        local_features = None,
        global_features = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = True,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        """Run the encoder and the classification head; compute the loss when `labels` is given.

        `local_features` is indexed as (batch, tokens, features) and `global_features`
        is concatenated along the last axis of the pooled representation.

        Raises:
            ValueError: if the model was created with an unsupported `model_class`.
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.model_class == "BertPreTrainedModel":
          encoder = self.bert
        elif self.model_class == "RobertaPreTrainedModel":
          encoder = self.roberta
        else:
          # without this check the code below would fail on an unbound `outputs`
          raise ValueError(
              f"Unsupported model_class {self.model_class!r}: expected "
              "'BertPreTrainedModel' or 'RobertaPreTrainedModel'"
          )

        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # first-token vector of each of the last `n_last_cls` hidden states
        cls_tokens = []
        for i in range(1, self.n_last_cls + 1):
            cls_tokens.append(outputs.hidden_states[-1 * i][:, 0, :])
        if self.mean_last_cls:
          # average cls tokens
          output = torch.mean(torch.stack(cls_tokens), dim=0)
        else:
          # concat cls tokens
          output = torch.cat(cls_tokens, dim=1)

        output = self.dropout(output)

        if local_features is not None:
          tokens_output = outputs.last_hidden_state
          tokens_output = self.dropout(tokens_output)

          if self.concat_local_features:
            tokens_output = torch.cat((
                    tokens_output,
                    local_features.reshape(outputs.last_hidden_state.shape[0], outputs.last_hidden_state.shape[1], -1)),
                              dim=2)

          mask = torch.zeros_like(attention_mask)
          # unmask tokens with high or low empathy, high distress levels, or expressing at least one emotion
          j = 0
          if 'emotions' in self.local_features_names:
            # the emotion columns come first, one per entry of EMOTIONS_LEX (as laid out
            # by WASSADataset)
            n_emotions = len(EMOTIONS_LEX)
            emotion_values = local_features[:,:,:n_emotions]
            mask[emotion_values.sum(dim=-1)>=1] = 1.0
            j += n_emotions

          if 'empathy' in self.local_features_names:
            empathy_values = local_features[:,:,j]
            mask[(empathy_values>5) | ((empathy_values<3) & (empathy_values>=1))] = 1.0
            j += 1

          if 'distress' in self.local_features_names:
            distress_values = local_features[:,:,j]
            mask[distress_values>4] = 1.0

          # mean pooling of unmasked tokens
          input_mask_expanded = mask.unsqueeze(-1).expand(tokens_output.size()).float()
          sum_embeddings = torch.sum(tokens_output * input_mask_expanded, 1)
          sum_mask = input_mask_expanded.sum(1)
          # avoid a division by zero when no token was selected
          sum_mask = torch.clamp(sum_mask, min = 1e-9)
          tokens_output = sum_embeddings/sum_mask

          # concat pooled tokens lexically relevant with cls token
          output = torch.cat((output, tokens_output), dim=-1)


        if global_features is not None: # global
          output = torch.cat((output, global_features), dim=-1)

        logits = self.classifier(output)

        loss = None
        if labels is not None:
            # infer the problem type once, from the number of labels and the label dtype
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss(weight = self.loss_weights)
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss(weight = self.loss_weights)
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [None]:
# Load the tokenizer named in the saved configuration; the bare name on the last line
# makes the notebook display it.
tokenizer = AutoTokenizer.from_pretrained(config.get('tokenizer_name'), truncation=True)
tokenizer

RobertaTokenizerFast(name_or_path='SamLowe/roberta-base-go_emotions', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)

In [None]:
# Problem formulation used by the model's loss for each task.
if TASK == "EMO":
  problem_type = "multi_label_classification"
elif TASK == "EMP":
  problem_type = "regression"

# Everything the custom model needs, taken from the configuration saved with the checkpoint.
# NOTE(review): with ignore_mismatched_sizes=True, weights whose shape differs from the
# checkpoint are re-initialised instead of raising (see the warning in the cell output);
# for inference that would silently yield an untrained head - check the load warnings.
model_load_options = dict(
    problem_type = problem_type,
    classifier_dropout = config.get('dropout'),
    model_class = config.get('model_class_string'),
    num_labels = config.get('num_labels'),
    dim_extra_features = config.get('dim_extra_features'),
    local_features_names = config.get('local_features_names'),
    n_last_cls = config.get('n_last_cls'),
    mean_last_cls = config.get('mean_last_cls'),
    concat_local_features = config.get('concat_local_features'),
    loss_weights = loss_weights_train,
    ignore_mismatched_sizes = True,
)

model = CustomSequenceClassification.from_pretrained(checkpoint_path, **model_load_options)
model

Some weights of CustomSequenceClassification were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of CustomSequenceClassification were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([768, 768]) in the checkpoint and torch.Size([773, 773]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([768]) in the checkpoint and torch.Size([773]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([28, 768]) in the checkpoint and torch.Size([7, 773]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([28]) in the checkpoint and torch.Size([7]) in 

CustomSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

## Test

In [None]:
# NOTE(review): the original cell carried "#TODO: if??". For TASK == "EMP" the other
# splits go through add_emp_dist_levels(...) and add_prompt_truth(..., "3"); confirm
# whether the test frame needs a task-specific branch here as well.
test_df = add_prompt_truth(test_df, TASK)

# Gold-label features do not exist for the test split. Count the columns they would
# occupy so zero placeholders can keep the feature width unchanged.
# NOTE(review): 8 is presumably the width of the encoded gold emotions - confirm.
num_missing_features = 0
if config.get('gold_emotions'):
    num_missing_features += 8
if config.get('gold_empathy'):
    num_missing_features += 1
if config.get('gold_distress'):
    num_missing_features += 1

# Essay-level features, scaled with the scaler fitted on the training split.
# `standard_scalar_features` only exists when feature columns are configured, so it is
# only called in that case.
global_features_names = config.get('global_features_names') or []
global_features_test = None
if len(global_features_names) > 0:
    global_features_test = standard_scalar_features(np.array(test_df[global_features_names]))
    if num_missing_features > 0:
        # one placeholder row per test essay (the row count is not fixed to 100)
        placeholders = np.zeros((len(test_df), num_missing_features), int)
        global_features_test = np.concatenate((global_features_test, placeholders), axis=1)
# name used by the original version of this cell
global_features = global_features_test

# Prompt columns of the test split, built the same way as for the other splits; the
# dataset cell below reads these two names.
prompt_names_before_SEP = config.get('prompt_names_before_SEP') or []
prompt_names_after_SEP = config.get('prompt_names_after_SEP') or []
prompt_before_SEP_test = None
prompt_after_SEP_test = None
if len(prompt_names_before_SEP) > 0:
    prompt_before_SEP_test = np.array(test_df[prompt_names_before_SEP])
if len(prompt_names_after_SEP) > 0:
    prompt_after_SEP_test = np.array(test_df[prompt_names_after_SEP])

In [None]:
# Dataset over the test essays. No targets are available, and prompt_inlcusion_prob = 1
# means the label words inside the after-SEP prompt are always replaced by the mask token.
# NOTE(review): the local_* switches read 'emo_count_local' / 'empathy_count_local' /
# 'distress_count_local', while the model and the lexicon cell read
# 'local_features_names' - confirm both sets of keys exist in config.json.
test_set = WASSADataset(
    tokenizer = tokenizer,
    essay = test_df.essay,
    essay_id = test_df.essay_id,
    targets = None,
    prompt_before_SEP = prompt_before_SEP_test,
    prompt_after_SEP = prompt_after_SEP_test,
    prompt_inlcusion_prob = 1,
    EMP_lexicon = EMP_lexicon_test_dict,
    EMO_lexicon = EMO_lexicon_test_dict,
    global_features = global_features_test,
    local_emotions = config.get('emo_count_local'),
    local_empathy = config.get('empathy_count_local'),
    local_distress = config.get('distress_count_local'),
)

In [None]:
# Wrap the model in a Trainer with default settings and run it over the test set;
# the following cell reads `outs.predictions` and `outs.label_ids`.
trainer = Trainer(model=model)
outs = trainer.predict(test_set)

In [None]:
# The test set is built with targets=None, so predict() has no label ids to return;
# only decode gold labels when some are actually present.
has_gold_labels = outs.label_ids is not None

# outs.predictions is indexed with [0] to pick the logits out of the model outputs.
if TASK == "EMO":
  golds = label_encoder.decode(outs.label_ids) if has_gold_labels else None
  predictions = predict_emotions(outs.predictions[0], False)
if TASK == "EMP":
  golds = outs.label_ids
  predictions = outs.predictions[0]

In [None]:
# Write the predictions to a file named after the task, then hand that file to the
# browser for download (Colab only).
path_predictions = f"predictions_{TASK}.tsv"
write_predictions(predictions, path_predictions)

from google.colab import files
files.download(path_predictions)

## Inference

In [None]:
#@title Predict

text = 'WRITE HERE' #@param {type:"string"}

# Tokenize the text typed into the form and move the tensors to the device the model
# lives on (the model itself cannot be called on a raw string).
# NOTE(review): a model configured with global or local features also expects those
# inputs; with the text alone the classifier input width will not match - confirm which
# configurations this cell is meant to support.
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
inputs = inputs.to(model.device)

# Make the prediction (eval mode switches dropout off)
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# forward() returns a SequenceClassifierOutput; the scores are in `.logits`,
# shaped (1, num_labels) for the single input text.
logits = outputs.logits.cpu().numpy()
if TASK == "EMO":
  prediction = predict_emotions(logits, False)
if TASK == "EMP":
  prediction = logits

print(f"Predicted: {prediction}")