In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import pandas as pd

In [3]:
# Essay-level training split (tab-separated file), read directly from the project's GitHub repository.
DATA_PATH_TRAIN = "https://raw.githubusercontent.com/HLT-Ghisolfi-Leuzzi-Testa/WASSA-2023/main/datasets/WASSA23_essay_level_train_preproc.tsv"
df_train = pd.read_csv(
    DATA_PATH_TRAIN,
    sep="\t",
)
# Preview the first ten rows as the cell output.
df_train.head(n=10)

Unnamed: 0,conversation_id,article_id,essay,empathy,distress,speaker_id,gender,education,race,age,...,essay_id,emotion,Surprise,Hope,Neutral,Sadness,Joy,Fear,Anger,Disgust
0,2,35,It breaks my heart to see people living in tho...,6.833333,6.625,30,1,6,3,37,...,1,Hope/Sadness,0,1,0,1,0,0,0,0
1,3,35,I wonder why there aren't more people trying t...,5.833333,6.0,19,1,6,2,32,...,2,Anger,0,0,0,0,0,0,1,0
2,5,35,"After reading the article, you can't help but ...",1.0,1.375,17,1,6,1,29,...,4,Sadness,0,0,0,1,0,0,0,0
3,6,213,It is so sad that someone who had such an amaz...,6.166667,6.625,16,2,5,1,28,...,5,Sadness,0,0,0,1,0,0,0,0
4,8,213,"From reading the article, it looks like the wo...",6.833333,1.0,30,1,6,3,37,...,7,Neutral,0,0,1,0,0,0,0,0
5,10,213,That's sad. Regardless of what they find out ...,1.666667,1.125,49,1,5,1,31,...,9,Sadness,0,0,0,1,0,0,0,0
6,11,78,"After reading the article, my reaction is that...",1.5,1.0,17,1,6,1,29,...,10,Sadness,0,0,0,1,0,0,0,0
7,13,78,It sounds like these boys had a really rough l...,2.0,1.0,24,2,7,1,38,...,12,Sadness,0,0,0,1,0,0,0,0
8,14,78,This is a tragic and sad story about how some ...,6.0,3.0,43,2,6,1,33,...,13,Sadness,0,0,0,1,0,0,0,0
9,17,336,Hello. I feel really terrible about the curren...,7.0,1.0,31,unknown,unknown,unknown,unknown,...,16,Disgust/Sadness,0,0,0,1,0,0,0,1


In [4]:
# Essay-level development split (tab-separated file), read directly from the project's GitHub repository.
DATA_PATH_DEV = "https://raw.githubusercontent.com/HLT-Ghisolfi-Leuzzi-Testa/WASSA-2023/main/datasets/WASSA23_essay_level_dev_preproc.tsv"
df_dev = pd.read_csv(
    DATA_PATH_DEV,
    sep="\t",
)
# Preview the first ten rows as the cell output.
df_dev.head(n=10)

Unnamed: 0,conversation_id,article_id,essay,speaker_id,gender,education,race,age,income,speaker_number,...,iri_fantasy,iri_empathatic_concern,Sadness,Anger,Surprise,Neutral,Joy,Hope,Disgust,Fear
0,1,35,How sad is it that this kind of pain and suffe...,68,2,2,1,21,20000,1,...,3.143,3.286,1,0,0,0,0,0,0,0
1,4,35,The article is kind of tragic and hits close t...,79,1,6,3,33,64000,1,...,2.429,1.429,1,0,0,0,0,0,0,0
2,7,213,"I think that these kinds of stories, are sad, ...",68,2,2,1,21,20000,1,...,3.143,3.286,1,0,0,0,0,0,0,0
3,9,213,It's crazy that random accidents like this hap...,84,2,4,1,25,55000,1,...,3.571,3.143,0,0,0,1,0,0,0,0
4,12,78,This story makes me so so sad.... As someone w...,68,2,2,1,21,20000,1,...,3.143,3.286,1,0,0,0,0,0,0,0
5,15,78,"After reading the article, my first reaction a...",70,1,6,1,29,85000,1,...,4.143,4.643,1,0,0,0,0,0,0,0
6,16,336,I didn't know coal mining had such adverse eff...,81,1,4,1,30,27000,1,...,4.571,4.0,0,0,0,1,0,0,0,0
7,20,336,This is very sad. I can't imagine having elep...,73,2,7,1,38,42000,1,...,2.571,3.857,1,0,0,0,0,0,0,0
8,23,281,"Guys, reading this article really hits home fo...",63,1,4,1,25,29000,1,...,2.571,4.857,1,0,0,0,0,0,0,0
9,26,171,Hey guys. So I just read this article about Ir...,63,1,4,1,25,29000,1,...,2.571,4.857,0,0,0,1,0,0,0,0


In [5]:
# Emotion columns used as targets; this order defines the order of the label lists below.
emotions_list = ["Sadness", "Anger", "Surprise", "Neutral", "Joy", "Hope", "Disgust", "Fear"]

essays_train = df_train['essay'].to_list()
# Fix: the dev essays must be read from df_dev (they were previously copied from df_train,
# so "dev" texts were really training texts and did not line up with labels_dev).
essays_dev = df_dev['essay'].to_list()

# One list per emotion, in emotions_list order: labels_*[k][i] is the value of
# emotion column k for essay i (column-major layout, unchanged from before).
labels_train = [df_train[emo].to_list() for emo in emotions_list]
labels_dev = [df_dev[emo].to_list() for emo in emotions_list]

In [6]:
# Define your dataset class
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenized essays with their target vectors.

    Args:
        dataframe: pandas DataFrame with an ``essay`` column and one column per
            entry of ``target_labels``.
        tokenizer: callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns a
            mapping with ``input_ids``, ``attention_mask`` and ``token_type_ids``
            tensors.
        max_len: length every essay is padded / truncated to.
        target_labels: list of column names whose values form the target vector.
    """

    def __init__(self, dataframe, tokenizer, max_len, target_labels):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.essay = dataframe.essay
        # NumPy array of shape (n_rows, len(target_labels)); indexed positionally.
        self.targets = self.data[target_labels].values
        self.max_len = max_len

    def __len__(self):
        """Return the number of essays."""
        return len(self.essay)

    def __getitem__(self, index):
        """Return the encoded essay and its float target vector at position ``index``."""
        # Positional lookup: ``self.essay[index]`` is label-based and would raise
        # KeyError (or pair the essay with another row's targets) whenever the
        # frame does not carry a default 0..n-1 index, e.g. after filtering.
        essay = str(self.essay.iloc[index])
        # Collapse every run of whitespace into a single space.
        essay = " ".join(essay.split())

        # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
        inputs = self.tokenizer(
            essay,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten them so the DataLoader can stack samples into a batch.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

# Prepare your data
# Tunable settings, kept in one place.
# The tokenizer and the model MUST come from the same checkpoint: pairing the
# 'bert-base-cased' vocabulary with 'bert-base-uncased' weights feeds the model
# token ids that index the wrong embeddings.
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 200
BATCH_SIZE = 4
NUM_EPOCHS = 2
LEARNING_RATE = 2e-5
SEED = 42
OUTPUT_DIR = 'fine_tuned_bert_emotion_classifier'

# Seed the shuffling and the freshly initialised classification head so runs are repeatable.
torch.manual_seed(SEED)

# Truncation is requested per call inside CustomDataset, not when loading the tokenizer.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
emotions_list = ["Sadness", "Anger", "Surprise", "Neutral", "Joy", "Hope", "Disgust", "Fear"]
dataset = CustomDataset(df_train, tokenizer, MAX_LEN, emotions_list)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Load pretrained BERT model for sequence classification.
# Each essay can carry several emotions, so state the multi-label objective
# explicitly instead of relying on it being inferred from the label dtype.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(emotions_list),
    problem_type="multi_label_classification",
)

# Set up optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# keeps the default weight decay the previous optimizer used.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.0)
# Multiply the learning rate by 0.9 after every epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Fine-tune the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.train()
for epoch in range(NUM_EPOCHS):
    total_loss = 0.0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['targets'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # One scheduler step per epoch.
    scheduler.step()

    print(f'Epoch {epoch + 1}: Average Loss = {total_loss / len(dataloader)}')

# Save the fine-tuned model together with its tokenizer so both can be reloaded from one directory.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 225kB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [00:33<00:00, 13.0MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

KeyboardInterrupt: 