In [1]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from datasets import load_dataset, DatasetDict, Dataset, DatasetInfo
from transformers import Trainer, TrainingArguments
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the GoEmotions dataset
dataset = load_dataset("go_emotions", "raw", split='train')
dataset = dataset.train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
        num_rows: 168980
    })
    test: Dataset({
        features: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surpri

In [None]:
emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 
                'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
num_labels = len(emotions)

In [3]:
# Tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels, problem_type="multi_label_classification")  # Adjust num_labels based on your task

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
non_useful_columns = ['id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear']

dataset_columns = dataset['train'].column_names
dataset_columns = filter(lambda x: x in emotions, dataset_columns)
dataset = dataset.remove_columns(non_useful_columns)
dataset = dataset.map(lambda x : {"labels": [x[c] for c in dataset_columns]})
dataset

Map: 100%|██████████| 168980/168980 [00:23<00:00, 7110.51 examples/s]
Map: 100%|██████████| 42245/42245 [00:05<00:00, 7191.02 examples/s]


In [5]:
def tokenize_and_encode(examples):
  return tokenizer(examples['text'], truncation=True)

dataset_columns = dataset["train"].column_names
dataset_columns.remove('labels')
encoded_dataset = dataset.map(tokenize_and_encode, batched=True, remove_columns=dataset_columns)
encoded_dataset

Map: 100%|██████████| 168980/168980 [00:11<00:00, 14277.04 examples/s]
Map: 100%|██████████| 42245/42245 [00:03<00:00, 12754.10 examples/s]


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 168980
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 42245
    })
})

In [6]:
encoded_dataset['train'][0]

{'labels': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_ids': [0, 100, 2813, 14, 58, 10, 631, 8103, 27, 11936, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
# cast label IDs to floats
encoded_dataset.set_format("torch")
encoded_dataset = encoded_dataset.map(lambda x : {"float_labels": torch.from_numpy(np.asarray(x["labels"])).to(torch.float)}, remove_columns=["labels"])
encoded_dataset = encoded_dataset.rename_column("float_labels", "labels")
encoded_dataset

Map: 100%|██████████| 168980/168980 [00:16<00:00, 10249.99 examples/s]
Map: 100%|██████████| 42245/42245 [00:03<00:00, 10915.49 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 168980
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 42245
    })
})

In [9]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./roberta-goemotions",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
)

In [13]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer
)

In [14]:
# Train the model
trainer.evaluate()

  0%|          | 0/63369 [02:18<?, ?it/s]


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['labels']

In [None]:
# Save the model
model.save_pretrained("./roberta-goemotions-trained")