### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

In [2]:
df = pd.read_csv('../full_dataset/goemotions_1.csv')
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
df.shape

(70000, 37)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 37 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   text                  70000 non-null  object 
 1   id                    70000 non-null  object 
 2   author                70000 non-null  object 
 3   subreddit             70000 non-null  object 
 4   link_id               70000 non-null  object 
 5   parent_id             70000 non-null  object 
 6   created_utc           70000 non-null  float64
 7   rater_id              70000 non-null  int64  
 8   example_very_unclear  70000 non-null  bool   
 9   admiration            70000 non-null  int64  
 10  amusement             70000 non-null  int64  
 11  anger                 70000 non-null  int64  
 12  annoyance             70000 non-null  int64  
 13  approval              70000 non-null  int64  
 14  caring                70000 non-null  int64  
 15  confusion          

In [5]:
df.drop(['id', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear'], axis='columns', inplace=True)

### Preprocessing

In [6]:
# Define the mapping from detailed emotions to broad categories
emotion_mapping = {
    'admiration': 'positive', 'amusement': 'positive', 'approval': 'positive', 'caring': 'positive', 'desire': 'positive',
    'excitement': 'positive', 'gratitude': 'positive', 'joy': 'positive', 'love': 'positive', 'optimism': 'positive',
    'relief': 'positive', 'pride': 'positive', 'anger': 'negative', 'annoyance': 'negative', 'disapproval': 'negative',
    'disappointment': 'negative', 'disgust': 'negative', 'embarrassment': 'negative', 'fear': 'negative', 'grief': 'negative',
    'nervousness': 'negative', 'remorse': 'negative', 'sadness': 'negative', 'surprise': 'neutral', 'realization': 'neutral',
    'neutral': 'neutral', 'curiosity': 'neutral', 'confusion': 'neutral', 'other': 'other'
}

In [7]:
# Extract emotion columns
emotion_cols = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]

In [8]:
# Map detailed emotions to broad categories
df['broad_emotion'] = df[emotion_cols].idxmax(axis=1).map(emotion_mapping)

In [9]:
# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['broad_emotion'])

In [10]:
df

Unnamed: 0,text,author,subreddit,admiration,amusement,anger,annoyance,approval,caring,confusion,...,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,broad_emotion,label
0,That game hurt.,Brdd9,nrl,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,negative,0
1,>sexuality shouldn’t be a grouping category I...,TheGreen888,unpopularopinion,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,positive,2
2,"You do right, if you don't care then fuck 'em!",Labalool,confessions,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,neutral,1
3,Man I love reddit.,MrsRobertshaw,facepalm,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,positive,2
4,"[NAME] was nowhere near them, he was by the Fa...",American_Fascist713,starwarsspeculation,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,neutral,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,"It's about fucking time, hope this is real.",DudeImMacGyver,worldnews,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,negative,0
69996,This is great! Can anyone make a request with ...,Dirkus777,gay,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,positive,2
69997,I’m sorry. Can you please explain what are the...,menjav,DebateAnAtheist,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,neutral,1
69998,No but it should be,heputmystuffinjello,DunderMifflin,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,neutral,1


In [11]:
X = df['text']
y = df['label']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Define a dataset class for PyTorch
class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [14]:
# Initialize tokenizer and create datasets
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = EmotionDataset(X_train.to_numpy(), y_train.to_numpy(), tokenizer, max_len=128)
test_dataset = EmotionDataset(X_test.to_numpy(), y_test.to_numpy(), tokenizer, max_len=128)

## Model Training

In [15]:
from torch.optim import AdamW

class CustomTrainer(Trainer):
    def create_optimizer_and_scheduler(self, num_training_steps: int):
        self.optimizer = AdamW(self.model.parameters(), lr=self.args.learning_rate)
        self.lr_scheduler = super().create_scheduler(num_training_steps)

In [16]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1000,
)

In [None]:
# Initialize custom Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1))}
)

trainer.train()

***** Running training *****
  Num examples = 56000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 10500
  Number of trainable parameters = 109484547
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss
