In [35]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss, f1_score, classification_report
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from torch import nn
from datasets import Dataset

In [36]:
# Load the dataset from CSV
df = pd.read_csv('/content/go_emotions_dataset.csv')

In [38]:
# Define emotion columns (columns 3 to 30)
emotion_columns = df.columns[3:31]  # From 'admiration' to 'neutral'
print("\nEmotion labels:", list(emotion_columns))



Emotion labels: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [39]:
# Filter out very unclear examples if needed
df = df[~df['example_very_unclear']]

In [40]:
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [41]:
def compute_class_weights(df, emotion_columns):
    # Count positive samples per class
    positive_counts = df[emotion_columns].sum().values
    total_samples = len(df)

    # Calculate weights (inverse frequency)
    weights = (total_samples - positive_counts) / positive_counts
    return torch.tensor(weights, dtype=torch.float32)

class_weights = compute_class_weights(train_df, emotion_columns)
print("Class weights:", class_weights)

Class weights: tensor([ 11.0952,  21.3215,  24.7787,  14.3303,  10.8537,  33.9265,  27.3510,
         20.5637,  54.2694,  23.3830,  17.2338,  38.6481,  82.4590,  35.9211,
         63.9705,  16.7835, 309.8312,  25.0184,  24.6333, 113.9044,  22.6151,
        156.4340,  22.6843, 157.9825,  80.1769,  30.1764,  36.6668,   2.7529])


In [42]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [57]:
def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128
    )
    # Include emotion labels in the output
    tokenized_inputs['labels'] = [
        [float(label) for label in example]
        for example in zip(*[examples[col] for col in emotion_columns])
    ]
    return tokenized_inputs

In [58]:
# Convert to HuggingFace Dataset format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/145469 [00:00<?, ? examples/s]

Map:   0%|          | 0/31172 [00:00<?, ? examples/s]

Map:   0%|          | 0/31173 [00:00<?, ? examples/s]

In [59]:
# Model Setup
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(emotion_columns),
    problem_type="multi_label_classification"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = nn.BCEWithLogitsLoss(pos_weight=class_weights.to(model.device))
        loss = loss_fct(logits, labels.float())

        return (loss, outputs) if return_outputs else loss

In [61]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    load_best_model_at_end=True,
    logging_dir='./logs',
)



In [62]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()

    return {
        'hamming_loss': hamming_loss(labels, preds),
        'micro_f1': f1_score(labels, preds, average='micro'),
        'macro_f1': f1_score(labels, preds, average='macro'),
    }

In [63]:
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

In [64]:
trainer.train()

Epoch,Training Loss,Validation Loss,Hamming Loss,Micro F1,Macro F1,Runtime,Samples Per Second,Steps Per Second
1,0.848,0.846073,0.156501,0.298701,0.267878,201.4484,154.739,4.84
2,0.778,0.833923,0.161135,0.29157,0.26715,201.4274,154.756,4.84
3,0.6854,0.872866,0.14432,0.308367,0.276646,202.4838,153.948,4.815


TrainOutput(global_step=27276, training_loss=0.7895425387291741, metrics={'train_runtime': 10053.5323, 'train_samples_per_second': 43.408, 'train_steps_per_second': 2.713, 'total_flos': 2.8712577784679424e+16, 'train_loss': 0.7895425387291741, 'epoch': 3.0})

In [65]:
results = trainer.evaluate(tokenized_test)
print("Test Results:", results)

Test Results: {'eval_loss': 0.834087610244751, 'eval_hamming_loss': 0.1607423548767019, 'eval_micro_f1': 0.2921675957924476, 'eval_macro_f1': 0.2696471397311078, 'eval_runtime': 203.0289, 'eval_samples_per_second': 153.54, 'eval_steps_per_second': 4.802, 'epoch': 3.0}


In [66]:
# Save the model and tokenizer
model_save_path = "/content/emotion_model"

# Save model
model.save_pretrained(model_save_path)
# Save tokenizer
tokenizer.save_pretrained(model_save_path)

('/content/emotion_model/tokenizer_config.json',
 '/content/emotion_model/special_tokens_map.json',
 '/content/emotion_model/vocab.txt',
 '/content/emotion_model/added_tokens.json')

In [67]:
def predict_emotion(text, threshold=0.5):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.sigmoid(outputs.logits)
    preds = (probs > threshold).cpu().numpy().flatten()

    predicted_emotions = [emotion for emotion, pred in zip(emotion_columns, preds) if pred]
    return predicted_emotions, probs.cpu().numpy()

# Example
text = "I'm so excited and happy about this!"
emotions, probs = predict_emotion(text)
print("Predicted emotions:", emotions)
print("Probabilities:", probs)

Predicted emotions: ['excitement', 'joy']
Probabilities: [[0.42705435 0.27963305 0.04662031 0.05844232 0.300133   0.18249302
  0.0356713  0.18795799 0.2912654  0.04049273 0.02261861 0.02258206
  0.01713593 0.98679113 0.0411796  0.45091462 0.01296214 0.96880364
  0.21983965 0.0674279  0.33933038 0.22922274 0.21101178 0.23388527
  0.01286664 0.03073034 0.34624988 0.15313601]]
