In [23]:
%pip install scikit-multilearn datasets transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [24]:
import pandas as pd
import numpy as np
from skmultilearn.model_selection import iterative_train_test_split

In [None]:
import os
os.environ['HF_TOKEN'] = 'your_huggingface_token_here'  # replace with your actual token

In [26]:
from datasets import load_dataset

In [27]:
dataset = load_dataset("go_emotions")

In [28]:
import torch
from torch.utils.data import Dataset, DataLoader

In [29]:
def labels_to_multihot(label_indices, num_classes=28):
    vec = torch.zeros(num_classes)
    vec[label_indices] = 1
    return vec

# Example
print(labels_to_multihot(dataset['train'][0]["labels"]))

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])


In [30]:
class GoEmotionsDataset(Dataset):
    def __init__(self, hf_dataset, tokenizer, max_length):
        self.dataset = hf_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_labels = 28  # 27 emotions + neutral
        
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        text = item['text']

        labels = labels_to_multihot(item['labels'], num_classes=self.num_labels)

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': labels
        }

In [31]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [32]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [33]:
from transformers import AutoModelForSequenceClassification

num_labels = 28  # emotions + neutral
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification"
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [35]:
train_ds = GoEmotionsDataset(dataset['train'], tokenizer, max_length=128)
val_ds = GoEmotionsDataset(dataset['validation'], tokenizer, max_length=128)
test_ds = GoEmotionsDataset(dataset['test'], tokenizer, max_length=128)

In [36]:
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=True)

In [37]:
import torch.nn as nn
import torch

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [38]:
from tqdm import tqdm

def train_one_epoch(loader):
  model.train()
  total_loss = 0
  for batch in tqdm(loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    loss = criterion(logits, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
  return total_loss / len(loader)

def evaluate(loader):
  model.eval()
  preds, trues = [], []
  with torch.no_grad():
    for batch in tqdm(loader):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].cpu().numpy()

      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      logits = outputs.logits

      probs = torch.sigmoid(logits).cpu().numpy()
      preds.extend((probs > 0.5).astype(int))
      trues.extend(labels)
  return preds, trues

In [39]:
from sklearn.metrics import f1_score

In [40]:
for epoch in range(3):
  train_loss = train_one_epoch(train_loader)
  preds, trues = evaluate(val_loader)
  print("Micro F1 Score:", f1_score(trues, preds, average='micro'))
  print("Macro F1 Score:", f1_score(trues, preds, average='macro'))
  print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}")

100%|██████████| 1357/1357 [14:45<00:00,  1.53it/s]
100%|██████████| 170/170 [00:38<00:00,  4.44it/s]


Micro F1 Score: 0.4455227172474636
Macro F1 Score: 0.13532264290635268
Epoch 1, Train Loss: 0.1490


100%|██████████| 1357/1357 [14:44<00:00,  1.53it/s]
100%|██████████| 170/170 [00:38<00:00,  4.46it/s]


Micro F1 Score: 0.533508011690013
Macro F1 Score: 0.30650859181146606
Epoch 2, Train Loss: 0.0935


100%|██████████| 1357/1357 [14:44<00:00,  1.53it/s]
100%|██████████| 170/170 [00:38<00:00,  4.44it/s]

Micro F1 Score: 0.5758747697974217
Macro F1 Score: 0.39501192251785955
Epoch 3, Train Loss: 0.0783





In [41]:
from sklearn.metrics import f1_score, hamming_loss

preds, trues = evaluate(val_loader)
print("Micro F1 Score:", f1_score(trues, preds, average='micro'))
print("Macro F1 Score:", f1_score(trues, preds, average='macro'))
print("Hamming Loss: ", hamming_loss(trues, preds))

100%|██████████| 170/170 [00:38<00:00,  4.43it/s]

Micro F1 Score: 0.5758747697974217
Macro F1 Score: 0.39501192251785955
Hamming Loss:  0.03031699225949134





In [42]:
preds, trues = evaluate(test_loader)
print("Test Micro F1:", f1_score(trues, preds, average="micro"))
print("Test Macro F1:", f1_score(trues, preds, average="macro"))


100%|██████████| 170/170 [00:38<00:00,  4.43it/s]

Test Micro F1: 0.5850226788854948
Test Macro F1: 0.3988502921510907





In [43]:
model.save_pretrained("./bert-goemotions_model")
tokenizer.save_pretrained("./bert-goemotions_model")


('./bert-goemotions_model/tokenizer_config.json',
 './bert-goemotions_model/special_tokens_map.json',
 './bert-goemotions_model/vocab.txt',
 './bert-goemotions_model/added_tokens.json',
 './bert-goemotions_model/tokenizer.json')

In [44]:
import shutil

# Zip the folder
shutil.make_archive('/kaggle/working/goemotions_model', 'zip', '/kaggle/working/goemotions_model')
shutil.make_archive('/kaggle/working/bert-goemotions_model', 'zip', '/kaggle/working/bert-goemotions_model')

'/kaggle/working/bert-goemotions_model.zip'

In [47]:
!curl -F "file=@goemotions_model.zip" https://temp.sh/upload

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


https://temp.sh/ebWRh/goemotions_model.zip

In [48]:
!curl -F "file=@bert-goemotions_model.zip" https://temp.sh/upload


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


https://temp.sh/VBmmz/bert-goemotions_model.zip