In [None]:
!pip install transformers datasets scikit-learn

In [None]:
import inspect

import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import f1_score, hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Prefer CUDA when PyTorch can see a GPU; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

In [None]:
# Load GoEmotions and read the label names from the `labels` feature of the train split.
dataset = load_dataset("go_emotions")
label_list = dataset["train"].features["labels"].feature.names
num_labels = len(label_list)

# Fix the class order to 0..num_labels-1 so column i of the multi-hot vector is label id i.
mlb = MultiLabelBinarizer(classes=[*range(num_labels)])
mlb.fit(dataset["train"]["labels"])


def preprocess_labels(example):
    """Replace the label-id list in ``example`` with a float32 multi-hot vector.

    The example is modified in place and also returned.
    """
    multi_hot = mlb.transform([example["labels"]])
    example["labels"] = multi_hot[0].astype(np.float32)
    return example


# NOTE(review): the `labels` column already has a declared feature type; confirm the
# mapped values are actually stored as float32 and not cast back to that type.
dataset = dataset.map(preprocess_labels)

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    """Tokenize the ``text`` field, truncating and padding to a fixed length of 128."""
    encoded = tokenizer(
        example["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded


# batched=True hands the tokenizer a batch of texts per call instead of one example.
dataset = dataset.map(tokenize_function, batched=True)

In [None]:
def convert_to_torch(example):
    """Turn ``example["labels"]`` into a float32 torch tensor.

    The example is modified in place and the same object is returned.
    """
    float_labels = torch.tensor(example["labels"], dtype=torch.float32)
    example["labels"] = float_labels
    return example
# Apply the label conversion to every example, then expose only the model inputs and
# labels, formatted as torch tensors.
# NOTE(review): set_format(type="torch") presumably already yields tensors on access,
# which would make the map step redundant — confirm before removing it.
dataset = dataset.map(convert_to_torch)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
def compute_metrics(pred):
    """Score multi-label predictions from an evaluation pass.

    ``pred`` unpacks into ``(logits, labels)``. The logits go through a sigmoid and
    are thresholded at 0.5 to obtain binary predictions.

    Returns:
        A dict with the keys ``hamming_loss``, ``f1_micro`` and ``f1_macro``.
    """
    logits, labels = pred
    probabilities = torch.tensor(logits).sigmoid()
    binary_preds = probabilities.gt(0.5).int().numpy()

    scores = {"hamming_loss": hamming_loss(labels, binary_preds)}
    for averaging in ("micro", "macro"):
        scores[f"f1_{averaging}"] = f1_score(labels, binary_preds, average=averaging)
    return scores

In [None]:
# BERT sequence classifier with one output per label, configured for multi-label targets.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=num_labels,
)

In [None]:
def compute_loss(model, inputs, return_outputs=False):
    """Binary cross-entropy (with logits) loss for multi-label classification.

    Args:
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` attribute.
        inputs: Mapping with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
            Labels may be an integer or floating multi-hot tensor.
        return_outputs: When true, return ``(loss, outputs)`` instead of only the loss.

    Returns:
        The mean BCE-with-logits loss, or ``(loss, outputs)`` if ``return_outputs``.
    """
    outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
    logits = outputs.logits
    # BCEWithLogitsLoss needs targets in a floating dtype; integer multi-hot labels
    # would raise, so cast them to the logits' dtype (a no-op when they already match).
    targets = inputs["labels"].to(dtype=logits.dtype)
    loss = torch.nn.BCEWithLogitsLoss()(logits, targets)
    return (loss, outputs) if return_outputs else loss

In [None]:
# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases and the old name was later removed.
# Pick whichever keyword the installed version accepts so this cell runs on both.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    load_best_model_at_end=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    # Evaluate once per epoch, matching the per-epoch checkpointing above.
    **{_eval_strategy_kwarg: "epoch"},
)

In [None]:
class MultiLabelTrainer(Trainer):
    """Trainer whose loss is computed by the notebook's ``compute_loss`` function.

    ``Trainer.__init__`` has no ``compute_loss`` keyword; a custom loss is supplied
    by overriding the ``compute_loss`` method instead.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs extras (e.g. ``num_items_in_batch``) that newer
        # transformers versions pass to this hook. The bare name ``compute_loss``
        # below resolves to the module-level function, not this method.
        return compute_loss(model, inputs, return_outputs=return_outputs)


trainer = MultiLabelTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model, then evaluate on the trainer's eval dataset and print the
# resulting metrics dict. With load_best_model_at_end=True in the training arguments,
# the evaluated weights are presumably the best checkpoint rather than the last epoch.
trainer.train()
metrics = trainer.evaluate()
print("Evaluation Results:", metrics)