In [None]:
import numpy as np
from datasets import load_dataset, Features, Sequence, Value
from sklearn.metrics import classification_report, accuracy_score
from scipy.special import expit
from transformers import BartTokenizer, BartForSequenceClassification, TrainingArguments, Trainer

# Load the "go_emotions" dataset through the datasets library.
dataset = load_dataset("go_emotions")

# A single checkpoint id is used for both the tokenizer and the model.
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)

# Classification-head settings: 28 output labels, multi-label problem type.
classifier_options = {
    "num_labels": 28,
    "problem_type": "multi_label_classification",
}
model = BartForSequenceClassification.from_pretrained(model_name, **classifier_options)

# Show the dataset summary first, then the model architecture.
for loaded_object in (dataset, model):
    print(loaded_object)

In [None]:
# Tokenize and one-hot encode labels
def preprocess_function(examples, num_classes=28, max_length=50):
    """Tokenize a batch of texts and turn its label-id lists into multi-hot rows.

    Args:
        examples: Batch mapping as supplied by ``dataset.map(..., batched=True)``.
            Reads ``examples["text"]`` and ``examples["labels"]``, where each
            entry of ``labels`` is a collection of integer class ids.
        num_classes: Width of every multi-hot label vector. The default (28)
            matches the ``num_labels`` the model is created with.
        max_length: Token length every text is truncated/padded to.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a float32 array
        of shape ``(batch_size, num_classes)`` holding 1.0 at each listed id.

    Raises:
        IndexError: If a label id is not a valid index for ``num_classes``.
    """
    tokenized = tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=max_length
    )

    # Float labels are used because the model is set up for multi-label
    # classification; each row may contain several 1.0 entries.
    batch_size = len(examples["labels"])
    one_hot_labels = np.zeros((batch_size, num_classes), dtype=np.float32)
    for row, label_ids in enumerate(examples["labels"]):
        # Fancy indexing sets every listed class column of this row at once.
        one_hot_labels[row, label_ids] = 1.0

    tokenized["labels"] = one_hot_labels
    return tokenized

# Defaults are used here; other sizes can be supplied through map's fn_kwargs.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
def _sequence_of(dtype):
    """Return a Sequence feature whose elements have the given value dtype."""
    return Sequence(Value(dtype))


# Explicit schema for the tokenized splits: labels are float32 sequences,
# token ids and attention masks are int32 sequences, text and id stay strings.
new_features = Features(
    dict(
        text=Value("string"),
        labels=_sequence_of("float32"),
        id=Value("string"),
        input_ids=_sequence_of("int32"),
        attention_mask=_sequence_of("int32"),
    )
)

# Re-type the tokenized dataset to the schema defined above.
tokenized_dataset = tokenized_dataset.cast(new_features)

In [None]:
# Hyperparameters for exploring train/validation loss under different batch
# sizes, learning rates and epoch counts. Edit the values here and re-run.
training_config = dict(
    # Output locations and checkpoint retention
    output_dir="./results",
    logging_dir="./logs",
    save_total_limit=2,
    report_to="none",
    # Evaluate and log on the same 100-step cadence
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    # Optimisation settings
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    fp16=True,
)
training_args = TrainingArguments(**training_config)

# Build the Trainer on the train split, evaluating on the validation split.
train_split = tokenized_dataset["train"]
validation_split = tokenized_dataset["validation"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune the model with the Trainer built in the previous cell.
# The per-step log entries produced by this run are read back afterwards from
# trainer.state.log_history to draw the loss curves.
trainer.train()

In [None]:
import matplotlib.pyplot as plt

# Each entry of the log history is a dict. Entries containing "loss" are
# treated as training logs and entries containing "eval_loss" as evaluation
# logs; the "step" value is read from whichever entry supplied the loss.
log_history = trainer.state.log_history

train_losses = []
eval_losses = []
steps = []
eval_steps = []

for log in log_history:
    if "loss" in log:
        train_losses.append(log["loss"])
        steps.append(log["step"])
    if "eval_loss" in log:
        eval_losses.append(log["eval_loss"])
        # Keep the evaluation's own step so the validation curve stays aligned
        # even when the evaluation and logging intervals differ.
        eval_steps.append(log["step"])

# Plot loss curves
plt.figure(figsize=(10, 6))
plt.plot(steps, train_losses, label="Training Loss", marker="o")
if eval_losses:
    plt.plot(eval_steps, eval_losses, label="Validation Loss", marker="o")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Run the fine-tuned model on the test split. The returned object's
# `.predictions` and `.label_ids` are consumed by the next cell, which builds
# the detailed classification report.
predictions = trainer.predict(tokenized_dataset["test"])

In [None]:
# Extract logits and labels
# The prediction output may be a tuple whose first element is the logits, or
# a bare logits array. Indexing a bare array with [0] would silently pick the
# first sample's row, so only unpack when it really is a tuple/list.
raw_outputs = predictions.predictions
logits = raw_outputs[0] if isinstance(raw_outputs, (tuple, list)) else raw_outputs
# Labels were stored as float 0.0/1.0 vectors; cast to int so both arguments
# passed to the metrics share the same indicator dtype.
labels = predictions.label_ids.astype(int)

# Apply sigmoid to get independent per-label probabilities
probabilities = expit(logits)

# Apply threshold to probabilities for multilabel classification
threshold = 0.5
predicted_labels = (probabilities > threshold).astype(int)

# Generate the classification report
# With 2-D indicator inputs, accuracy_score is exact-match (subset) accuracy:
# a sample counts as correct only if every label matches.
print("Test Set Accuracy:", accuracy_score(labels, predicted_labels))
report = classification_report(labels, predicted_labels, output_dict=False, zero_division=0)
print("Detailed Classification Report:\n", report)