In [None]:
!pip install datasets
!pip install transformers
!pip install sklearn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import numpy as np
import torch
from datasets import load_dataset, Features, Sequence, Value
from sklearn.metrics import classification_report
from transformers import BartTokenizer, BartForSequenceClassification, TrainingArguments, Trainer

# Seed torch before the model is built: the classification head is newly
# initialised rather than loaded from the checkpoint, so without a fixed seed
# its starting weights (and therefore the results) change on every run.
SEED = 42
torch.manual_seed(SEED)

# GoEmotions: every example carries a text and a list of emotion label ids.
dataset = load_dataset("go_emotions")

# Read the number of classes from the dataset schema instead of hard-coding it.
num_labels = dataset["train"].features["labels"].feature.num_classes

model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
# problem_type="multi_label_classification" lets several labels be active for
# one example; the targets must then be float multi-hot vectors.
model = BartForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification",
)
print(dataset)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})


In [None]:
def preprocess_function(examples, num_classes=28, max_length=50):
    """Tokenize a batch of texts and multi-hot encode its label lists.

    Args:
        examples: Batch mapping with a "text" entry (list of strings) and a
            "labels" entry (one list of integer class ids per example).
        num_classes: Width of the multi-hot label vector. Defaults to 28.
        max_length: Number of tokens every text is truncated/padded to.
            Defaults to 50.

    Returns:
        The tokenizer output with an added "labels" entry: a float32 array of
        shape (batch_size, num_classes) holding 1.0 at every assigned class id
        and 0.0 elsewhere.

    Raises:
        IndexError: If a class id does not fit into ``num_classes`` columns.
    """
    # Relies on the module-level `tokenizer` created earlier in the notebook.
    tokenized = tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=max_length
    )

    label_lists = examples["labels"]
    # float32 on purpose: the labels are stored as float vectors, not class ids.
    multi_hot = np.zeros((len(label_lists), num_classes), dtype=np.float32)
    for row, class_ids in enumerate(label_lists):
        # Fancy indexing switches on all of this example's classes at once.
        multi_hot[row, class_ids] = 1.0

    tokenized["labels"] = multi_hot
    return tokenized

# Run the preprocessing over every split in batched mode, so each call
# receives lists of texts/labels instead of a single example.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [None]:
# Target schema for the tokenized splits. The labels column is declared as a
# float32 sequence to match the multi-hot vectors written during preprocessing.
feature_spec = {
    "text": Value("string"),
    "labels": Sequence(Value("float32")),
    "id": Value("string"),
    "input_ids": Sequence(Value("int32")),
    "attention_mask": Sequence(Value("int32")),
}
new_features = Features(feature_spec)

# Re-type every split according to the schema above.
tokenized_dataset = tokenized_dataset.cast(features=new_features)

Casting the dataset:   0%|          | 0/43410 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5426 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [None]:
# Hyper-parameters for a single fine-tuning epoch.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    # Enable fp16 only when a CUDA device is present, so the cell also runs
    # on CPU-only machines instead of failing.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# Build the Trainer: train on the "train" split, evaluate on "validation".
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggers a FutureWarning on this call.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [None]:
# Fine-tune the model on the training split. The call returns a TrainOutput
# (global step, training loss, runtime metrics), which is displayed because it
# is the cell's last expression.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1305,0.094437


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


TrainOutput(global_step=679, training_loss=0.12205936280195773, metrics={'train_runtime': 335.745, 'train_samples_per_second': 129.295, 'train_steps_per_second': 2.022, 'total_flos': 4607500258404000.0, 'train_loss': 0.12205936280195773, 'epoch': 1.0})

In [None]:
# Run inference on the held-out test split. The result bundles the raw model
# outputs (`.predictions`) with the reference labels (`.label_ids`); the
# classification report itself is computed in a later cell.
predictions = trainer.predict(test_dataset=tokenized_dataset["test"])

In [None]:
# Quick look at the predicted label mask. `predictions.predictions[0]` holds raw
# logits, not probabilities, so a 0.5 probability cut-off corresponds to a logit
# of 0 (sigmoid(0) == 0.5). Comparing logits against 0.5 would silently raise
# the effective probability threshold to about 0.62.
print(predictions.predictions[0] > 0)

[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False  True]
 [False False False ... False False False]
 [False False False ... False False False]]


In [13]:
from scipy.special import expit # Import expit (sigmoid)
from sklearn.metrics import accuracy_score, classification_report

# Pull the raw scores and the reference labels out of the prediction result.
# NOTE: assumes the logits are the first element of `predictions.predictions`.
logits = predictions.predictions[0]
labels = predictions.label_ids

# Squash each logit through a sigmoid so every label gets its own probability.
probabilities = expit(logits)

# A label counts as predicted when its probability exceeds the cut-off.
threshold = 0.5
predicted_labels = np.greater(probabilities, threshold).astype(int)

# For multi-label inputs accuracy_score is exact-match (subset) accuracy:
# an example only counts as correct when all of its labels match.
test_accuracy = accuracy_score(y_true=labels, y_pred=predicted_labels)
print("Test Set Accuracy:", test_accuracy)

# Per-label precision/recall/F1; zero_division=0 reports 0 for labels that
# were never predicted.
report = classification_report(
    y_true=labels,
    y_pred=predicted_labels,
    zero_division=0,
    output_dict=False,
)
print("Detailed Classification Report:\n", report)

Test Set Accuracy: 0.37626681407775936
Detailed Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.63      0.67       504
           1       0.79      0.87      0.83       264
           2       0.67      0.18      0.29       198
           3       0.00      0.00      0.00       320
           4       0.79      0.17      0.28       351
           5       0.75      0.09      0.16       135
           6       0.55      0.07      0.13       153
           7       0.54      0.36      0.43       284
           8       0.71      0.14      0.24        83
           9       0.00      0.00      0.00       151
          10       0.60      0.08      0.14       267
          11       1.00      0.01      0.02       123
          12       0.00      0.00      0.00        37
          13       0.00      0.00      0.00       103
          14       0.70      0.41      0.52        78
          15       0.93      0.89      0.91       352
         