In [1]:
!pip -q install datasets
!pip -q install transformers[torch]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput

from datasets import load_dataset
from sklearn import metrics

In [3]:
# Fetch the "subtask5.english" configuration of the SemEval-2018 Task 1 dataset.
data = load_dataset("sem_eval_2018_task_1", "subtask5.english")

# Label names are every feature after the first two
# (presumably the ID and Tweet columns -- verify against the dataset schema).
labels = list(data['test'].features)[2:]

# Cased BERT vocabulary, matching the checkpoint the model is later loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/605k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/291k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/81.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6838 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3259 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/886 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
class SemevalDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    The split is converted to a pandas DataFrame once and its ``ID`` column is
    dropped. The ``Tweet`` column supplies the text, and every column after the
    first one is read as a 0/1 label.

    Args:
        data: Object exposing ``to_pandas()`` (called once in ``__init__``).
        tokenizer: Callable used to encode the text. When ``None`` (default)
            the module-level ``tokenizer`` is looked up at item-access time,
            which is what this class did before the parameter existed.
        max_length: Sequence length every item is padded/truncated to.
    """

    def __init__(self, data, tokenizer=None, max_length=64):
        self.data_frame = data.to_pandas().drop(columns='ID')
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the split."""
        return len(self.data_frame)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        # Fetch the row once instead of indexing the frame twice.
        row = self.data_frame.iloc[idx]
        text = row['Tweet']
        # Everything after the first column is a label flag; cast to integers.
        hot_label = np.array(row.iloc[1:].to_list()).astype(int)

        # Late-bound fallback to the notebook-level tokenizer keeps old call sites working.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoded_input = encode(text, padding='max_length', truncation=True,
                               max_length=self.max_length, return_tensors='pt')

        # squeeze(0) drops the leading batch dimension so items can be stacked into batches.
        return {"input_ids": encoded_input['input_ids'].squeeze(0),
                "attention_mask": encoded_input['attention_mask'].squeeze(0),
                "labels": hot_label}

class BertForMultilabelSequenceClassification(BertForSequenceClassification):
    """BERT sequence classifier whose loss treats every label independently.

    The hidden state at position 0 of the encoder's first output goes through
    the inherited dropout and classifier layers. When ``labels`` are supplied,
    the loss is ``BCEWithLogitsLoss`` over ``num_labels`` outputs per example.
    Construction is inherited unchanged from the parent class.
    """

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None):
        """Run the encoder and classification head.

        Returns a ``SequenceClassifierOutput`` when ``return_dict`` resolves to
        true, otherwise a tuple ``(loss?, logits, *extra encoder outputs)``
        where ``loss`` is present only if ``labels`` was given.
        """
        # Fall back to the configuration default when the caller did not choose.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Position 0 of the token-level hidden states is used as the sentence representation.
        first_token_state = encoder_outputs[0][:, 0]
        logits = self.classifier(self.dropout(first_token_state))

        loss = None
        if labels is not None:
            flat_logits = logits.view(-1, self.num_labels)
            # The loss expects float targets, so integer 0/1 labels are cast here.
            flat_targets = labels.float().view(-1, self.num_labels)
            loss = torch.nn.BCEWithLogitsLoss()(flat_logits, flat_targets)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: logits followed by whatever the encoder returned after its first two outputs.
        tail = (logits,) + encoder_outputs[2:]
        return tail if loss is None else (loss,) + tail


In [5]:
# Wrap each split so items come out tokenized and with a multi-hot label vector.
train_dataset = SemevalDataset(data['train'])
test_dataset = SemevalDataset(data['test'])
eval_dataset = SemevalDataset(data['validation'])

# Only the training loader is shuffled. Validation and test keep dataset order so
# predictions line up with rows and repeated evaluations are directly comparable.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

# History buffers for a manual training loop (not filled by the cells shown here).
train_losses = []
eval_losses = []
train_accuracies = []
eval_accuracies = []

In [6]:
# Size the classification head from the dataset's label list instead of a
# hard-coded count, so the head cannot silently disagree with the label columns.
model = BertForMultilabelSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(labels))

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForMultilabelSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
batch_size = 64
# Log the training loss once per epoch. Ceiling division is required because the
# final, partially filled batch is still an optimizer step (6838 examples at
# batch size 64 give 107 steps per epoch); flooring logs one step early and the
# offset grows every epoch. Assumes a single device and no gradient accumulation.
logging_steps = -(-len(train_dataset) // batch_size)

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if the installed version rejects it.
args = TrainingArguments(
    output_dir="emotion",
    evaluation_strategy="epoch",
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=logging_steps,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# To continue an interrupted run from the latest checkpoint in `output_dir`,
# call `trainer.train(resume_from_checkpoint=True)` instead.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4134,0.331091
2,0.3113,0.311314
3,0.2767,0.302451
4,0.2573,0.302699
5,0.2461,0.302452


TrainOutput(global_step=535, training_loss=0.3003715316825938, metrics={'train_runtime': 440.3587, 'train_samples_per_second': 77.641, 'train_steps_per_second': 1.215, 'total_flos': 1124561738376960.0, 'train_loss': 0.3003715316825938, 'epoch': 5.0})

In [8]:
# Use the GPU when one is present, otherwise fall back to the CPU so the
# evaluation cells still run on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"
def calc_label_metrics(label, y_targets, y_preds, threshold):
    """Score the binary predictions of a single label.

    Args:
        label: Name stored under the ``"label"`` key of the result.
        y_targets: Ground-truth 0/1 values for this label.
        y_preds: Predicted 0/1 values for this label.
        threshold: Decision threshold that produced ``y_preds``; recorded only.

    Returns:
        dict with keys ``label``, ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``mcc``, ``support`` (sum of ``y_targets``) and ``threshold``.
    """
    report = {"label": label}
    report["accuracy"] = metrics.accuracy_score(y_targets, y_preds)

    # These three share the zero_division=0 policy: an undefined score becomes 0.
    zero_safe_scorers = (
        ("precision", metrics.precision_score),
        ("recall", metrics.recall_score),
        ("f1", metrics.f1_score),
    )
    for key, scorer in zero_safe_scorers:
        report[key] = scorer(y_targets, y_preds, zero_division=0)

    report["mcc"] = metrics.matthews_corrcoef(y_targets, y_preds)
    report["support"] = y_targets.sum()
    report["threshold"] = threshold
    return report


# Probability above which a label counts as predicted.
threshold = 0.5
y_probas_all = []
y_targets_all = []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only needed for scoring: they stay on the CPU and are not
        # passed to the model, so no loss is computed and then thrown away.
        outputs = model(input_ids, attention_mask=attention_mask)
        y_probas_all.extend(outputs.logits.sigmoid().cpu().numpy())
        y_targets_all.extend(batch['labels'].numpy())

y_preds_all = (np.array(y_probas_all) > threshold).astype(int)
y_targets_all = np.array(y_targets_all)
assert y_targets_all.shape[1] == len(labels), "label names do not match the label columns"

# One metrics record per label, in label order.
results = [
    calc_label_metrics(label, y_targets_all[:, label_index], y_preds_all[:, label_index], threshold)
    for label_index, label in enumerate(labels)
]

# Macro average = unweighted mean of the per-label scores. Accuracy and support
# are left empty because they are not averaged here.
macro_avg_results = {"label": "macro_avg", "accuracy": None}
for metric_name in ("precision", "recall", "f1", "mcc"):
    macro_avg_results[metric_name] = sum(row[metric_name] for row in results) / len(results)
macro_avg_results["support"] = None
macro_avg_results["threshold"] = threshold
results.append(macro_avg_results)

per_label_results = pd.DataFrame(results, index=[row["label"] for row in results])
display(per_label_results.drop(columns=["label"]).round(3))


Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
anger,0.855,0.797,0.765,0.781,0.673,1101.0,0.5
anticipation,0.871,0.517,0.108,0.179,0.192,425.0,0.5
disgust,0.832,0.756,0.743,0.75,0.624,1099.0,0.5
fear,0.927,0.819,0.654,0.727,0.691,485.0,0.5
joy,0.867,0.872,0.82,0.845,0.73,1442.0,0.5
love,0.886,0.68,0.535,0.599,0.539,516.0,0.5
optimism,0.814,0.737,0.732,0.735,0.592,1143.0,0.5
pessimism,0.891,0.566,0.216,0.313,0.303,375.0,0.5
sadness,0.834,0.759,0.64,0.694,0.585,960.0,0.5
surprise,0.949,0.692,0.053,0.098,0.182,170.0,0.5


In [9]:
import random

# NOTE(review): 0.1 is far below the 0.5 used for the metrics table, so this demo
# flags many more labels than the evaluated classifier would -- confirm intended.
threshold = 0.1

# Put the model in inference mode here as well, so this cell does not depend on
# the evaluation cell having run first (dropout would randomise the prediction).
model.eval()

# Randomly select a sample from the test dataset
sample_idx = random.randint(0, len(test_loader.dataset) - 1)

sample = test_loader.dataset[sample_idx]
sample_target = sample['labels']
# Move the sample to the same device as the model
sample_input_ids = sample['input_ids'].to(device)
sample_attention_mask = sample['attention_mask'].to(device)

# Get the model's prediction for this sample; unsqueeze(0) adds the batch
# dimension the model expects and squeeze(0) removes it again.
with torch.no_grad():
    sample_output = model(sample_input_ids.unsqueeze(0), attention_mask=sample_attention_mask.unsqueeze(0))
    sample_prediction = sample_output.logits.sigmoid().squeeze(0).cpu().numpy() > threshold

# Recover the text from the token ids, dropping special tokens
sample_text = tokenizer.decode(sample_input_ids, skip_special_tokens=True)

# Print the sample text, true labels, and predicted labels
print("Sample Text:\n", sample_text)
print("\nTrue Labels:", sample_target)
print("Predicted Labels:", sample_prediction.astype(int))

Sample Text:
 Difficulties are meant to rouse, not discourage. The human spirit is to grow strong by conflict. William Ellery Channing

True Labels: [0 0 0 0 0 0 1 0 1 0 1]
Predicted Labels: [1 1 0 1 1 0 1 1 1 0 1]
