# Classifying the go_emotions dataset using a transformer as feature extractor: extracting the last hidden state of the [CLS] token

In [None]:
# !pip install -qq datasets transformers torch

In [1]:
from pathlib import Path

from datasets import load_dataset
from huggingface_hub import notebook_login
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoModel,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
)


In [2]:
# Load the "simplified" configuration of the go_emotions dataset
# (train / validation / test splits, see the DatasetDict shown below).
emotions = load_dataset("go_emotions", "simplified")


Reusing dataset go_emotions (/Users/alexanderjunge/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Rename the original label-id column to `labels_` so that the name `labels`
# is free for the binarized targets that tokenize_binarize adds later.
emotions = emotions.rename_column("labels", "labels_")

In [4]:
# Display the splits with their column names and row counts.
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'labels_', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels_', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels_', 'id'],
        num_rows: 5427
    })
})

In [5]:
# Checkpoint shared by the tokenizer, the feature-extraction encoder and,
# further down, the fine-tuned classifier.
model_ckpt = "distilbert-base-uncased"
# `problem_type` is a model-config option, not a tokenizer option, so it is
# no longer passed to the tokenizer (it is set on the classifier instead).
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Use a CUDA GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare encoder without a classification head; its hidden states serve as features.
model = AutoModel.from_pretrained(model_ckpt).to(device)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Feature object describing the individual label ids stored in `labels_`.
labels_feature = emotions["train"].features["labels_"].feature
# Converters between label ids and label names.
int2str, str2int = labels_feature.int2str, labels_feature.str2int
# Number of distinct labels; used as the width of the binarized target vectors.
num_classes = labels_feature.num_classes

In [7]:
def tokenize_binarize(batch):
    """Tokenize a batch of texts and attach multi-hot label vectors.

    Args:
        batch: Mapping with a "text" column (list of strings) and a
            "labels_" column (one list of integer label ids per text).

    Returns:
        The tokenizer output for the texts (padded to a common length and
        truncated), extended with a "labels" entry that holds one float
        vector of length ``num_classes`` per text: 1.0 at every label id
        listed for that text, 0.0 elsewhere.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)

    # Start from all-zero rows, then switch on the positions of the active labels.
    multi_hot = [[0.0] * num_classes for _ in texts]
    for row_idx, active_ids in enumerate(batch["labels_"]):
        for class_id in active_ids:
            multi_hot[row_idx][class_id] = 1.0

    encoded["labels"] = multi_hot
    return encoded


# Apply tokenize_binarize to every split. With batched=True and batch_size=None
# each split is handed to the function as one single batch, so `padding=True`
# pads all examples of a split to the same length (see the padded example below).
emotions_encoded = emotions.map(tokenize_binarize, batched=True, batch_size=None)


Loading cached processed dataset at /Users/alexanderjunge/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-140b5826227c6a3d.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /Users/alexanderjunge/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-063a22561c98f4af.arrow


In [8]:
# Vocabulary id of the tokenizer's [CLS] token (the first id of every encoded text).
tokenizer.cls_token_id


101

In [9]:
# Names of the tokenizer outputs that are fed to the model;
# extract_hidden_states uses this list to select the model inputs.
tokenizer.model_input_names


['input_ids', 'attention_mask']

In [10]:
# Maximum sequence length (in tokens) configured for this tokenizer.
tokenizer.model_max_length


512

In [11]:
# Size of the tokenizer's vocabulary.
tokenizer.vocab_size


30522

In [12]:
# Inspect one encoded training example (index 4): the original columns plus
# the tokenizer outputs, with input_ids zero-padded to the common length.
emotions_encoded["train"][4]

{'text': 'Dirty Southern Wankers',
 'labels_': [3],
 'id': 'ed0bdzj',
 'input_ids': [101,
  6530,
  2670,
  14071,
  11451,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [13]:
# Return the model inputs and both label columns as torch tensors;
# columns not listed here are no longer returned when indexing.
emotions_encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels", "labels_"],
)


In [14]:
# Same example after set_format: only the selected columns come back, as torch tensors.
emotions_encoded["train"][4]


{'labels_': tensor([3]),
 'input_ids': tensor([  101,  6530,  2670, 14071, 11451,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0

In [39]:
def extract_hidden_states(batch):
    """Run the encoder on a batch and return the vector at the first token.

    Args:
        batch: Mapping from column name to tensor; only the columns named in
            ``tokenizer.model_input_names`` are forwarded to the model.

    Returns:
        dict with a single key "hidden_state": a NumPy array holding, for each
        example, the last-layer hidden state at sequence position 0 ([CLS]).
    """
    wanted = tokenizer.model_input_names
    model_inputs = {}
    for name, tensor in batch.items():
        if name in wanted:
            # Inputs must live on the same device as the model.
            model_inputs[name] = tensor.to(device)

    # Pure inference: no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = model(**model_inputs)

    cls_vectors = outputs.last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}


In [12]:
# Cache file for the extracted features, so the expensive encoder forward
# passes only have to run when the file does not exist yet.
hidden_output_path = Path("go_emotions_hidden.npz")
if hidden_output_path.exists():
    print(f"Loading hidden states from {hidden_output_path}")
    # SECURITY NOTE: allow_pickle=True is needed because the label arrays are
    # stored with dtype=object, but unpickling can execute arbitrary code.
    # Only load a cache file that was written by this notebook.
    # The context manager closes the underlying archive once the arrays are read.
    with np.load(str(hidden_output_path), allow_pickle=True) as loaded:
        X_train = loaded["X_train"]
        X_valid = loaded["X_valid"]
        X_test = loaded["X_test"]
        y_train = loaded["y_train"]
        y_valid = loaded["y_valid"]
        y_test = loaded["y_test"]
else:
    print("Extracting hidden states")
    emotions_hidden = emotions_encoded.map(
        extract_hidden_states, batched=True, batch_size=100
    )

    print(emotions_hidden["train"].column_names)

    # Feature matrices: one [CLS] hidden-state vector per example.
    X_train = np.array(emotions_hidden["train"]["hidden_state"])
    X_valid = np.array(emotions_hidden["validation"]["hidden_state"])
    X_test = np.array(emotions_hidden["test"]["hidden_state"])
    print(X_train.shape, X_valid.shape, X_test.shape)

    # Label-id lists differ in length per example, hence dtype=object.
    y_train = np.array(emotions_hidden["train"]["labels_"], dtype=object)
    y_valid = np.array(emotions_hidden["validation"]["labels_"], dtype=object)
    y_test = np.array(emotions_hidden["test"]["labels_"], dtype=object)
    print(y_train.shape, y_valid.shape, y_test.shape)

    # Write to the same path that the existence check and the loader use,
    # instead of repeating the file name as a second literal.
    np.savez_compressed(
        str(hidden_output_path),
        X_train=X_train,
        X_valid=X_valid,
        X_test=X_test,
        y_train=y_train,
        y_valid=y_valid,
        y_test=y_test,
    )


Loading hidden states from go_emotions_hidden.npz


## Shallow classifiers

In [13]:
# Learn the label set from the training labels only, then turn the label-id
# lists of every split into binary indicator matrices with the same columns.
mlb = MultiLabelBinarizer()
mlb.fit([ids.tolist() for ids in y_train])
y_train_mlb, y_valid_mlb, y_test_mlb = (
    mlb.transform([ids.tolist() for ids in split])
    for split in (y_train, y_valid, y_test)
)


In [15]:
# Baseline that ignores the features and predicts the most frequent target
# seen during training; gives a floor to compare the real classifiers against.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train_mlb)


In [16]:
# Validation score of the baseline. For multi-label targets this is subset
# accuracy: a sample only counts as correct if all of its labels match.
dummy_clf.score(X_valid, y_valid_mlb)


0.0

In [21]:
# Logistic regression on the frozen [CLS] features. MultiOutputClassifier
# fits one independent copy of the estimator per label column.
lr_clf = LogisticRegression(max_iter=3000)
lr_mo_clf = MultiOutputClassifier(lr_clf)
lr_mo_clf.fit(X_train, y_train_mlb)


In [22]:
# Validation subset accuracy (every label of a sample must be predicted correctly).
lr_mo_clf.score(X_valid, y_valid_mlb)


0.29745669001105784

In [19]:
# Small neural network trained on the frozen [CLS] features; it accepts the
# binary indicator matrix directly as a multi-label target.
# The original `max_iter=00` evaluates to 0, which scikit-learn rejects
# (max_iter must be positive). 300 is an assumed replacement -- adjust it if
# the recorded score was produced with a different iteration budget.
mlp_clf = MLPClassifier(random_state=1, max_iter=300)
mlp_clf.fit(X_train, y_train_mlb)




In [20]:
# Validation subset accuracy of the MLP (all labels of a sample must match).
mlp_clf.score(X_valid, y_valid_mlb)


0.3092517508293402

## Fine-tuning the transformer

In [15]:
# Load the checkpoint again, this time with a newly initialised multi-label
# classification head. NOTE: this rebinds `model`, which until here referred
# to the bare encoder used by extract_hidden_states.
num_labels = num_classes  # derived from the dataset instead of a hard-coded 28

# The config expects plain dicts for the label mappings; the int2str / str2int
# callables cannot be passed directly, so build the dicts from the label names.
id2label = dict(enumerate(labels_feature.names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    # Selects a per-label sigmoid / binary cross-entropy loss instead of softmax.
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
).to(device)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

In [16]:
batch_size = 64
# Number of full batches in the training split, used as the logging interval
# (about one log entry per epoch when training on a single device).
logging_steps = len(emotions_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-go_emotions"

training_args = TrainingArguments(
    # Output location and hub behaviour.
    output_dir=model_name,
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and logging.
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False,
    # Only takes effect once load_best_model_at_end=True is enabled (currently off).
    metric_for_best_model="f1",
)


In [17]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, ROC AUC and subset accuracy from raw logits.

    Args:
        predictions: Raw model outputs (logits), array-like of shape
            (n_samples, n_labels).
        labels: Binary indicator matrix of the same shape.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with the keys "f1", "roc_auc" and "accuracy".
    """
    # A sigmoid turns every logit into an independent per-label probability.
    logits = torch.as_tensor(predictions, dtype=torch.float32)
    probs = torch.sigmoid(logits).detach().cpu().numpy()
    # Threshold the probabilities into hard 0/1 predictions (plain NumPy, so
    # no torch tensor is mixed into NumPy indexing).
    y_pred = (probs >= threshold).astype(float)
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average="micro")
    # ROC AUC is a ranking metric: it has to be computed on the continuous
    # probabilities. Feeding the thresholded 0/1 predictions (as before)
    # collapses the ROC curve to a single operating point.
    roc_auc = roc_auc_score(y_true, probs, average="micro")
    # For multi-label input this is subset accuracy (all labels must match).
    accuracy = accuracy_score(y_true, y_pred)
    return {"f1": f1_micro_average, "roc_auc": roc_auc, "accuracy": accuracy}


def compute_metrics(p: EvalPrediction):
    """Adapt an EvalPrediction to multi_label_metrics.

    Args:
        p: Evaluation output carrying ``predictions`` and ``label_ids``.

    Returns:
        The metrics dict produced by multi_label_metrics.
    """
    raw = p.predictions
    # When the predictions come as a tuple, its first element is used as the logits.
    logits = raw[0] if isinstance(raw, tuple) else raw
    return multi_label_metrics(predictions=logits, labels=p.label_ids)


In [18]:
# Wire the classifier, the training configuration, the data splits and the
# metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [21]:
# Sanity-check forward pass on a single training example before training.
# Index the row first so that only one example is materialised instead of
# the whole `input_ids` column.
example = emotions_encoded["train"][0]
outputs = model(
    # Add a batch dimension and move every tensor to the model's device;
    # otherwise the call fails as soon as the model lives on a GPU.
    input_ids=example["input_ids"].unsqueeze(0).to(model.device),
    # Without the attention mask the model would attend to the padding tokens.
    attention_mask=example["attention_mask"].unsqueeze(0).to(model.device),
    labels=example["labels"].unsqueeze(0).to(model.device),
)
outputs

SequenceClassifierOutput(loss=tensor(0.6945, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.0899, -0.1429, -0.0323,  0.0251,  0.0442, -0.0063, -0.0305, -0.0515,
         -0.0188,  0.1309, -0.0107, -0.0904,  0.1558, -0.0173, -0.0075,  0.0347,
          0.0161, -0.0188, -0.0246, -0.0115, -0.0061, -0.0839, -0.0374, -0.0566,
         -0.0294,  0.0681, -0.0116, -0.1645]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [22]:
# Fine-tune the classifier using the settings in training_args.
# NOTE(review): the recorded run below was interrupted (KeyboardInterrupt),
# so no training metrics are available in this notebook.
trainer.train()




  0%|          | 0/1358 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Authenticate against the Hugging Face Hub (interactive prompt), then upload
# the trainer's model. `notebook_login` is imported in the import cell at the
# top of the notebook.
# NOTE(review): only run this after trainer.train() has completed; the run
# recorded above was interrupted.
notebook_login()
trainer.push_to_hub(commit_message="Training completed 20220608")