# Classifying the go_emotions dataset using a transformer as feature extractor: extracting the last hidden state of the [CLS] token

In [1]:
from pathlib import Path

from datasets import load_dataset
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoModel,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)


In [30]:
emotions = load_dataset("go_emotions", "simplified")


Reusing dataset go_emotions (/Users/alexanderjunge/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [31]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt, problem_type="multi_label_classification"
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_ckpt).to(device)


In [32]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)


emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)


Loading cached processed dataset at /Users/alexanderjunge/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-c7804d86c4774f9a.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /Users/alexanderjunge/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-bf0d75f0049b32b8.arrow


In [33]:
tokenizer.cls_token_id


101

In [34]:
tokenizer.model_input_names


['input_ids', 'attention_mask']

In [35]:
tokenizer.model_max_length


512

In [36]:
tokenizer.vocab_size


30522

In [37]:
emotions_encoded["train"][4]


{'text': 'Dirty Southern Wankers',
 'labels': [3],
 'id': 'ed0bdzj',
 'input_ids': [101,
  6530,
  2670,
  14071,
  11451,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [38]:
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


In [39]:
def extract_hidden_states(batch):
    inputs = {
        k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names
    }
    # Extract last hidden state
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Return vector for [CLS] token
    return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}


In [12]:
hidden_output_path = Path("go_emotions_hidden.npz")
if hidden_output_path.exists():
    print(f"Loading hidden states from {hidden_output_path}")
    loaded = np.load(str(hidden_output_path), allow_pickle=True)
    X_train = loaded["X_train"]
    X_valid = loaded["X_valid"]
    X_test = loaded["X_test"]
    y_train = loaded["y_train"]
    y_valid = loaded["y_valid"]
    y_test = loaded["y_test"]
else:
    print("Extracting hidden states")
    emotions_hidden = emotions_encoded.map(
        extract_hidden_states, batched=True, batch_size=100
    )

    print(emotions_hidden["train"].column_names)

    X_train = np.array(emotions_hidden["train"]["hidden_state"])
    X_valid = np.array(emotions_hidden["validation"]["hidden_state"])
    X_test = np.array(emotions_hidden["test"]["hidden_state"])
    print(X_train.shape, X_valid.shape, X_test.shape)

    y_train = np.array(emotions_hidden["train"]["labels"], dtype=object)
    y_valid = np.array(emotions_hidden["validation"]["labels"], dtype=object)
    y_test = np.array(emotions_hidden["test"]["labels"], dtype=object)
    print(y_train.shape, y_valid.shape, y_test.shape)

    np.savez_compressed(
        "go_emotions_hidden.npz",
        X_train=X_train,
        X_valid=X_valid,
        X_test=X_test,
        y_train=y_train,
        y_valid=y_valid,
        y_test=y_test,
    )


Loading hidden states from go_emotions_hidden.npz


## Shallow classifiers

In [13]:
mlb = MultiLabelBinarizer()
mlb.fit((t.tolist() for t in y_train))
y_train_mlb = mlb.transform((t.tolist() for t in y_train))
y_valid_mlb = mlb.transform((t.tolist() for t in y_valid))
y_test_mlb = mlb.transform((t.tolist() for t in y_test))


In [15]:
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train_mlb)


In [16]:
dummy_clf.score(X_valid, y_valid_mlb)


0.0

In [21]:
lr_clf = LogisticRegression(max_iter=3000)
lr_mo_clf = MultiOutputClassifier(lr_clf)
lr_mo_clf.fit(X_train, y_train_mlb)


In [22]:
lr_mo_clf.score(X_valid, y_valid_mlb)


0.29745669001105784

In [19]:
mlp_clf = MLPClassifier(random_state=1, max_iter=00)
mlp_clf.fit(X_train, y_train_mlb)




In [20]:
mlp_clf.score(X_valid, y_valid_mlb)


0.3092517508293402

## Fine-tuning the transformer

In [40]:
# FXIME - not sure if these apply 1:1 to multilabel classification
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}


In [41]:
num_labels = 28
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels, problem_type="multi_label_classification"
).to(device)


In [42]:
batch_size = 64
logging_steps = len(emotions_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-go_emotions"
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)


In [45]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
)


In [46]:
trainer.train()


  0%|          | 0/679 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.