# Classifying the go_emotions dataset using a transformer as a feature extractor: extracting the last hidden state of the [CLS] token

In [1]:
from pathlib import Path

from datasets import load_dataset
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from transformers import AutoModel, AutoTokenizer


In [2]:
# Pin the dataset configuration explicitly instead of relying on the library's
# default ("simplified"), so the notebook keeps loading the same label schema
# even if the default ever changes.
emotions = load_dataset("go_emotions", "simplified")


No config specified, defaulting to: go_emotions/simplified
Reusing dataset go_emotions (/Users/alexanderjunge/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Checkpoint used for both the tokenizer and the encoder.
model_ckpt = "distilbert-base-uncased"

# Prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer and encoder come from the same checkpoint; the encoder is
# moved to the selected device (the tokenizer itself stays on the host).
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def tokenize(batch):
    """Tokenize the ``"text"`` field of ``batch`` with padding and truncation.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded


# batch_size=None hands each split to ``tokenize`` as one single batch, so
# padding=True pads every example to the longest sequence in its split.
emotions_encoded = emotions.map(tokenize, batch_size=None, batched=True)


Loading cached processed dataset at /Users/alexanderjunge/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-7174f90f64318b69.arrow
Loading cached processed dataset at /Users/alexanderjunge/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-ffbaf71060cee16c.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
# Vocabulary id the tokenizer uses for its [CLS] token.
tokenizer.cls_token_id


101

In [6]:
# Input field names reported by the tokenizer; extract_hidden_states uses this
# list to decide which batch columns are forwarded to the model.
tokenizer.model_input_names


['input_ids', 'attention_mask']

In [7]:
# Maximum sequence length the tokenizer reports for this checkpoint.
tokenizer.model_max_length


512

In [8]:
# Number of entries in the tokenizer's vocabulary.
tokenizer.vocab_size


30522

In [9]:
# Inspect one tokenized training example (index 4): original fields plus the
# columns added by tokenize().
emotions_encoded["train"][4]


{'text': 'Dirty Southern Wankers',
 'labels': [3],
 'id': 'ed0bdzj',
 'input_ids': [101,
  6530,
  2670,
  14071,
  11451,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [10]:
def extract_hidden_states(batch):
    """Run the encoder on a batch and return the hidden state at position 0.

    Only the columns named in ``tokenizer.model_input_names`` are moved to
    ``device`` and passed to ``model``; all other columns are ignored.

    Args:
        batch: Mapping from column name to a tensor for the current batch.

    Returns:
        A dict with a single key, ``"hidden_state"``, holding a NumPy array
        with the last hidden state of the first token of every sequence.
    """
    model_inputs = {
        name: tensor.to(device)
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }

    # Inference only: gradients are not needed for feature extraction.
    with torch.no_grad():
        outputs = model(**model_inputs)

    # Position 0 of every sequence is where the tokenizer places [CLS].
    cls_vectors = outputs.last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}


In [11]:
# Return the listed columns as torch tensors; extract_hidden_states calls
# .to(device) on the model inputs, so they must be tensors.
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


In [12]:
# Extracting hidden states is expensive, so the feature and label arrays are
# cached on disk and reloaded on later runs instead of being recomputed.
hidden_output_path = Path("go_emotions_hidden.npz")
if hidden_output_path.exists():
    print(f"Loading hidden states from {hidden_output_path}")
    # NOTE(review): allow_pickle=True is needed because the label arrays are
    # saved with dtype=object. Unpickling can execute arbitrary code, so only
    # load a cache file that this notebook wrote itself.
    # The context manager closes the underlying archive once the arrays have
    # been read into memory.
    with np.load(str(hidden_output_path), allow_pickle=True) as loaded:
        X_train = loaded["X_train"]
        X_valid = loaded["X_valid"]
        X_test = loaded["X_test"]
        y_train = loaded["y_train"]
        y_valid = loaded["y_valid"]
        y_test = loaded["y_test"]
else:
    print("Extracting hidden states")
    emotions_hidden = emotions_encoded.map(
        extract_hidden_states, batched=True, batch_size=100
    )

    print(emotions_hidden["train"].column_names)

    # Feature matrices: one hidden-state vector per example.
    X_train = np.array(emotions_hidden["train"]["hidden_state"])
    X_valid = np.array(emotions_hidden["validation"]["hidden_state"])
    X_test = np.array(emotions_hidden["test"]["hidden_state"])
    print(X_train.shape, X_valid.shape, X_test.shape)

    # Labels are variable-length per example, hence the object dtype.
    y_train = np.array(emotions_hidden["train"]["labels"], dtype=object)
    y_valid = np.array(emotions_hidden["validation"]["labels"], dtype=object)
    y_test = np.array(emotions_hidden["test"]["labels"], dtype=object)
    print(y_train.shape, y_valid.shape, y_test.shape)

    # Write to the same path that the existence check above looks at, so the
    # cache location is defined in exactly one place.
    np.savez_compressed(
        str(hidden_output_path),
        X_train=X_train,
        X_valid=X_valid,
        X_test=X_test,
        y_train=y_train,
        y_valid=y_valid,
        y_test=y_test,
    )

Loading hidden states from go_emotions_hidden.npz


In [13]:
# Turn every per-example label container into a plain Python list up front.
train_label_lists = [labels.tolist() for labels in y_train]
valid_label_lists = [labels.tolist() for labels in y_valid]
test_label_lists = [labels.tolist() for labels in y_test]

# Learn the label set from the training split only, then encode all three
# splits into binary indicator matrices with that same label ordering.
mlb = MultiLabelBinarizer()
mlb.fit(train_label_lists)
y_train_mlb = mlb.transform(train_label_lists)
y_valid_mlb = mlb.transform(valid_label_lists)
y_test_mlb = mlb.transform(test_label_lists)

In [14]:
# Show the indicator row of the first training example (kept 2-D for display).
# The former trailing `[:]` was a no-op slice and has been dropped.
y_train_mlb[:1]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1]])

In [15]:
# Baseline that ignores the features: the "most_frequent" strategy predicts
# the most common value seen in training for each label column.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train_mlb)


In [16]:
# score() is mean accuracy; for multi-label targets scikit-learn documents
# this as subset accuracy (every label of a sample must be predicted right).
dummy_clf.score(X_valid, y_valid_mlb)

0.0

In [21]:
# One independent logistic regression per label column, wrapped by
# MultiOutputClassifier; max_iter is raised to 3000 iterations.
# NOTE(review): this cell's execution count (21) is out of sequence with the
# score cell that follows it (18) — re-run the notebook top to bottom so the
# reported score is known to come from this exact fit.
lr_clf = LogisticRegression(max_iter=3000)
lr_mo_clf = MultiOutputClassifier(lr_clf)
lr_mo_clf.fit(X_train, y_train_mlb)

In [18]:
# Subset accuracy on the validation split: a sample counts as correct only if
# all of its label columns are predicted correctly.
lr_mo_clf.score(X_valid, y_valid_mlb)

0.29708809436048655

In [19]:
# Multi-layer perceptron fitted directly on the multi-label indicator matrix.
# random_state is fixed so the fit is repeatable; max_iter caps training at 300.
mlp_clf = MLPClassifier(random_state=1, max_iter=300)
mlp_clf.fit(X_train, y_train_mlb)




In [20]:
# Subset accuracy on the validation split, directly comparable to the dummy
# and logistic-regression scores computed with the same inputs.
mlp_clf.score(X_valid, y_valid_mlb)

0.3092517508293402