In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

import torch
import torch.nn.functional as F

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression

In [2]:
ds = load_dataset("emotion")
ds.set_format(type="pandas")
df = ds['train'][:]

def label_int2str(row):
    return ds['train'].features['label'].int2str(row)

df['label_name'] = df['label'].apply(label_int2str)
df.head()

Using custom data configuration default
Reusing dataset emotion (C:\Users\chris\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger


In [3]:
model_ckpt = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt).to("cuda")

In [10]:
def tokenize(batch):
    if type(batch['text']) == "str":
        return tokenizer(batch['text'], padding=True, truncation=True)
    else:
        text = str(batch['text'])
        return tokenizer(text, padding=True, truncation=True)

enc_ds = ds.map(tokenize, load_from_cache_file=False)
print(enc_ds['train'].features)

100%|██████████| 16000/16000 [00:15<00:00, 1017.55ex/s]
100%|██████████| 2000/2000 [00:01<00:00, 1017.27ex/s]
100%|██████████| 2000/2000 [00:01<00:00, 1023.53ex/s]

{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], names_file=None, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}





In [12]:
print(tokenizer.model_input_names)
def extract_hidden_states(batch):
    inputs = {k:torch.tensor(v).to("cuda") for k,v in batch.items() if k in tokenizer.model_input_names}
    with torch.no_grad():
        outputs = model(**inputs).last_hidden_state
    return {"hidden_state": outputs[:,0].cpu().numpy()}
    
enc_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])
ds_hidden = enc_ds.map(extract_hidden_states, batched=True, batch_size=1)

['input_ids', 'token_type_ids', 'attention_mask']


  inputs = {k:torch.tensor(v).to("cuda") for k,v in batch.items() if k in tokenizer.model_input_names}
100%|██████████| 16000/16000 [01:41<00:00, 158.00ba/s]
100%|██████████| 2000/2000 [00:12<00:00, 163.95ba/s]
100%|██████████| 2000/2000 [00:12<00:00, 164.42ba/s]


In [13]:
print(ds_hidden['train'][:5])

{'attention_mask': [tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])], 'hidden_state': tensor([[-0.2062,  0.3054,  0.1314,  ...,  0.3351,  0.0351, -0.4032],
        [-0.0041,  0.3205, -0.0872,  ...,  0.1761, -0.0223, -0.3161],
        [-0.2710,  0.3522,  0.0447,  ...,  0.2467,  0.1068, -0.3156],
        [ 0.0905,  0.5637, -0.3885,  ...,  0.1472, -0.2884, -0.1310],
        [-0.0049,  0.1839, -0.0715,  ...,  0.2255, -0.0468, -0.2650]]), 'input_ids': [tensor([  101,  1014,  1045,  2134,  2102,  2514, 26608,  2171,  1024,  3793,
         1010, 26718, 18863,  1024,  4874,   102]), tensor([  101,  1014,  1045,  2064,  2175,  2013,  3110,  2061, 20625,  2000,
         2061,  963

  return np.array(array, copy=False, **self.np_array_kwargs)


In [20]:
X_train = np.array(ds_hidden['train']['hidden_state'])
X_valid = np.array(ds_hidden['validation']['hidden_state'])
Y_train = np.array(ds_hidden['train']['label'])
Y_valid = np.array(ds_hidden['validation']['label'])
print(X_train.shape, X_valid.shape)
print(Y_train.shape, Y_valid.shape)

(16000, 384) (2000, 384)
(16000,) (2000,)


In [16]:
X_scaled = MinMaxScaler().fit_transform(X_train)
mapper = TSNE(n_components=2, metric="cosine").fit(X_scaled)
df_embed = pd.DataFrame(mapper.embedding_, columns=["X","Y"])
df_embed['label'] = Y_train
df_embed.head()

Unnamed: 0,X,Y,label
0,-39.538387,3.298877,0
1,-16.601639,8.595325,0
2,-33.1698,-60.807358,3
3,28.395744,53.454262,2
4,-32.307083,43.052593,3


In [21]:
lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train, Y_train)
lr_clf.score(X_valid, Y_valid)

0.561

In [49]:
X = df_embed[["X","Y"]]
lr_feature_clf = LogisticRegression(max_iter=3000)
lr_feature_clf.fit(X, df_embed['label'])
lr_feature_clf.score(X, df_embed['label'])

0.3515

In [50]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, Y_train)
dummy_clf.score(X_train, Y_train)

0.335125