In [5]:
from datasets import load_dataset

In [6]:
# Load the "emotion" dataset; the repr below shows train/validation/test splits
# with 'text' and 'label' features.
emotions = load_dataset("emotion")

In [7]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [8]:
# Convenience handle on the training split (used below to inspect rows and features).
train_ds = emotions["train"]

In [9]:
train_ds[:1]

{'text': ['i didnt feel humiliated'], 'label': [0]}

In [10]:
train_ds.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [11]:
from transformers import TFAutoModel, AutoTokenizer


In [12]:
# Checkpoint name shared by the tokenizer and both model loads in this notebook.
model_ckpt = 'distilbert-base-uncased'

In [13]:
# Load the checkpoint as a TensorFlow base model (the log below reports a
# TFDistilBertModel initialised from the PyTorch weights). Its outputs expose
# `last_hidden_state`, which extract_hidden_state() relies on.
tf_model = TFAutoModel.from_pretrained(model_ckpt)



AttributeError: module 'ml_dtypes' has no attribute 'float4_e2m1fn'


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [14]:
# Small smoke-test sentence plus the tokenizer that matches `model_ckpt`.
text = "this is a test"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [15]:
# Tokenize the sample sentence and ask for TensorFlow tensors.
inputs = tokenizer(text, return_tensors='tf')

In [16]:
print(inputs['input_ids'])

tf.Tensor([[ 101 2023 2003 1037 3231  102]], shape=(1, 6), dtype=int32)


In [17]:
# Rebuild the tokenizer output as a plain dict so it can be splatted into the model.
# NOTE(review): this rebinds `inputs`; the tokenizer output is presumably a
# BatchEncoding — confirm whether this copy is actually required.
inputs = {k:v for k,v in inputs.items()}

In [18]:
# Forward pass of the base model on the single tokenized sentence.
outputs = tf_model(**inputs)

In [19]:
print(outputs)

TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(1, 6, 768), dtype=float32, numpy=
array([[[-0.15651304, -0.18619642,  0.05277665, ..., -0.11881151,
          0.06620585,  0.54701555],
        [-0.35751337, -0.6483561 , -0.06178965, ..., -0.3040192 ,
          0.3507684 ,  0.5220682 ],
        [-0.27718478, -0.44594476,  0.18184273, ..., -0.09477935,
         -0.00757451,  0.99582815],
        [-0.28408548, -0.39167717,  0.3752554 , ..., -0.21505737,
         -0.11725175,  1.0526478 ],
        [ 0.26608253, -0.509364  , -0.31801307, ..., -0.42029804,
          0.014442  , -0.214895  ],
        [ 0.94406104,  0.01117249, -0.47139436, ...,  0.14394683,
         -0.72878313, -0.16194965]]], dtype=float32)>, hidden_states=None, attentions=None)


In [20]:
outputs.last_hidden_state[:, 0].shape

TensorShape([1, 768])

In [22]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [23]:
def extract_hidden_state(batch):
    """Return the position-0 hidden state of ``tf_model`` for every row of ``batch``.

    Only the entries of ``batch`` whose key appears in
    ``tokenizer.model_input_names`` are forwarded to the model; everything
    else (e.g. the label column) is ignored.

    Returns:
        dict with a single key ``"hidden_state"`` holding a NumPy array.
    """
    model_inputs = {}
    for name, value in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = value

    hidden = tf_model(**model_inputs).last_hidden_state
    # Keep only index 0 along the second axis (the first token of each sequence).
    first_token = hidden[:, 0]
    return {"hidden_state": first_token.numpy()}

In [24]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with padding and truncation enabled."""
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [25]:
# Tokenize every split. batched=True hands tokenize() lists of rows; with
# batch_size=None each split is processed as one batch, so padding=True pads
# to the longest example of that split.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

In [26]:
import pandas as pd
pd.DataFrame(emotions_encoded["train"])

Unnamed: 0,text,label,input_ids,attention_mask
0,i didnt feel humiliated,0,"[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,i can go from feeling so hopeless to so damned...,0,"[101, 1045, 2064, 2175, 2013, 3110, 2061, 2062...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,im grabbing a minute to post i feel greedy wrong,3,"[101, 10047, 9775, 1037, 3371, 2000, 2695, 104...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."
3,i am ever feeling nostalgic about the fireplac...,2,"[101, 1045, 2572, 2412, 3110, 16839, 9080, 128...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,i am feeling grouchy,3,"[101, 1045, 2572, 3110, 24665, 7140, 11714, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
15995,i just had a very brief time in the beanbag an...,0,"[101, 1045, 2074, 2018, 1037, 2200, 4766, 2051...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15996,i am now turning and i feel pathetic that i am...,0,"[101, 1045, 2572, 2085, 3810, 1998, 1045, 2514...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15997,i feel strong and good overall,1,"[101, 1045, 2514, 2844, 1998, 2204, 3452, 102,...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
15998,i feel like this was such a rude comment and i...,3,"[101, 1045, 2514, 2066, 2023, 2001, 2107, 1037...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [27]:
# Switch the output format in place: the listed columns are now returned as
# TensorFlow tensors (this mutates `emotions_encoded` rather than creating a new object).
emotions_encoded.set_format("tensorflow", columns=["input_ids", "attention_mask", "label"])

In [28]:
emotions_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [29]:
# Add a 'hidden_state' column to every split by running the base model batch-wise.
# NOTE(review): extract_hidden_state reads the global `tf_model`, which is later
# rebound to a classification model — this cell must run before that rebinding.
emotions_hidden = emotions_encoded.map(extract_hidden_state, batched=True)



Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [30]:
import numpy as np

In [31]:
emotions_hidden

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 2000
    })
})

In [32]:
# Training features (hidden states) and training labels as NumPy arrays.
# NOTE(review): `X_label` holds the *labels* of the training split; the usual
# name would be `y_train`. Kept as-is because later cells reference it.
X_train = np.array(emotions_hidden["train"][ "hidden_state"])
X_label = np.array(emotions_hidden["train"]["label"])

In [33]:
# Validation features (hidden states) and validation labels as NumPy arrays.
# NOTE(review): despite the names, `y_train` is the *validation feature matrix*
# and `y_label` the validation labels (both come from the "validation" split).
# Consider X_valid / y_valid; kept as-is because later cells reference these names.
y_train= np.array(emotions_hidden["validation"]["hidden_state"])
y_label = np.array(emotions_hidden["validation"]["label"])

In [34]:
X_train, X_label

(array([[-0.11675121,  0.09857128, -0.12962954, ...,  0.05871081,
          0.35432705,  0.40420735],
        [-0.03236267, -0.03231487, -0.19572589, ..., -0.17465726,
          0.35463768,  0.30276555],
        [ 0.03974663,  0.20223337,  0.14227116, ..., -0.11406814,
          0.33937812,  0.3958312 ],
        ...,
        [-0.00339851, -0.09585522,  0.05843524, ..., -0.04272681,
          0.24959207,  0.30761614],
        [ 0.06660251,  0.17334345,  0.12896657, ...,  0.0611862 ,
          0.29038274,  0.4684416 ],
        [ 0.01668797,  0.10127114, -0.00731698, ..., -0.06493645,
          0.34540543,  0.2199358 ]], dtype=float32),
 array([0, 0, 3, ..., 1, 3, 0], dtype=int64))

In [35]:
y_train, y_label

(array([[-0.13440834,  0.21829036,  0.12347551, ..., -0.00531261,
          0.40892625,  0.55575687],
        [ 0.08009711,  0.08259223, -0.04189613, ...,  0.05444114,
          0.32190704,  0.36251083],
        [ 0.01471848,  0.17494832,  0.04296539, ...,  0.08803076,
          0.32207602,  0.25796783],
        ...,
        [ 0.11886087,  0.09684597, -0.05445658, ...,  0.03132069,
          0.19341004,  0.34015718],
        [ 0.12177449,  0.01580077, -0.10124985, ..., -0.10977945,
          0.25836036,  0.14554884],
        [ 0.08095254, -0.02955443, -0.03955503, ..., -0.09794138,
          0.19548455,  0.212671  ]], dtype=float32),
 array([0, 0, 2, ..., 1, 1, 1], dtype=int64))

In [36]:
X_train.shape, y_train.shape, X_label.shape, y_label.shape

((16000, 768), (2000, 768), (16000,), (2000,))

In [37]:
from sklearn.preprocessing import MinMaxScaler

In [38]:
# Min-max scale the training features.
# NOTE(review): the fitted scaler is discarded, so the validation features cannot
# be transformed with the same parameters, and `X_scaled` is not used by any
# later cell visible here.
X_scaled  = MinMaxScaler().fit_transform(X_train)

In [39]:
X_scaled

array([[0.36425316, 0.5860988 , 0.39729947, ..., 0.7459279 , 0.50480574,
        0.6927483 ],
       [0.45346364, 0.45611957, 0.3350113 , ..., 0.5397175 , 0.50513655,
        0.60421926],
       [0.52969325, 0.6890426 , 0.65353453, ..., 0.5932557 , 0.4888866 ,
        0.68543833],
       ...,
       [0.48408282, 0.3930195 , 0.5745289 , ..., 0.6562948 , 0.39327317,
        0.60845244],
       [0.55808365, 0.6603529 , 0.6409965 , ..., 0.7481152 , 0.43671125,
        0.748806  ],
       [0.50531703, 0.58878   , 0.51256496, ..., 0.6366698 , 0.49530506,
        0.5319331 ]], dtype=float32)

In [40]:
# Take the class count from the dataset's ClassLabel feature rather than from
# the labels that happen to occur in the training array: the feature is the
# authoritative definition and stays correct even if a class is absent from
# the sampled data.
num_labels = emotions["train"].features["label"].num_classes
num_labels

6

In [41]:
# Load the same checkpoint with a sequence-classification head sized to `num_labels`
# (the log below reports the classifier weights as newly initialised).
# NOTE(review): this rebinds `tf_model`, which previously held the base model.
# extract_hidden_state() reads `.last_hidden_state` from the global `tf_model`,
# while the later helpers read `.logits`, so re-running earlier cells after this
# one will not behave the same. A separate name for the classifier would avoid that.
from transformers import TFAutoModelForSequenceClassification
tf_model = (TFAutoModelForSequenceClassification.from_pretrained(model_ckpt,num_labels= num_labels))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [42]:
# Column names the model consumes; passed as `columns` to to_tf_dataset below.
tokenizer_columns = tokenizer.model_input_names

In [43]:
tokenizer_columns

['input_ids', 'attention_mask']

In [44]:
# Batch size used for both the training and validation tf.data pipelines below.
batch_size = 32

In [45]:
# Training pipeline: model inputs come from `tokenizer_columns`, targets from "label".
# shuffle=True so the example order is randomised for training instead of the
# model always seeing the same fixed batch sequence (the validation pipeline
# keeps shuffle=False).
tf_train_dataset = emotions_encoded["train"].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols="label",
    batch_size=batch_size,
    shuffle=True,
)

In [46]:
# Validation pipeline: same columns and batch size as training, order left untouched.
tf_validation_dataset = emotions_encoded["validation"].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols="label",
    batch_size=batch_size,
    shuffle=False,
)

In [47]:
import tensorflow as tf

In [48]:
# Configure training: Adam with learning rate 5e-5, sparse categorical
# cross-entropy on integer labels, and accuracy as the reported metric.
# from_logits=True because the loss is fed the model's raw logits, not probabilities.
tf_model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5),
                loss =  tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics= tf.keras.metrics.SparseCategoricalAccuracy())

In [49]:
# Fine-tune for 2 epochs, evaluating on the validation pipeline after each epoch.
# NOTE(review): the returned History object is not stored, so the training
# curves cannot be inspected afterwards.
tf_model.fit(tf_train_dataset,validation_data=tf_validation_dataset,epochs=2)

Epoch 1/2

Epoch 2/2


<tf_keras.src.callbacks.History at 0x2048d2d1eb0>

In [79]:
def forward_pass_with_label(batch):
    """Run ``tf_model`` on ``batch`` and report per-example loss and predicted class.

    Only the entries of ``batch`` named in ``tokenizer.model_input_names`` are
    passed to the model; ``batch["label"]`` supplies the targets for the loss.

    Returns:
        dict with ``"loss"`` and ``"predicted_label"``, both NumPy arrays.
    """
    model_inputs = {
        name: tensor
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }
    logits = tf_model(**model_inputs).logits

    # Reduction.NONE keeps one loss value per example instead of reducing over the batch.
    per_example_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True,
        reduction=tf.keras.losses.Reduction.NONE,
    )
    per_example_loss = per_example_loss_fn(batch["label"], logits)
    predicted = tf.argmax(logits, axis=-1)

    return {
        "loss": per_example_loss.numpy(),
        "predicted_label": predicted.numpy(),
    }

In [80]:
# Make sure the model columns and the label come back as TensorFlow tensors,
# then add 'loss' and 'predicted_label' columns to the validation split.
# NOTE(review): this overwrites emotions_encoded["validation"] in place, so
# earlier displays of `emotions_encoded` no longer reflect its current columns.
emotions_encoded.set_format("tensorflow", columns=['input_ids', 'attention_mask', 'label'])
emotions_encoded["validation"]= emotions_encoded['validation'].map(forward_pass_with_label,batched=True, batch_size=16)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [83]:
emotions_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'loss', 'predicted_label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [82]:
# Switch the output format to pandas so slicing a split yields a DataFrame.
# NOTE(review): this cell's execution count (82) is lower than the preceding
# cell's (83) — the notebook was run out of order; verify with Restart & Run All.
emotions_encoded.set_format("pandas")

In [84]:
def label_int2str(row):
    """Translate integer class id(s) into label name(s) via the training split's ``label`` feature."""
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)

In [86]:
# Build a frame of validation texts with true label, predicted label and loss.
# NOTE(review): the data comes from the *validation* split even though the
# frame is called `df_test`; the name is kept because later cells use it.
columns = ["text", "label", "predicted_label", "loss"]
# .copy() gives an independent frame: assigning columns on a column-subset
# selection otherwise triggers pandas' chained-assignment (SettingWithCopy) warning.
df_test = emotions_encoded["validation"][:][columns].copy()
# Replace integer class ids by their human-readable names.
df_test["label"] = df_test["label"].apply(label_int2str)
df_test["predicted_label"] = df_test["predicted_label"].apply(label_int2str)

In [88]:
df_test.sort_values("loss", ascending =False).head(10)

Unnamed: 0,text,label,predicted_label,loss
405,i have been feeling extraordinarily indecisive...,fear,joy,8.844959
1801,i feel that he was being overshadowed by the s...,love,sadness,7.735939
415,im kind of embarrassed about feeling that way ...,love,joy,5.930142
1961,i feel more well rested though my sinuses stil...,joy,sadness,5.556807
1111,im lazy my characters fall into categories of ...,joy,fear,5.374475
966,i find myself trying to discreetly smell his b...,sadness,fear,5.319673
1124,someone acting stupid in public,anger,sadness,5.212431
1367,that day i was alone at home after coming home...,fear,sadness,5.084856
1032,i began to feel woeful as i stared into the ab...,sadness,joy,4.015267
367,i feel assaulted by all directions,sadness,fear,3.934339


In [89]:
df_test.sort_values("loss", ascending =True).head(10)

Unnamed: 0,text,label,predicted_label,loss
1082,i feel the cool night air against my face,joy,joy,0.000803
71,i must say to get to this point where i feel n...,joy,joy,0.000813
1619,i sat in the car and read my book which suited...,joy,joy,0.000822
197,i feel so cool like ice t huhwe neun gatda beo...,joy,joy,0.000826
107,i feel the cool edge of the barrel against my ...,joy,joy,0.000828
876,i feel like the cool mom,joy,joy,0.000828
329,i have had my treasury selection on the front ...,joy,joy,0.000831
1657,i get up to refill my coffee and feel that ple...,joy,joy,0.000832
715,i feel mellow i feel free and i feel completel...,joy,joy,0.000833
1727,i still didnt feel satisfied with and about my...,joy,joy,0.000835


In [127]:
# Example sentence for the prediction helper below (rebinds `text`, which
# earlier held the tokenizer smoke-test string).
text = "I saw a movie and it was really awesome"

In [134]:
def prediction(text):
    """Classify a single text with the fine-tuned ``tf_model``.

    Args:
        text: the string to classify.

    Returns:
        dict with
          * ``"label"``: a one-element list containing the predicted label name
            (list form kept for backward compatibility with existing callers),
          * ``"score"``: the softmax probability of that label.
    """
    # truncation=True mirrors tokenize(): over-long inputs are cut to the
    # model's maximum length instead of overflowing it.
    tokenized_txt = tokenizer(text, truncation=True, return_tensors="tf")
    output = tf_model(**tokenized_txt)
    probabilities = tf.nn.softmax(output.logits, axis=-1)
    pred_label = tf.argmax(probabilities, axis=-1)
    pred_score = tf.reduce_max(probabilities, axis=-1)
    # Hand int2str plain Python ints rather than a TensorFlow tensor.
    pred_label_ids = pred_label.numpy().tolist()
    pred_score_val = pred_score.numpy()[0]
    return {"label": label_int2str(pred_label_ids), "score": pred_score_val}


In [135]:
prediction(text)

{'label': ['joy'], 'score': 0.9945387}