In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt

In [3]:
#load twitter evaluation dataset from huggging face hub
from datasets import load_dataset
dataset = load_dataset("tweet_eval", "emotion")

Reusing dataset tweet_eval (C:\Users\hp\.cache\huggingface\datasets\tweet_eval\emotion\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
#see some dataset details
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})


In [5]:
#selecting the training dataset
train = dataset["train"]
print(train)

Dataset({
    features: ['text', 'label'],
    num_rows: 3257
})


In [6]:
#see the train features
print(train.features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=4, names=['anger', 'joy', 'optimism', 'sadness'], id=None)}


In [7]:
#convert this to a pandas dataframe
dataset.set_format("pandas")
train_df = pd.DataFrame(dataset["train"][:])

In [8]:
train_df.head()

Unnamed: 0,text,label
0,“Worry is a down payment on a problem you may ...,2
1,My roommate: it's okay that we can't spell bec...,0
2,No but that's so cute. Atsu was probably shy a...,1
3,Rooneys fucking untouchable isn't he? Been fuc...,0
4,it's pretty depressing when u hit pan on ur fa...,3


In [9]:
#check for null values
train_df.isnull().sum()

text     0
label    0
dtype: int64

In [10]:
#convert the label inteers to corresponding label names
def label_int2str(x):
    return dataset["train"].features["label"].int2str(x)

In [11]:
#add the label name
train_df["label_name"] = train_df["label"].apply(label_int2str)

In [12]:
train_df.head()

Unnamed: 0,text,label,label_name
0,“Worry is a down payment on a problem you may ...,2,optimism
1,My roommate: it's okay that we can't spell bec...,0,anger
2,No but that's so cute. Atsu was probably shy a...,1,joy
3,Rooneys fucking untouchable isn't he? Been fuc...,0,anger
4,it's pretty depressing when u hit pan on ur fa...,3,sadness


In [13]:
#check the distribution of the labels and see how imbalanced it is
train_df["label_name"].value_counts(ascending = True)

optimism     294
joy          708
sadness      855
anger       1400
Name: label_name, dtype: int64

In [14]:
#convert the dataset back to type that is suitable for HuggingFace tokenisers and models
dataset.reset_format()

In [30]:
#get the three datasets: train, validation and test
train = dataset["train"]
validation = dataset["validation"]
test = dataset["test"]

### Tokenizer

In [15]:
from transformers import AutoTokenizer

#will be using the Distill-Bert model
model_ckpt = "distilbert-base-uncased"

#get the tokenizer associated with this model
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = model_ckpt)

In [16]:
#check important information about the tokeniser
print(f"Vocab size is: {tokenizer.vocab_size}")
print(f"Model max lengh is: {tokenizer.model_max_length}")
print(f"Model input names are: {tokenizer.model_input_names}")

Vocab size is: 30522
Model max lengh is: 512
Model input names are: ['input_ids', 'attention_mask']


In [17]:
#see if tokenizer is working
text = "This is an example of tokenization"
output = tokenizer(text)
tokens = tokenizer.convert_ids_to_tokens(output['input_ids'])

In [18]:
print(f"Tokenized output: {output}")
print()
print(f"Tokenized tokens: {tokens}")
print()
print(f"Tokenzied text: {tokenizer.convert_tokens_to_string(tokens)}")

Tokenized output: {'input_ids': [101, 2023, 2003, 2019, 2742, 1997, 19204, 3989, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

Tokenized tokens: ['[CLS]', 'this', 'is', 'an', 'example', 'of', 'token', '##ization', '[SEP]']

Tokenzied text: [CLS] this is an example of tokenization [SEP]


In [19]:
#tokenize the entire dataset
def tokenize(batch):
    return tokenizer(batch["text"], padding = True, truncation = True)

tokenized_dataset = dataset.map(tokenize, batched = True, batch_size = None)

Loading cached processed dataset at C:\Users\hp\.cache\huggingface\datasets\tweet_eval\emotion\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-04ef56d34ce54f9e.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\hp\.cache\huggingface\datasets\tweet_eval\emotion\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-8b08a90b450f515e.arrow


In [20]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 374
    })
})

In [21]:
#create a pandas df and see what's there now
check_tokens = tokenized_dataset.copy()
check_tokens_df = pd.DataFrame(check_tokens["train"][:])
check_tokens_df.head()

Unnamed: 0,text,label,input_ids,attention_mask
0,“Worry is a down payment on a problem you may ...,2,"[101, 1523, 4737, 2003, 1037, 2091, 7909, 2006...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,My roommate: it's okay that we can't spell bec...,0,"[101, 2026, 18328, 1024, 2009, 1005, 1055, 310...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,No but that's so cute. Atsu was probably shy a...,1,"[101, 2053, 2021, 2008, 1005, 1055, 2061, 1014...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,Rooneys fucking untouchable isn't he? Been fuc...,0,"[101, 24246, 2015, 8239, 19662, 10875, 3085, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,it's pretty depressing when u hit pan on ur fa...,3,"[101, 2009, 1005, 1055, 3492, 2139, 24128, 204...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


### Training the text classifier

In [22]:
from transformers import TFAutoModelForSequenceClassification

num_labels = 4
model = TFAutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = num_labels)

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

In [23]:
#feed the proceeded token_ids into the model in batches and apply padding to make them same length
batch_size = 64
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer = tokenizer, return_tensors = "tf")

In [31]:
#now create the tensorflow datasets from tokenized dataset
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns = ["input_ids", "attention_mask"],
    label_cols = ["label"],
    shuffle = True,
    batch_size = batch_size,
    collate_fn = data_collator
)

tf_valid_dataset = tokenized_dataset["validation"].to_tf_dataset(
    columns = ["input_ids", "attention_mask"],
    label_cols = ["label"],
    shuffle = False,
    batch_size = batch_size,
    collate_fn = data_collator
)

In [32]:
#import necessary imports for training
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras import metrics
from tensorflow.keras import losses

In [34]:
#compile the model
model.compile(
    optimizer = optimizers.Adam(learning_rate = 5e-5),
    loss = losses.SparseCategoricalCrossentropy(from_logits = True),
    metrics = metrics.SparseCategoricalAccuracy()
)

In [35]:
#fit the model for 5 epochs
model.fit(tf_train_dataset,
          validation_data = tf_valid_dataset,
          epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1f702bd8cd0>

In [36]:
#test the model on some sentences
outputs = model.predict(tokenizer("I am feeling very happy")["input_ids"])
outputs["logits"][0].tolist()

[-1.9897310733795166,
 4.209113597869873,
 -1.9471944570541382,
 -0.4496954679489136]

In [37]:
#apply softmax and pick the label with the maximum probability
import numpy as np

label_int = np.argmax(tf.keras.layers.Softmax()(outputs["logits"][0].tolist()))
print(label_int.item())

1


In [38]:
print(label_int2str(label_int.item()))

joy
