In [1]:
!pip install datasets
!git lfs install
!pip install transformers evaluate accelerate
!pip3 install torch torchvision torchaudio

Git LFS initialized.


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
import numpy as np
import evaluate
from evaluate import load
from transformers import TrainingArguments, Trainer

In [3]:
!git clone "https://huggingface.co/datasets/SetFit/emotion"

fatal: destination path 'emotion' already exists and is not an empty directory.


In [4]:
data_frame = load_dataset("./emotion")
data_frame

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
})

In [5]:
data_frame['train'][0]

{'text': 'i didnt feel humiliated', 'label': 0, 'label_text': 'sadness'}

In [6]:
data_frame['train'].to_pandas().head(10)

Unnamed: 0,text,label,label_text
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger
5,ive been feeling a little burdened lately wasn...,0,sadness
6,ive been taking or milligrams or times recomme...,5,surprise
7,i feel as confused about life as a teenager or...,4,fear
8,i have been with petronas for years i feel tha...,1,joy
9,i feel romantic too,2,love


In [7]:
dataset = load_dataset("SetFit/emotion", split="train[:]+validation[:]")
print(dataset)

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 18000
})


In [8]:
data = dataset.train_test_split(test_size=0.2)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 14400
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 3600
    })
})

In [9]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token == None:
  tokenizer.pad_token = tokenizer.eos_token

In [10]:
def preprocess_function(p):
  return tokenizer(p["text"], truncation=True, padding=True)

In [11]:
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/14400 [00:00<?, ? examples/s]

Map:   0%|          | 0/3600 [00:00<?, ? examples/s]

In [12]:
tokenized_data["train"][0]

{'text': 'i refers of course though i cant help feeling somehow ironically in retrospect to loudons son with kate mcgarrigle the rather talented himself rufus wainwright',
 'label': 1,
 'label_text': 'joy',
 'input_ids': [72,
  10229,
  286,
  1781,
  996,
  1312,
  18548,
  1037,
  4203,
  7599,
  32532,
  287,
  23583,
  284,
  7812,
  684,
  3367,
  351,
  479,
  378,
  36650,
  70,
  3258,
  328,
  293,
  262,
  2138,
  12356,
  2241,
  374,
  3046,
  385,
  266,
  391,
  29995,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [13]:
id2label = {0:"sadness", 1:"joy", 2:"love", 3:"anger", 4:"fear", 5:"surprise"}
label2id = {"sadness":0, "joy":1, "love":2, "anger":3, "fear":4, "surprise":5}

In [14]:
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=6, id2label=id2label, label2id=label2id)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
np.unique(data['train']['label'], return_counts=True)

(array([0, 1, 2, 3, 4, 5]),
 array([4180, 4827, 1160, 1970, 1741,  522], dtype=int64))

In [17]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)  # Convert logits to predicted class indices
    metric = load("f1")

    # Compute F1 scores for all classes
    f1_scores = metric.compute(predictions=predictions, references=labels, average=None)

    # Extract scores as a dictionary
    class_wise_f1 = {f"class_{i}": score for i, score in enumerate(f1_scores["f1"])}

    # Optionally, calculate the macro average (mean of F1 scores across all classes)
    macro_f1 = np.mean(f1_scores["f1"])

    # Combine both into the result
    class_wise_f1["macro_f1"] = macro_f1

    return class_wise_f1

In [18]:
model.config.pad_token_id = model.config.eos_token_id
print(model.config)

GPT2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "sadness",
    "1": "joy",
    "2": "love",
    "3": "anger",
    "4": "fear",
    "5": "surprise"
  },
  "initializer_range": 0.02,
  "label2id": {
    "anger": 3,
    "fear": 4,
    "joy": 1,
    "love": 2,
    "sadness": 0,
    "surprise": 5
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj"

In [19]:
training_args = TrainingArguments(
    output_dir = "best_of_best_model",
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 8,
    num_train_epochs = 5,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    push_to_hub = False,
)



In [20]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_data["train"],
    eval_dataset = tokenized_data["test"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

  trainer = Trainer(


In [26]:
trainer.train()

Epoch,Training Loss,Validation Loss,Class 0,Class 1,Class 2,Class 3,Class 4,Class 5,Macro F1
1,1.063,0.324596,0.913931,0.933667,0.783217,0.861991,0.862454,0.769231,0.854082
2,0.2453,0.196882,0.957975,0.944796,0.839937,0.912281,0.897347,0.819549,0.895314
3,0.1648,0.175626,0.966635,0.949249,0.834846,0.932136,0.898765,0.804348,0.897663
4,0.1367,0.167206,0.968914,0.948454,0.832765,0.924195,0.903654,0.784483,0.893744
5,0.1123,0.159845,0.969033,0.9525,0.858553,0.928131,0.902635,0.788618,0.899912


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

TrainOutput(global_step=2250, training_loss=0.3444014231363932, metrics={'train_runtime': 16952.8339, 'train_samples_per_second': 4.247, 'train_steps_per_second': 0.133, 'total_flos': 3111862726950912.0, 'train_loss': 0.3444014231363932, 'epoch': 5.0})