Install dependencies

In [None]:
!pip install transformers
!pip install datasets
!pip install peft
!pip install evaluate

Import dependencies


In [2]:
import evaluate
import numpy as np
import torch
from datasets import load_dataset, DatasetDict, Dataset
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer

Load the Hugging Face dataset 'emotion'

In [None]:
dataset = load_dataset("emotion")

Explore the dataset, consisting of train, validation and test dataset

In [4]:
display(dataset.shape)

train = dataset['train']
emotion_labels = set()

for labels in train:
  emotion_labels.add(labels['label'])

print("Labels in the dataset: ", emotion_labels)

{'train': (16000, 2), 'validation': (2000, 2), 'test': (2000, 2)}

Labels in the dataset:  {0, 1, 2, 3, 4, 5}


From https://huggingface.co/datasets/dair-ai/emotion dataset documentation, we know what the labels represent:

0: sadness, 1: joy, 2: love, 3: anger, 4: fear, 5: surprise

Another way to approach this is by checking the features

In [5]:
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

Let's know look at an example:

In [6]:
dataset['train'][9]

{'text': 'i feel romantic too', 'label': 2}

Load DistilBERT transformer model

In [7]:
model_checkpoint = 'distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=6)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Check the model properties

In [8]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

Import the tokenizer and create a function to tokenize the whole dataset

In [9]:
def tokenize(rows):
    return tokenizer(rows['text'], padding="max_length", truncation=True)

model_tokenizer = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_tokenizer)

dataset.set_format(type=None)

tokenized_datasets = dataset.map(tokenize, batched=True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenize our example

In [11]:
print("Example: ", dataset['train'][9]['text'])
tokenized_example = tokenizer(dataset['train'][9]['text'])
print("Tokenized example: ", tokenized_example)

Example:  i feel romantic too
Tokenized example:  {'input_ids': [101, 1045, 2514, 6298, 2205, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}


Initiate a data collator with padding

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Define the evaluation metrics

In [13]:
accuracy = evaluate.load("accuracy")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Define the PeftModel parameters

In [15]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

Check the parameters of the initiated PeftModel



In [16]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False)

Create a PeftModel from the configuration and base model

In [17]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 632,070 || all params: 67,590,156 || trainable%: 0.9351509708011326


Define the training arguments

In [19]:
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6057,0.515942,{'accuracy': 0.8325}
2,0.4814,0.444906,{'accuracy': 0.8665}
3,0.4182,0.395972,{'accuracy': 0.8865}
4,0.4073,0.383062,{'accuracy': 0.896}
5,0.37,0.373954,{'accuracy': 0.8965}


Trainer is attempting to log a value of "{'accuracy': 0.8325}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.8665}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.8865}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.896}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.8965}" of type <class 'dict'> for key "eval/accuracy" as a scalar. Th

TrainOutput(global_step=20000, training_loss=0.5122261444091797, metrics={'train_runtime': 3134.893, 'train_samples_per_second': 25.519, 'train_steps_per_second': 6.38, 'total_flos': 1.075348537344e+16, 'train_loss': 0.5122261444091797, 'epoch': 5.0})

Finally, evaluate your model on a completely new test dataset

In [24]:
test_dataset = tokenized_datasets["test"]

results = trainer.evaluate(eval_dataset=test_dataset)

print(results)

Trainer is attempting to log a value of "{'accuracy': 0.8895}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.377439945936203, 'eval_accuracy': {'accuracy': 0.8895}, 'eval_runtime': 34.1385, 'eval_samples_per_second': 58.585, 'eval_steps_per_second': 14.646, 'epoch': 5.0}
