In [13]:
!pip install optuna

Collecting optuna
  Using cached optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Using cached alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Using cached optuna-4.4.0-py3-none-any.whl (395 kB)
Using cached alembic-1.16.4-py3-none-any.whl (247 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0


1. Load Datasets(Go Emotions) + MultiLabelBinarizer

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset

dataset = load_dataset("go_emotions", 'simplified')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/350k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer

In [5]:
mlb = MultiLabelBinarizer()
mlb.fit(dataset['train']['labels'])

for split in ['train', 'test', 'validation']:
  binarized = mlb.transform(dataset[split]['labels'])
  dataset[split] = dataset[split].add_column('binarized_labels', binarized.tolist())

In [6]:
dataset['train'][0]

{'text': "My favourite food is anything I didn't have to cook myself.",
 'labels': [27],
 'id': 'eebbqej',
 'binarized_labels': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1]}

2. Tokenizer and Tokenization using DistilBert(because its more lightly and efficiently than BERT)

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW

In [8]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
def tokenize(example):
  tokens = (
      tokenizer(example['text'], padding='max_length', truncation=True, max_length=128)
  )
  tokens['labels'] = example['binarized_labels']
  return tokens

encode_dataset = dataset.map(tokenize, batched=True)

for split in ['train', 'test', 'validation']:
  encode_dataset[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

--DataLoader, to make it more efficient i set batch_size 8(for train) and 16 (for test and valid), num_workers 0

In [10]:
from torch.utils.data import DataLoader

In [11]:
train_loader = DataLoader(encode_dataset['train'], batch_size=8, shuffle=True, num_workers=0, pin_memory=True)
test_loader = DataLoader(encode_dataset['test'], batch_size=16, shuffle=False, num_workers=0, pin_memory=True)
val_loader = DataLoader(encode_dataset['validation'], batch_size=16, shuffle=False, num_workers=0, pin_memory=True)

3. Optuna for optimisation hyperparameters and tuning LoRA to make distilbert more lightly but still efficient without losing accuracy and precission

In [14]:
import optuna
import numpy as np
import torch
from torch.nn import BCEWithLogitsLoss
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import f1_score, classification_report

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def objective(trial):
  lr = trial.suggest_float('lr', 1e-5, 5e-5, log=True)
  r = trial.suggest_int('lora_r', 4, 16)
  alpha = trial.suggest_int('lora_alpha', 16, 64)

  model = AutoModelForSequenceClassification.from_pretrained(
      'distilbert-base-uncased',
      num_labels = len(mlb.classes_),
      problem_type = 'multi_label_classification'
  )

  config = LoraConfig(
      r = r,
      lora_alpha = alpha,
      task_type = TaskType.SEQ_CLS,
      lora_dropout = 0.1,
      target_modules= ['q_lin', 'v_lin']
  )

  model = get_peft_model(model, config).to(device)

  optimizer = AdamW(model.parameters(), lr=lr)

  loss_fn = BCEWithLogitsLoss()

  model.train()
  for _ in range(1):
    for batch in train_loader:
      batch = {k:v.to(device) for k, v in batch.items()}
      optimizer.zero_grad()
      outputs = model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
      loss = loss_fn(outputs.logits, batch['labels'].float())
      loss.backward()
      optimizer.step()

  model.eval()
  all_preds, all_labels = [], []
  with torch.no_grad():
    for batch in val_loader:
      batch = {k:v.to(device) for k, v in batch.items()}
      outputs = model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
      preds = (torch.sigmoid(outputs.logits) > 0.5).int().cpu().numpy()
      labels = batch['labels'].cpu().numpy()
      all_preds.extend(preds)
      all_labels.extend(labels)

  y_pred = np.concatenate(all_preds)
  y_true = np.concatenate(all_labels)
  return f1_score(y_true, y_pred, average='macro', zero_division=0)

--

*   Optuna learning or study the train data with 10 times trial
*   from that study also to know the value of best parameters or LoRA to optimising.



In [16]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials = 3)
best_params = study.best_params
print('Best Hyperparameters: ', best_params)

[I 2025-07-19 12:30:10,978] A new study created in memory with name: no-name-1018a1e2-e2bf-4071-b5af-a0c83e419798


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-07-19 12:35:41,132] Trial 0 finished with value: 0.7033734326294462 and parameters: {'lr': 1.4976818044943143e-05, 'lora_r': 4, 'lora_alpha': 47}. Best is trial 0 with value: 0.7033734326294462.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-07-19 12:41:07,490] Trial 1 finished with value: 0.7324172340144843 and parameters:

Best Hyperparameters:  {'lr': 2.6884213419214947e-05, 'lora_r': 6, 'lora_alpha': 49}


4. Retrain model using best param from optuna

In [17]:
print('Retraining model from best parameters')
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels = len(mlb.classes_),
    problem_type = 'multi_label_classification'
)

Retraining model from best parameters


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
config = LoraConfig(
    r = best_params['lora_r'],
    lora_alpha = best_params['lora_alpha'],
    task_type = TaskType.SEQ_CLS,
    lora_dropout = 0.1,
    target_modules = ['q_lin', 'v_lin']
)

model = get_peft_model(model, config).to(device)
optimizer = AdamW(model.parameters(), lr=best_params['lr'])
loss_fn = BCEWithLogitsLoss()

In [19]:
model.train()
for _ in range(3):
  for batch in train_loader:
    batch = {k:v.to(device) for k, v in batch.items()}
    optimizer.zero_grad()
    outputs = model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
    loss = loss_fn(outputs.logits, batch['labels'].float())
    loss.backward()
    optimizer.step()

In [22]:
print('Evaluating on test set...')
model.eval()
all_pred, all_true = [], []
with torch.no_grad():
  for batch in test_loader:
    batch = {k:v.to(device) for k,v in batch.items()}
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    probs = torch.sigmoid(outputs.logits)
    preds = (probs > 0.5).int().cpu().numpy()
    labels = batch['labels'].cpu().numpy()
    all_pred.append(preds)
    all_true.append(labels)

Evaluating on test set...


In [27]:
label_names = dataset['train'].features['labels'].feature.names
print("\nClassification Report on Test Set:")
print(classification_report(np.concatenate(all_true), np.concatenate(all_pred),
                            target_names=label_names, zero_division=0))


Classification Report on Test Set:
                precision    recall  f1-score   support

    admiration       0.74      0.59      0.65       504
     amusement       0.78      0.83      0.81       264
         anger       0.62      0.30      0.41       198
     annoyance       0.52      0.04      0.08       320
      approval       0.80      0.15      0.25       351
        caring       0.64      0.13      0.22       135
     confusion       0.64      0.22      0.33       153
     curiosity       0.56      0.40      0.47       284
        desire       0.66      0.30      0.41        83
disappointment       0.88      0.05      0.09       151
   disapproval       0.51      0.20      0.29       267
       disgust       0.88      0.19      0.31       123
 embarrassment       1.00      0.11      0.20        37
    excitement       0.75      0.20      0.32       103
          fear       0.79      0.49      0.60        78
     gratitude       0.94      0.89      0.91       352
         gr