In [1]:
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Hub checkpoint id; passed to `from_pretrained` for both the tokenizer and the model.
MODEL = 'tabularisai/multilingual-sentiment-analysis'

In [2]:
# Load the Yelp full-review dataset from the Hub; the bare `ds` on the last
# line makes the notebook display the resulting object.
ds = load_dataset(path='Yelp/yelp_review_full')
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [3]:
# Draw a reproducible (seed=42) 10% sample from the original train split.
small_subset = ds['train'].train_test_split(train_size=0.1, seed=42)['train']

# Carve that sample into 80% training rows and 20% validation rows.
split = small_subset.train_test_split(test_size=0.2, seed=42)
small_train, small_val = split['train'], split['test']

# Sanity check: report the row count at every stage.
for caption, rows in (
    ('original train:', ds['train']),
    ('subsample (10%):', small_subset),
    ('small train (80% of subsample):', small_train),
    ('small val (20% of subsample):', small_val),
):
    print(caption, len(rows))

original train: 650000
subsample (10%): 65000
small train (80% of subsample): 52000
small val (20% of subsample): 13000


In [4]:
# Accuracy scorer from the `evaluate` library, loaded once and reused below.
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: An object that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the argmax predictions.
    """
    scores, references = eval_pred
    # The index of the largest score along the last axis is the predicted class.
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)

def model_init(trial):
    """Return a freshly loaded classifier from the ``MODEL`` checkpoint.

    ``trial`` is accepted but never read, so every call loads the same
    pretrained checkpoint.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    return fresh_model

def hp_space(trial):
    """Sample one hyperparameter configuration from ``trial``.

    Args:
        trial: Object exposing ``suggest_float`` and ``suggest_categorical``.

    Returns:
        dict with the keys ``learning_rate`` (log-scale float in
        [5e-6, 5e-4]), ``per_device_train_batch_size`` (16 or 32) and
        ``gradient_accumulation_steps`` (1, 2 or 4).
    """
    sampled = {}
    # The suggestions are requested in a fixed order: lr, batch size, accumulation.
    sampled['learning_rate'] = trial.suggest_float(
        'learning_rate', 5e-6, 5e-4, log=True
    )
    sampled['per_device_train_batch_size'] = trial.suggest_categorical(
        'per_device_train_batch_size', [16, 32]
    )
    sampled['gradient_accumulation_steps'] = trial.suggest_categorical(
        'gradient_accumulation_steps', [1, 2, 4]
    )
    return sampled

def hp_name(trial):
    """Build a run label of the form ``trial_<number>`` from ``trial.number``."""
    return 'trial_{}'.format(trial.number)

# Tokenizer loaded from the same checkpoint id (MODEL) as the classifier.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL)

def tokenize(examples):
    """Tokenize a batch of reviews without padding them.

    Args:
        examples: Batch mapping whose ``'text'`` entry is a list of strings.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example.

    Padding is intentionally NOT applied here. With ``padding=True`` every
    example would be padded to the longest text in its ``map`` chunk, and
    that padding would be stored in the dataset and processed on every
    training/eval step. The Trainer is given the tokenizer as
    ``processing_class``, so its default collator (DataCollatorWithPadding)
    pads each mini-batch only to that batch's longest sequence.
    """
    return tokenizer(examples['text'], truncation=True, max_length=512)

# Tokenize both splits in chunks of 512 rows and drop the raw 'text' column
# in the same pass.
# NOTE: this reassigns small_train / small_val, so re-running it after it has
# already run fails because `tokenize` reads 'text', which has been removed.
# Re-run the split cell first.
map_options = dict(batched=True, batch_size=512, remove_columns=['text'])
small_train = small_train.map(tokenize, **map_options)
small_val = small_val.map(tokenize, **map_options)

# Settings shared by every trial. Learning rate, train batch size and
# gradient accumulation are not set here; they are sampled by `hp_space`.
training_args = TrainingArguments(
    # Evaluation
    eval_strategy='steps',
    per_device_eval_batch_size=64,
    metric_for_best_model='accuracy',
    # Checkpointing
    save_strategy='no',
    # Logging
    report_to=['tensorboard'],
    logging_dir='runs',
)

# No model instance is passed (the `model` argument defaults to None);
# the model comes from `model_init` instead.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=small_train,
    eval_dataset=small_val,
    compute_metrics=compute_metrics,
)

# Run 20 Optuna trials, maximizing the objective.
# NOTE(review): despite the plural name, the printed result is a single
# BestRun record (run_id, objective, hyperparameters, run_summary), not a
# list of trials. Confirm before iterating over it.
best_trials = trainer.hyperparameter_search(
    hp_space=hp_space,
    hp_name=hp_name,
    n_trials=20,
    backend='optuna',
    direction='maximize',
)

# View best trial results
print("Best trial:", best_trials, sep="\n")

Map:   0%|          | 0/52000 [00:00<?, ? examples/s]

Map:   0%|          | 0/13000 [00:00<?, ? examples/s]

[I 2025-11-29 13:36:52,895] A new study created in memory with name: no-name-1fed67ce-cdff-4a00-8011-c5dabbf34317


Step,Training Loss,Validation Loss,Accuracy
500,1.3209,1.176812,0.481615
1000,1.1614,1.122707,0.514615
1500,1.0984,1.115884,0.522538
2000,1.0139,1.051019,0.538769
2500,0.9671,1.083897,0.527385
3000,0.9697,1.057081,0.547692
3500,0.9009,1.0436,0.559077
4000,0.8251,1.061784,0.551846
4500,0.8216,1.061319,0.558154


[I 2025-11-29 14:13:31,347] Trial 0 finished with value: 0.5581538461538461 and parameters: {'learning_rate': 0.00022290597699294604, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 2}. Best is trial 0 with value: 0.5581538461538461.


Step,Training Loss,Validation Loss,Accuracy
500,1.0018,0.945491,0.585846
1000,0.9159,0.894021,0.607923
1500,0.8859,0.863582,0.615385
2000,0.8017,0.855123,0.627692
2500,0.7576,0.88553,0.620846
3000,0.7639,0.875465,0.633538
3500,0.72,0.856977,0.634462
4000,0.6582,0.882733,0.635538
4500,0.6572,0.872344,0.635923


[I 2025-11-29 14:50:11,160] Trial 1 finished with value: 0.6359230769230769 and parameters: {'learning_rate': 2.013520416023338e-05, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 2}. Best is trial 1 with value: 0.6359230769230769.


Step,Training Loss,Validation Loss,Accuracy
500,1.1026,0.975195,0.576154
1000,0.904,0.937313,0.592538
1500,0.8053,0.926634,0.606462
2000,0.6559,0.99663,0.608538


[I 2025-11-29 15:21:17,988] Trial 2 finished with value: 0.6085384615384616 and parameters: {'learning_rate': 0.00017763162167707802, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2}. Best is trial 1 with value: 0.6359230769230769.


Step,Training Loss,Validation Loss,Accuracy
500,0.9715,0.909331,0.604769
1000,0.8324,0.8648,0.627154
1500,0.7303,0.870182,0.632846
2000,0.6156,0.934963,0.638692


[I 2025-11-29 15:53:37,654] Trial 3 finished with value: 0.6386923076923077 and parameters: {'learning_rate': 6.211076512403111e-05, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2}. Best is trial 3 with value: 0.6386923076923077.


Step,Training Loss,Validation Loss,Accuracy
500,0.9746,0.903017,0.600692
1000,0.8699,0.865635,0.621154
1500,0.7996,0.866049,0.624615
2000,0.7589,0.861357,0.631692


[I 2025-11-29 16:25:06,642] Trial 4 finished with value: 0.6316923076923077 and parameters: {'learning_rate': 1.5634921716913148e-05, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2}. Best is trial 3 with value: 0.6386923076923077.


Step,Training Loss,Validation Loss,Accuracy
500,1.0108,0.933953,0.584231


[I 2025-11-29 16:31:32,185] Trial 5 pruned. 


Step,Training Loss,Validation Loss,Accuracy
500,0.962,0.891577,0.608692
1000,0.8531,0.856412,0.628385
1500,0.7744,0.873531,0.629308
2000,0.718,0.868315,0.634077


[I 2025-11-29 17:04:10,159] Trial 6 finished with value: 0.6340769230769231 and parameters: {'learning_rate': 2.238414478087125e-05, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 4}. Best is trial 3 with value: 0.6386923076923077.


Step,Training Loss,Validation Loss,Accuracy
500,1.0091,0.943467,0.585923


[I 2025-11-29 17:07:59,179] Trial 7 pruned. 


Step,Training Loss,Validation Loss,Accuracy
500,1.6243,1.635516,0.238462


[I 2025-11-29 17:10:24,560] Trial 8 pruned. 


Step,Training Loss,Validation Loss,Accuracy
500,1.0063,0.930518,0.585154


[I 2025-11-29 17:17:11,602] Trial 9 pruned. 


Step,Training Loss,Validation Loss,Accuracy
500,0.9122,0.876316,0.627846
1000,0.6753,0.918065,0.638769


[I 2025-11-29 17:45:54,025] Trial 10 finished with value: 0.6387692307692308 and parameters: {'learning_rate': 9.007381400028066e-05, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 4}. Best is trial 10 with value: 0.6387692307692308.


Step,Training Loss,Validation Loss,Accuracy
500,0.9048,0.86021,0.628462
1000,0.6825,0.912292,0.638385


[I 2025-11-29 18:14:34,570] Trial 11 finished with value: 0.6383846153846154 and parameters: {'learning_rate': 7.650215309607524e-05, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 4}. Best is trial 10 with value: 0.6387692307692308.


Step,Training Loss,Validation Loss,Accuracy
500,0.9034,0.862626,0.629846
1000,0.6854,0.905856,0.640846


[I 2025-11-29 18:43:15,261] Trial 12 finished with value: 0.6408461538461538 and parameters: {'learning_rate': 7.26692474158267e-05, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 4}. Best is trial 12 with value: 0.6408461538461538.


Step,Training Loss,Validation Loss,Accuracy
500,0.9209,0.865028,0.629846
1000,0.6703,0.947537,0.630692


[I 2025-11-29 19:11:56,051] Trial 13 finished with value: 0.6306923076923077 and parameters: {'learning_rate': 0.00011152097837988845, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 4}. Best is trial 12 with value: 0.6408461538461538.


Step,Training Loss,Validation Loss,Accuracy
500,1.6186,1.609553,0.196538


[I 2025-11-29 19:23:45,387] Trial 14 pruned. 


Step,Training Loss,Validation Loss,Accuracy
500,0.9105,0.854668,0.623769
1000,0.736,0.865332,0.636538


[I 2025-11-29 19:52:27,100] Trial 15 finished with value: 0.6365384615384615 and parameters: {'learning_rate': 3.775063166235328e-05, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 4}. Best is trial 12 with value: 0.6408461538461538.


Step,Training Loss,Validation Loss,Accuracy
500,0.9233,0.862633,0.628385
1000,0.671,0.939766,0.635769


[I 2025-11-29 20:21:08,286] Trial 16 finished with value: 0.6357692307692308 and parameters: {'learning_rate': 0.00011523364997135204, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 4}. Best is trial 12 with value: 0.6408461538461538.


Step,Training Loss,Validation Loss,Accuracy
500,0.9043,0.853567,0.627
1000,0.7151,0.87802,0.638462


[I 2025-11-29 20:49:48,270] Trial 17 finished with value: 0.6384615384615384 and parameters: {'learning_rate': 4.829577255645158e-05, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 4}. Best is trial 12 with value: 0.6408461538461538.


Step,Training Loss,Validation Loss,Accuracy
500,0.9581,0.888054,0.613846


[I 2025-11-29 20:56:25,665] Trial 18 pruned. 


Step,Training Loss,Validation Loss,Accuracy
500,1.0662,0.984317,0.575846


[I 2025-11-29 21:00:04,272] Trial 19 pruned. 


Best trial:
BestRun(run_id='12', objective=0.6408461538461538, hyperparameters={'learning_rate': 7.26692474158267e-05, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 4}, run_summary=None)


AttributeError: 'str' object has no attribute 'objective'