In [None]:
import json

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Hub identifier of the pretrained checkpoint used for both the model and the tokenizer.
MODEL = 'tabularisai/multilingual-sentiment-analysis'

In [None]:
# Load the 'Yelp/yelp_review_full' dataset; the summary displayed below shows
# 'train' and 'test' splits, each with 'label' and 'text' features.
ds = load_dataset('Yelp/yelp_review_full')
ds  # bare last expression so the notebook renders the DatasetDict summary

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [None]:
# Draw a seeded 10% random sample from the original train split.
small_subset = ds['train'].train_test_split(train_size=0.1, seed=42)['train']

# Partition that sample 80/20; the 'test' side serves as the validation set.
split = small_subset.train_test_split(test_size=0.2, seed=42)
small_train, small_val = split['train'], split['test']

# Report the size of every stage as a quick sanity check.
print('original train:', len(ds['train']))
print('subsample (10%):', len(small_subset))
print('small train (80% of subsample):', len(small_train))
print('small val (20% of subsample):', len(small_val))

original train: 650000
subsample (10%): 65000
small train (80% of subsample): 52000
small val (20% of subsample): 13000


In [None]:
# Accuracy scorer loaded once and reused by `compute_metrics`.
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        compared against ``labels``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

def model_init(trial):
    """Load a new sequence-classification model from the ``MODEL`` checkpoint.

    Args:
        trial: Accepted for the caller's benefit; not used by this function.

    Returns:
        The object produced by
        ``AutoModelForSequenceClassification.from_pretrained(MODEL)``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(MODEL)
    return classifier

def hp_space(trial):
    """Sample one hyperparameter configuration from ``trial``.

    Args:
        trial: Object exposing ``suggest_float`` and ``suggest_categorical``.

    Returns:
        A dict with the keys ``learning_rate`` (log-scale float between 5e-6
        and 5e-4), ``per_device_train_batch_size`` (16 or 32) and
        ``gradient_accumulation_steps`` (1, 2 or 4).
    """
    search_space = {}
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 5e-6, 5e-4, log=True)
    search_space['per_device_train_batch_size'] = trial.suggest_categorical(
        'per_device_train_batch_size', [16, 32]
    )
    search_space['gradient_accumulation_steps'] = trial.suggest_categorical(
        'gradient_accumulation_steps', [1, 2, 4]
    )
    return search_space

def hp_name(trial):
    """Build the run label for a trial from its ``number`` attribute.

    Args:
        trial: Object exposing a ``number`` attribute.

    Returns:
        The string ``'trial_<number>'``.
    """
    run_label = f'trial_{trial.number}'
    return run_label

# Tokenizer matching the pretrained ``MODEL`` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)


def tokenize(examples):
    """Tokenize a batch of reviews, truncating to 512 tokens without padding.

    Padding is deliberately not applied here. With ``padding=True`` inside a
    batched ``map`` every example is padded to the longest review in its
    512-example chunk, and those pad tokens are stored in the dataset and fed
    through the model on every step. The Trainer receives this tokenizer as
    ``processing_class``, so its default collator pads each training batch
    only to that batch's longest sequence.

    Args:
        examples: Batch mapping that contains a ``'text'`` entry.

    Returns:
        The tokenizer's encoding for ``examples['text']``.
    """
    return tokenizer(examples['text'], truncation=True, max_length=512)

# Tokenize from the untouched `split` partitions instead of rebinding
# `small_train`/`small_val` from themselves: the previous form removed the
# 'text' column from the very variables it read, so re-running this cell failed
# because 'text' was already gone. `remove_columns` drops the raw text in the
# same pass, after `tokenize` has consumed it.
small_train = split['train'].map(tokenize, batched=True, batch_size=512, remove_columns=['text'])
small_val = split['test'].map(tokenize, batched=True, batch_size=512, remove_columns=['text'])

# Training configuration: step-based evaluation scored by accuracy, no
# checkpoint saving, and TensorBoard logging under 'runs'.
training_args = TrainingArguments(
    eval_strategy='steps',
    metric_for_best_model='accuracy',
    per_device_eval_batch_size=64,
    save_strategy='no',
    report_to=['tensorboard'],
    logging_dir='runs',
)

# No model instance is passed; the Trainer is given `model_init` instead.
trainer = Trainer(
    model=None,
    model_init=model_init,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=small_train,
    eval_dataset=small_val,
    compute_metrics=compute_metrics,
)

# Run 20 Optuna trials over `hp_space`, maximising the objective; each trial is
# labelled by `hp_name`.
best_trials = trainer.hyperparameter_search(
    hp_space=hp_space,
    hp_name=hp_name,
    backend='optuna',
    direction='maximize',
    n_trials=20,
)

# With a single `direction`, `hyperparameter_search` returns one BestRun named
# tuple; a list of them is returned only for multi-objective searches.
# Iterating over a lone BestRun yields its fields (run_id, objective, ...), not
# runs, so normalise to a list before reporting.
best_runs = best_trials if isinstance(best_trials, list) else [best_trials]

# View best trial results
print("Best trial:")
print(best_trials)

# Access detailed results
for run in best_runs:
    print(f"Accuracy: {run.objective}, Params: {run.hyperparameters}")

# Save results to file for analysis. Each run is written as a dict so the field
# names survive in the JSON; `default=str` covers a `run_summary` value that is
# not natively JSON-serialisable.
with open('hyperparameter_search_results.json', 'w', encoding='utf-8') as f:
    json.dump([run._asdict() for run in best_runs], f, indent=2, default=str)

[I 2025-11-29 13:04:47,584] A new study created in memory with name: no-name-aba36176-4735-4f4c-90d5-4ab2142ce350
[W 2025-11-29 13:04:48,158] Trial 0 failed with parameters: {'learning_rate': 0.00015169521149697708, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 4, 'num_train_epochs': 3, 'warmup_steps': 178, 'weight_decay': 0.12415274740207666} because of the following error: ValueError("You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']").
Traceback (most recent call last):
  File "/home/neleac/yelp-review-classifier/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/home/neleac/yelp-review-classifier/.venv/lib/python3.12/site-packages/transformers/integrations/integration_utils.py", line 277, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/home/n

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']