In [None]:
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Checkpoint identifier shared by the tokenizer and the sequence-classification
# model, both of which are loaded from it via `from_pretrained` further down.
MODEL='tabularisai/multilingual-sentiment-analysis'

In [None]:
# Load the Yelp/yelp_review_full dataset; the bare `ds` on the last line makes
# the notebook display its summary (splits, feature names and row counts).
ds = load_dataset("Yelp/yelp_review_full")
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [2]:
# Draw a reproducible (seed=42) random 10% sample from the original train split.
small_subset = ds['train'].train_test_split(train_size=0.1, seed=42)['train']

# Divide that sample 80/20; the 'test' part of this second split serves as the
# validation set.
split = small_subset.train_test_split(test_size=0.2, seed=42)
small_train, small_val = split['train'], split['test']

# Sanity-check the size of every stage.
print(f"original train: {len(ds['train'])}")
print(f"subsample (10%): {len(small_subset)}")
print(f"small train (80% of subsample): {len(small_train)}")
print(f"small val (20% of subsample): {len(small_val)}")

original train: 650000
subsample (10%): 65000
small train (80% of subsample): 52000
small val (20% of subsample): 13000


In [None]:
# Accuracy metric obtained through the `evaluate` library; compute_metrics
# calls its `compute` method to score predictions against reference labels.
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Score a set of evaluation predictions with the module-level `metric`.

    Args:
        eval_pred: A two-item iterable unpacked as ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions and the
        reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_ids)

def model_init(trial):
    """Load a sequence-classification model from the MODEL checkpoint.

    Every call goes through ``from_pretrained`` again, so each caller gets its
    own model object.

    Args:
        trial: Not used by this function; it is only part of the signature.

    Returns:
        The model loaded from ``MODEL``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(MODEL)
    return classifier

def optuna_hp_space(trial):
    """Sample one hyperparameter configuration from the given trial.

    Args:
        trial: Object exposing ``suggest_float`` and ``suggest_categorical``.

    Returns:
        A dict with two entries:
        ``"learning_rate"`` -- a float suggested between 1e-6 and 1e-4 with
        ``log=True``;
        ``"per_device_train_batch_size"`` -- one of 8, 16 or 32.
    """
    # The learning rate is suggested first, matching the order of the dict keys.
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    return {
        "learning_rate": learning_rate,
        "per_device_train_batch_size": batch_size,
    }

tokenizer = AutoTokenizer.from_pretrained(MODEL)


def tokenize_batch(batch):
    """Turn a batch of raw reviews into model inputs.

    Args:
        batch: Mapping with a "text" entry holding the raw review strings.

    Returns:
        The tokenizer output for those strings, truncated to the tokenizer's
        maximum length. No padding is applied here; the Trainer's default
        collator pads each batch when a tokenizer is passed as
        ``processing_class``.
    """
    return tokenizer(batch["text"], truncation=True)


# The splits only carry 'label' and 'text'. The Trainer drops columns the
# model's forward() does not accept, so without this step it would be left with
# labels only and fail on the first batch. Tokenize up front and keep the
# results under new names so the raw splits stay untouched.
small_train_tok = small_train.map(tokenize_batch, batched=True, remove_columns=["text"])
small_val_tok = small_val.map(tokenize_batch, batched=True, remove_columns=["text"])

# No eval_steps is given, so with eval_strategy='steps' the evaluation interval
# falls back to logging_steps. save_strategy='best' keeps a checkpoint whenever
# the metric named by metric_for_best_model improves.
training_args = TrainingArguments(
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    logging_steps=200,
    metric_for_best_model='accuracy',
    eval_strategy='steps',
    save_strategy='best',
    output_dir='checkpoints',
)

# model=None together with model_init lets the Trainer build a new model for
# every hyperparameter-search trial.
trainer = Trainer(
    model=None,
    args=training_args,
    train_dataset=small_train_tok,
    eval_dataset=small_val_tok,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    model_init=model_init,
)

# Launch the Optuna-backed search: 20 trials drawn from optuna_hp_space, with
# the objective being maximized.
best_trials = trainer.hyperparameter_search(
    hp_space=optuna_hp_space,
    backend="optuna",
    n_trials=20,
    direction="maximize",
)