In [None]:
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Hub checkpoint id; the tokenizer and the classifier below are both loaded from it.
MODEL = 'tabularisai/multilingual-sentiment-analysis'

In [2]:
# Load the Yelp full-review dataset; the repr below lists its splits and columns.
ds = load_dataset(path='Yelp/yelp_review_full')
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [3]:
# Carve a validation split out of the existing train split: 20% of the rows,
# drawn with a fixed seed so the partition is reproducible.
#
# `ds` is modified in place, so the guard keeps this cell idempotent: without
# it, re-running the cell would split the already-reduced train set a second
# time, silently shrinking 'train' and replacing 'val'.
if 'val' not in ds:
    split = ds['train'].train_test_split(test_size=0.2, seed=42)
    ds['train'] = split['train']
    ds['val'] = split['test']
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 520000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
    val: Dataset({
        features: ['label', 'text'],
        num_rows: 130000
    })
})

In [4]:
# Tokenizer that matches the MODEL checkpoint; used by `tokenize` below.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL)

def tokenize(examples):
    """Tokenize a batch of reviews to fixed-length model inputs.

    Args:
        examples: a batch mapping with a 'text' entry holding the review
            strings (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        The output of the module-level `tokenizer` for those texts, with every
        sequence truncated or padded to exactly 512 tokens.
    """
    # Pad to the fixed max_length rather than padding=True. Inside a batched
    # `map`, padding=True pads only to the longest row of that particular map
    # batch, so rows from different map batches can end up with different
    # lengths and cannot be stacked once training shuffles them together.
    # NOTE: dropping padding here and letting a padding data collator pad each
    # training batch would be cheaper, but requires the Trainer to use one.
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )

# Tokenize each split in batches of 512 rows, then drop the raw 'text' column
# so only the tokenizer outputs and 'label' remain.
train_ds = (
    ds['train']
    .map(tokenize, batched=True, batch_size=512)
    .remove_columns(['text'])
)
val_ds = (
    ds['val']
    .map(tokenize, batched=True, batch_size=512)
    .remove_columns(['text'])
)

Map:   0%|          | 0/520000 [00:00<?, ? examples/s]

Map:   0%|          | 0/130000 [00:00<?, ? examples/s]

In [6]:
# Accuracy scorer consumed by `compute_metrics` during evaluation.
metric = evaluate.load('accuracy')

# Sequence-classification model initialised from the MODEL checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy `metric`.

    Args:
        eval_pred: a `(logits, labels)` pair; the predicted class for each row
            is the argmax of its logits over the last axis.

    Returns:
        Whatever `metric.compute` returns for those predictions and labels.
    """
    scores, gold = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold,
    )

# Training configuration.
training_args = TrainingArguments(
    num_train_epochs=5,
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,  # 32 x 2 = 64 examples per optimizer step per device

    logging_steps=1000,
    per_device_eval_batch_size=64,
    metric_for_best_model='accuracy',
    eval_strategy='steps',  # no eval_steps given, so evaluation follows logging_steps
    save_strategy='best',   # checkpoint only when validation accuracy improves
    # Restore the best-accuracy checkpoint once training ends; otherwise the
    # in-memory model keeps the last-step weights, which may score worse than
    # the checkpoint that was saved.
    load_best_model_at_end=True,
    report_to=['tensorboard'],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # Passing the tokenizer makes the Trainer pad every batch to a common
    # length, so batching no longer depends on all rows having been pre-padded
    # to the same size, and saves the tokenizer files with each checkpoint.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy
1000,0.9459,0.862962,0.623931
2000,0.8574,0.82594,0.641923
3000,0.831,0.802849,0.650646
4000,0.8136,0.790355,0.655446
5000,0.7981,0.788356,0.657569
6000,0.7898,0.764712,0.664831
7000,0.783,0.773131,0.661854
8000,0.7811,0.754248,0.669146
9000,0.742,0.767627,0.664638
10000,0.7364,0.757764,0.670092


TrainOutput(global_step=40625, training_loss=0.7079644313401442, metrics={'train_runtime': 46664.1895, 'train_samples_per_second': 55.717, 'train_steps_per_second': 0.871, 'total_flos': 3.44433662976e+17, 'train_loss': 0.7079644313401442, 'epoch': 5.0})