In [None]:
from datasets import load_dataset
import torch
import transformers
from transformers import AutoTokenizer, AutoConfig, PretrainedConfig, RobertaForSequenceClassification
from transformers.models.roberta.modeling_roberta import RobertaClassificationHead

In [None]:
# Pull only the first 100 rows of the training split so the notebook
# stays quick to run end to end.
dataset = load_dataset(
    path="yelp_review_full",
    split="train[:100]",
)

In [None]:
# Echo the loaded dataset so its repr is shown as this cell's output.
dataset

In [None]:
# Checkpoint shared by the tokenizer, the encoder config and the model.
model_id = 'roberta-base'

# Human-readable class names, taken from the dataset's `label` feature so
# the id <-> label mappings carry real names instead of an int -> int
# identity mapping, and so the number of classes is defined in one place.
label_names = dataset.features["label"].names

# Config for the encoder.
roberta_classifier_config = AutoConfig.from_pretrained(
    model_id,
    finetuning_task="text-classification",
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

# Config for the classification head. These are all the
# parameters a `RobertaClassificationHead` requires.
roberta_classification_head_config = PretrainedConfig()

roberta_classification_head_config.classifier_dropout = 0.1
# Width of the head's hidden (bottleneck) layer, not the encoder's hidden size.
roberta_classification_head_config.hidden_size = 64
# Keep the head's output size in sync with the encoder config's label set.
roberta_classification_head_config.num_labels = roberta_classifier_config.num_labels

In [None]:
# Instantiate tokenizer.
roberta_tokenizer = AutoTokenizer.from_pretrained(model_id)

# Instantiate RoBERTa model from the same checkpoint as the tokenizer and
# config (`model_id`), so changing the checkpoint in one place cannot leave
# the three out of sync.
roberta_classifier = RobertaForSequenceClassification.from_pretrained(
    model_id,
    config=roberta_classifier_config,
)

# Substitute the default classification head with a custom one.
# The head is built from its own small config, so its first layer is sized
# for that config's `hidden_size`; it is replaced below so that it accepts
# the encoder's output instead.
classification_head = RobertaClassificationHead(roberta_classification_head_config)
classification_head.dense = torch.nn.Linear(
    roberta_classifier.config.hidden_size,  # The `in_features` parameter must be equal to the encoder's hidden size.
    roberta_classification_head_config.hidden_size,  # `out_features`: the head's hidden width.
)

# The new head's weights are freshly initialised; they are learned during fine-tuning.
roberta_classifier.classifier = classification_head

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with ``roberta_tokenizer``.

    Sequences are truncated and padded with the ``"max_length"`` strategy,
    so every example in the result has the same length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw reviews.

    Returns:
        The tokenizer's output for the given texts.
    """
    texts = examples["text"]
    encoded = roberta_tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Apply the tokenizer to the whole dataset, one batch of rows at a time.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
)

# Show the resulting dataset as the cell output.
tokenized_datasets

In [None]:
# Inspect the field names of the first tokenized example.
tokenized_datasets[0].keys()

In [None]:
from transformers import TrainingArguments

# Training configuration: only `output_dir` is set here; every other
# option keeps its `TrainingArguments` default.
training_args = TrainingArguments(output_dir="test_trainer")

In [None]:
import numpy as np
import evaluate

# Load the "accuracy" metric via `evaluate`; `compute_metrics` calls its `.compute`.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

In [None]:
# Wire the model, the training arguments and the tokenized data into a `Trainer`.
trainer = transformers.Trainer(
    model=roberta_classifier,
    args=training_args,
    train_dataset=tokenized_datasets,
    # NOTE(review): no `eval_dataset` is passed. Only the `train[:100]` slice is loaded in this notebook, so there is no "test" split to index; `compute_metrics` presumably never runs until an evaluation set is supplied — confirm.
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Re-display the tokenized dataset (the same object that was passed to the trainer).
tokenized_datasets