In [None]:
import datasets
from datasets import load_dataset, load_metric

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, DataCollator, TrainingArguments, Trainer
from dataclasses import dataclass, field

import torch

import random
import pandas as pd
import numpy as np

In [None]:
# Load the 'amazon_reviews_multi' dataset; later cells read its "train" and
# "validation" splits.
dataset = load_dataset('amazon_reviews_multi')

In [None]:
# Preview the first few training examples. `datasets.Dataset` has no `.head()`
# method; slicing returns a dict of column -> list of values, which pandas
# renders as a table.
pd.DataFrame(dataset["train"][:5])

In [None]:
# Metric objects used to score predictions; `metric` (accuracy) is the one
# called from compute_metrics.
metric = load_metric('accuracy')
f1_metric = load_metric('f1')

In [None]:
# Optionally train/evaluate on a subset: one of 10 shards of the training split
# and one of 5 shards of the validation split, taken after a seeded shuffle.
# The shuffled copy gets its own name instead of rebinding `dataset`, so
# re-running this cell always produces the same shards (the cell is idempotent).
do_shard = True
if do_shard:
    shuffled_dataset = dataset.shuffle(seed=8855)
    train_dataset = shuffled_dataset["train"].shard(index=1, num_shards=10)
    val_dataset = shuffled_dataset["validation"].shard(index=1, num_shards=5)
else:
    train_dataset = dataset["train"]
    val_dataset = dataset["validation"]

In [None]:
# Load the tokenizer stored alongside the checkpoint in ./model.
# NOTE(review): this path is hard-coded, while the model-loading cell can fall
# back to 'xlm-roberta-base'; on a run where ./model does not exist yet this
# call would presumably fail -- confirm the intended tokenizer source.
model_checkpoint='./model'
tokenizer=AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
# When True, every example is padded to a fixed maximum length at tokenization
# time; when False, padding is left to the batch collator.
pad_to_max = False


def tokenize_data(example):
    """Tokenize a single review and attach its classification label.

    The model input is the review body, title and product category joined by
    single spaces.

    Args:
        example: mapping with the keys "review_body", "review_title",
            "product_category" (strings) and "stars" (integer rating).

    Returns:
        The tokenizer output ("input_ids" and "attention_mask") with an extra
        "labels" entry equal to ``stars - 1``.
    """
    text_ = " ".join(
        (example["review_body"], example["review_title"], example["product_category"])
    )
    encodings = tokenizer(
        text_,
        # `pad_to_max_length` is deprecated in transformers; `padding` is the
        # supported way to request (or disable) padding.
        padding="max_length" if pad_to_max else False,
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_overflowing_tokens=False,
        return_special_tokens_mask=False,
    )
    # Shift ratings down by one so class ids start at 0
    # (presumably stars are 1..5 -> labels 0..4; verify against the dataset).
    encodings["labels"] = example["stars"] - 1
    return encodings

In [None]:
# Apply tokenize_data to every example of both splits (map is called without
# batched=True, so the function receives one example at a time).
encoded_train_dataset = train_dataset.map(tokenize_data)
encoded_val_dataset = val_dataset.map(tokenize_data)

In [None]:
def pad_seq(seq, max_batch_len, pad_value):
    """Return a copy of the list `seq` right-padded with `pad_value` up to `max_batch_len`.

    If `seq` is already at least `max_batch_len` long, no padding is added and
    the contents are returned unchanged (never truncated).
    """
    missing = max_batch_len - len(seq)
    filler = [pad_value] * missing  # empty when `missing` <= 0
    return seq + filler

In [None]:
@dataclass
class SmartCollator:
    """Batch collator that pads each batch only up to its own longest example.

    Calling an instance with a list of tokenized examples (dicts holding
    'input_ids', 'attention_mask' and 'labels') returns a dict of
    ``torch.long`` tensors under the keys "input_ids", "attention_mask" and
    "labels".
    """
    # Token id used to fill the padded positions of 'input_ids'.
    pad_token_id: int

    def __call__(self, batch):
        # Length of the longest sequence in this batch = padding target.
        longest = max(len(example['input_ids']) for example in batch)

        padded_ids = [
            pad_seq(example['input_ids'], longest, self.pad_token_id)
            for example in batch
        ]
        # Padded positions get attention-mask value 0.
        padded_masks = [
            pad_seq(example['attention_mask'], longest, 0)
            for example in batch
        ]
        targets = [example['labels'] for example in batch]

        return {
            "input_ids": torch.tensor(padded_ids, dtype=torch.long),
            "attention_mask": torch.tensor(padded_masks, dtype=torch.long),
            "labels": torch.tensor(targets, dtype=torch.long),
        }

In [None]:
# Number of target classes and the per-device batch size used for training/eval.
NUM_LABELS = 5
BATCH_SIZE = 8

# Continue from the weights saved in ./model, or start from the pretrained
# 'xlm-roberta-base' checkpoint when resume_training is False.
resume_training = True
model_checkpoint = './model' if resume_training else 'xlm-roberta-base'
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
)

In [None]:
# Metric (a key returned by compute_metrics) used to pick the best checkpoint.
METRIC_NAME = 'accuracy'

# Training configuration: evaluate and save every 5000 steps so that the best
# checkpoint (by METRIC_NAME) can be restored at the end of training.
args = TrainingArguments(
    output_dir='./model',
    seed=8855,
    evaluation_strategy='steps',
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=METRIC_NAME,
    eval_steps=5000,
    save_steps=5000,
    # Mixed precision is only supported on CUDA devices; enabling it
    # unconditionally makes this cell fail on a CPU-only machine.
    fp16=torch.cuda.is_available(),
)

In [None]:
def compute_metrics(eval_pred):
    """Score the model's predictions for the Trainer.

    Args:
        eval_pred: pair ``(predictions, labels)``; `predictions` holds the
            per-class scores with classes along axis 1.

    Returns:
        dict with the "accuracy" of the arg-max predictions and their
        macro-averaged "f1".
    """
    scores, labels = eval_pred
    predictions = np.argmax(scores, axis=1)

    # Metric.compute only accepts keyword arguments; passing the arrays
    # positionally raises instead of computing the score.
    results = metric.compute(predictions=predictions, references=labels)
    # Multi-class task: the default binary averaging of F1 does not apply,
    # so average the per-class F1 scores.
    results.update(
        f1_metric.compute(predictions=predictions, references=labels, average="macro")
    )
    return results

In [None]:
validation_key = "validation"

# Wire the model, training configuration, encoded splits, per-batch padding
# collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=SmartCollator(pad_token_id=tokenizer.pad_token_id),
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration in `args`.
# NOTE(review): called without resume_from_checkpoint, so when resuming only
# the weights loaded via from_pretrained carry over -- confirm this is intended.
trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer; the returned metrics are
# displayed as the cell output.
trainer.evaluate()