# Fine-tune an XLM RoBERTa model

In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_metric
import torch
import numpy as np
from sklearn.metrics import classification_report

from torch import cuda
from os.path import join

In [2]:
LR = 2e-5
EPOCHS = 1
BATCH_SIZE = 32
MODEL = "models/twitter-xlm-roberta-base"
MAX_TRAINING_EXAMPLES = 100
MAX_TEST_EXAMPLES = int(MAX_TRAINING_EXAMPLES/2)
MAX_VAL_EXAMPLES = int(MAX_TEST_EXAMPLES/2)

In [3]:
dataset_dict = {}
for i in ['train','val','test']:
    dataset_dict[i] = {}
    for j in ['text','labels']:
        dataset_dict[i][j] = open(f"../../data/traindata/{i}_{j}.txt")\
            .read().strip('\n').split('\n')
        if j == 'labels':
            dataset_dict[i][j] = [int(x) for x in dataset_dict[i][j]]

if MAX_TRAINING_EXAMPLES > 0:
    dataset_dict['train']['text'] = \
        dataset_dict['train']['text'][:MAX_TRAINING_EXAMPLES]
    dataset_dict['train']['labels'] = \
        dataset_dict['train']['labels'][:MAX_TRAINING_EXAMPLES]
    
if MAX_TEST_EXAMPLES > 0:
    dataset_dict['test']['text'] = \
        dataset_dict['test']['text'][:MAX_TEST_EXAMPLES]
    dataset_dict['test']['labels'] = \
        dataset_dict['test']['labels'][:MAX_TEST_EXAMPLES]

if MAX_VAL_EXAMPLES > 0:
    dataset_dict['val']['text'] = \
        dataset_dict['val']['text'][:MAX_VAL_EXAMPLES]
    dataset_dict['val']['labels'] = \
        dataset_dict['val']['labels'][:MAX_VAL_EXAMPLES]

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

In [5]:
train_encodings = tokenizer(dataset_dict['train']['text'], truncation=True, padding=True)
val_encodings = tokenizer(dataset_dict['val']['text'], truncation=True, padding=True)
test_encodings = tokenizer(dataset_dict['test']['text'], truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MyDataset(train_encodings, dataset_dict['train']['labels'])
val_dataset = MyDataset(val_encodings, dataset_dict['val']['labels'])
test_dataset = MyDataset(test_encodings, dataset_dict['test']['labels'])

In [15]:
training_args = TrainingArguments(
    output_dir='./results',                   # output directory
    num_train_epochs=EPOCHS,                  # total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # batch size for evaluation
    warmup_steps=100,                         # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                        # strength of weight decay
    logging_dir='./logs',                     # directory for storing logs
    logging_steps=10,                         # when to print log
    load_best_model_at_end=True,              # load or not best model at the end
    save_strategy='no',
    eval_accumulation_steps=1,
)

num_labels = len(set(dataset_dict["train"]["labels"]))
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file models/twitter-xlm-roberta-base/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "/home/jupyter/misc/tweeteval/TweetEval_models/xlm-twitter/twitter-xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past

In [16]:
metric = load_metric("f1")
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [17]:
trainer = Trainer(
        model=model,
        args=training_args,
        #device = device,
        train_dataset = train_dataset,
        eval_dataset = val_dataset,
        #data_collator = data_collator,
        #tokenizer = tokenizer,
        compute_metrics = compute_metrics
)
trainer.train()

***** Running training *****
  Num examples = 100
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4, training_loss=0.6951256990432739, metrics={'train_runtime': 29.1294, 'train_samples_per_second': 3.433, 'train_steps_per_second': 0.137, 'total_flos': 5447221068000.0, 'train_loss': 0.6951256990432739, 'epoch': 1.0})

In [18]:
trainer.save_model("./results/best_model") # save best model

Saving model checkpoint to ./results/best_model
Configuration saved in ./results/best_model/config.json
Model weights saved in ./results/best_model/pytorch_model.bin


In [17]:
test_preds_raw, test_labels , _ = trainer.predict(test_dataset)
test_preds = np.argmax(test_preds_raw, axis=-1)
print(classification_report(test_labels, test_preds, digits=3))

***** Running Prediction *****
  Num examples = 50
  Batch size = 32


              precision    recall  f1-score   support

           0      0.360     1.000     0.529        18
           1      0.000     0.000     0.000        32

    accuracy                          0.360        50
   macro avg      0.180     0.500     0.265        50
weighted avg      0.130     0.360     0.191        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
