# Training RoBERTa with LoRA: Train with online text augmentations
Lai Ye

In [1]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

  from .autonotebook import tqdm as notebook_tqdm





# Define dataset
### We will define a data collator to augment & tokenize training samples later. Do not tokenize them now

In [2]:
base_model = 'roberta-base'

dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

# tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
# tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

### Tokenize evaluation set ONLY, since the training set will be augmented online in training

In [3]:
split_datasets = dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']
eval_dataset = eval_dataset.map(preprocess, batched=True, remove_columns=["text"])

Map: 100%|██████████| 640/640 [00:00<00:00, 2168.91 examples/s]


In [4]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

id2label = {i: label for i, label in enumerate(class_names)}

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


### Define data collator with augmentations
Require package: nlpaug

In [5]:
import nlpaug.augmenter.word as naw
import nlpaug.flow as nafc

class OnlineClassificationCollator:
    def __init__(self, tokenizer, max_length=363, augment=True):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment

        # define random crop augmentation: 50% chance happen
        self.crop_augmenter = nafc.Sometimes([
            naw.RandomWordAug(action='crop', aug_p=0.5, aug_max=None, aug_min=None)
        ],aug_p=0.5)

        # define a compositional augmentation pipeline: Synonym, delete, swap: 50% chance happen
        self.augmenter = nafc.Sometimes([
            naw.SynonymAug(aug_src='wordnet', aug_p=0.3, aug_max=None, aug_min=None),
            naw.RandomWordAug(action='delete', aug_p=0.05, aug_max=None, aug_min=None),
            naw.RandomWordAug(action='swap', aug_p=0.1, aug_max=None, aug_min=None),
        ],aug_p=0.5)

    def augment_text(self, text):
        aug = self.crop_augmenter.augment(text)
        if isinstance(aug, list):
            aug = aug[0]
        if not isinstance(aug, str) or not aug.strip():
            aug = text  # fallback to original if crop is empty
        aug = self.augmenter.augment(aug)
        if isinstance(aug, list):
            aug = aug[0]
        if not isinstance(aug, str) or not aug.strip():
            aug = text  
        return aug

    def __call__(self, features):
        if self.augment:
            texts = [self.augment_text(f['text']) for f in features]
        else:
            texts = [f['text'] for f in features]

        labels = [f['label'] for f in features]

        tokenized = self.tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        tokenized['labels'] = torch.tensor(labels, dtype=torch.long)
        return tokenized

train_collator = OnlineClassificationCollator(tokenizer, max_length=363, augment=True)
eval_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

Test crop augmentor

In [7]:
naw.RandomWordAug('crop',aug_p=0.5).augment("1 2 3 4 5 6 7 8 9 10")

['1 2 8 9 10']

In [26]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label) 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### My experiments show that r=2 with only query fine-tuned works the best

In [12]:
# PEFT Config
peft_config = LoraConfig(
    r=2,
    lora_alpha=4,
    lora_dropout=0.05,
    bias = 'none',
    target_modules = ['query'],
    task_type="SEQ_CLS",
)

In [13]:
peft_model = get_peft_model(model, peft_config)
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): Mo

In [14]:
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 630,532 || all params: 125,279,240 || trainable%: 0.5033


## Accuracy metrics

In [15]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

## Training configurations
1. set remove_unused_columns=False, since the train set (augmented) and eval set have different keys
2. set dataloader_num_workers=0 to prevent unknown parallel processing crash

In [16]:

# Setup Training args
output_dir = "results"
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to=None,
    eval_strategy='steps',
    eval_steps=1000,
    save_strategy='steps',
    save_steps=1000,
    logging_steps=1000,
    learning_rate=1e-4,
    num_train_epochs=5,
    use_cpu=False,
    dataloader_num_workers=0,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    optim="adamw_torch",
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={'use_reentrant':True},
    load_best_model_at_end=True,
    remove_unused_columns = False
)

## Trainer definition
### Override the trainer since HuggingFace doesn't allow different collators for train and eval by default

In [17]:
from transformers import EarlyStoppingCallback
from transformers import Trainer

class TrainerWithSeparateCollators(Trainer):
    def __init__(self, *args, eval_data_collator=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_data_collator = eval_data_collator

    def get_eval_dataloader(self, eval_dataset=None):
        # Temporarily patch the collator
        original_collator = self.data_collator
        if self.eval_data_collator is not None:
            self.data_collator = self.eval_data_collator

        dataloader = super().get_eval_dataloader(eval_dataset)

        # Restore original collator
        self.data_collator = original_collator
        return dataloader

def get_trainer(model):
    return TrainerWithSeparateCollators(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=train_collator,
        eval_data_collator=eval_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)], # early stop
    )


### Start Training

In [18]:
peft_lora_finetuning_trainer = get_trainer(peft_model)

result = peft_lora_finetuning_trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mly2414[0m ([33mly2414-new-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
1000,0.667,0.310587,0.901563
2000,0.4699,0.304776,0.904687
3000,0.4438,0.303021,0.901563
4000,0.4212,0.296618,0.907813
5000,0.4254,0.293155,0.903125
6000,0.4186,0.290763,0.90625
7000,0.4107,0.29605,0.9125
8000,0.4064,0.286041,0.907813
9000,0.3962,0.287856,0.9125
10000,0.3897,0.286347,0.909375


In [25]:
result

TrainOutput(global_step=37300, training_loss=0.39356153800084825, metrics={'train_runtime': 7542.3556, 'train_samples_per_second': 79.126, 'train_steps_per_second': 4.945, 'total_flos': 1.1214962687808e+17, 'train_loss': 0.39356153800084825, 'epoch': 5.0})

## Evaluate Finetuned Model


### Run Inference on eval_dataset

In [19]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

We'll treat the evaluation dataset directly as the local test set. Overall, we have the Kaggle scoring test set for the ultimate test score. 

In [20]:
# Check evaluation accuracy
tempacc, _ = evaluate_model(peft_model, eval_dataset, True, 8, eval_collator)

100%|██████████| 80/80 [00:02<00:00, 36.23it/s]

Evaluation Metric: {'accuracy': 0.9125}





### Run Inference on unlabelled dataset

In [None]:
# restore trained PEFT model

# from peft import PeftConfig, PeftModel
# peft_model_dir = "results/checkpoint-37300"
# peft_cfg_load = PeftConfig.from_pretrained(peft_model_dir)
# peft_model = PeftModel.from_pretrained(model, peft_model_dir)
# peft_model.to('cuda')
# peft_model.eval();

In [22]:
#Load your unlabelled data'
import pandas as pd
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map: 100%|██████████| 8000/8000 [00:02<00:00, 2768.53 examples/s]


Dataset({
    features: ['text'],
    num_rows: 8000
})

In [28]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, eval_collator)
output_dir = 'results'
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|██████████| 1000/1000 [00:27<00:00, 36.99it/s]

Inference complete. Predictions saved to inference_output.csv





(Optional) compare with old predictions for unlabelled dataset

In [None]:
# import numpy as np
# import pandas as pd
# p1 = pd.read_csv("results/inference_output.csv")
# p1 = p1['Label'].to_numpy()
# p2 = pd.read_csv("results/inference_output.csv")
# p2 = p2['Label'].to_numpy()
# np.equal(p1, p2).sum()