# OwOLoRA

## Problem

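LoRA (Low-Rank Adaptation) fine-tuning has emerged as a popular technique for adapting large language models. Instead of training a model from scratch, or updating all of its weights, we take a pre-trained model, keep its original weights frozen, and train only a small set of added low-rank adapter matrices on a smaller, task-specific dataset.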

In [1]:
import torch
from datasets import load_dataset, load_metric
from transformers import AutoModelForSequenceClassification

from transformers import AutoModel, Trainer, TrainingArguments, AutoTokenizer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
# Hub identifiers of the checkpoint; the model and the tokenizer use the same one.
model_name_or_path = "roberta-base"
tokenizer_name_or_path = "roberta-base"

# Request the fast tokenizer implementation for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name_or_path,
    use_fast=True,
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Names of the GLUE tasks this notebook knows about.
GLUE_TASKS = [
    "cola", "mnli", "mnli-mm", "mrpc", "qnli",
    "qqp", "rte", "sst2", "stsb", "wnli",
]

# Task selected for this run.
task = "cola"

# "mnli-mm" is loaded under the "mnli" configuration; every other task is
# loaded under its own name.
if task == "mnli-mm":
    actual_task = "mnli"
else:
    actual_task = task

dataset = load_dataset("glue", actual_task)
metric = load_metric('glue', actual_task)


# Column names that hold the input sentence(s) for each task.
# Each value is a (first_column, second_column) pair; a second entry of None
# means the task has only one text column.
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

# Column names for the selected task (raises KeyError for an unknown task).
sentence1_key = task_to_keys[task][0]
sentence2_key = task_to_keys[task][1]

def preprocess_function(examples):
    """Tokenize the text column(s) of ``examples`` for the selected task.

    Reads the module-level ``tokenizer``, ``sentence1_key`` and
    ``sentence2_key``.

    Args:
        examples: Mapping indexed by column name (a batch of rows when used
            through ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for the selected column(s), with
        truncation enabled.
    """
    # Single-column tasks pass one positional argument, pair tasks pass two.
    text_columns = [examples[sentence1_key]]
    if sentence2_key is not None:
        text_columns.append(examples[sentence2_key])
    return tokenizer(*text_columns, truncation=True)

# Size of the classification head: 3 for the MNLI variants, 1 for STS-B,
# 2 for every other task.
if task.startswith("mnli"):
    num_labels = 3
elif task == "stsb":
    num_labels = 1
else:
    num_labels = 2

# Tokenize every split of the dataset, processing rows in batches.
encoded_dataset = dataset.map(preprocess_function, batched=True)

  metric = load_metric('glue', actual_task)
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Map: 100%|██████████| 8551/8551 [00:00<00:00, 61655.39 examples/s]
Map: 100%|██████████| 1043/1043 [00:00<00:00, 46969.65 examples/s]
Map: 100%|██████████| 1063/1063 [00:00<00:00, 6624.32 examples/s]


In [29]:
# Seed PyTorch's global RNG *before* the model is built. The classification
# head is not part of the roberta-base checkpoint and is therefore newly
# initialised, and get_peft_model adds freshly created LoRA adapter weights.
# Without a fixed seed, re-running this cell after a kernel restart starts
# training from different weights and the results are not reproducible.
# 42 matches the default `seed` of TrainingArguments.
SEED = 42
torch.manual_seed(SEED)

# Pre-trained encoder with a classification head sized for the selected task.
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=num_labels)

# LoRA configuration for sequence classification: rank-8 adapters with
# alpha=32 and 10% dropout on the adapter path, in training mode.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 887,042 || all params: 125,534,212 || trainable%: 0.7066


In [30]:
# Device label: CUDA when available, otherwise CPU.
# NOTE: Apple MPS is not considered by this selection.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Metric used to pick the best checkpoint: Pearson correlation for STS-B,
# Matthews correlation for CoLA, accuracy for every other task.
metric_name = {
    "stsb": "pearson",
    "cola": "matthews_correlation",
}.get(task, "accuracy")

batch_size = 16
# Last path component of the checkpoint id, used to name the output directory.
model_name = model_name_or_path.rsplit("/", 1)[-1]

# Evaluate and checkpoint once per epoch so that the best checkpoint
# (according to `metric_name`) can be restored when training ends.
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

import numpy as np
def compute_metrics(eval_pred):
    """Turn raw model outputs into predictions and score them.

    Reads the module-level ``task`` and ``metric``.

    Args:
        eval_pred: A pair ``(predictions, labels)``; ``predictions`` is
            indexed as a 2-D array.

    Returns:
        The result of ``metric.compute`` for the derived predictions and the
        given labels.
    """
    raw_outputs, references = eval_pred
    if task == "stsb":
        # Regression task: use the first output column as the score.
        predictions = raw_outputs[:, 0]
    else:
        # Classification task: take the index of the highest-scoring class.
        predictions = np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=predictions, references=references)

# Name of the evaluation split: the two MNLI variants have dedicated
# validation splits, every other task uses "validation".
validation_key = {
    "mnli-mm": "validation_mismatched",
    "mnli": "validation_matched",
}.get(task, "validation")

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    # The tokenizer is handed to the Trainer as well so that batches are
    # padded correctly.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run training, then write the resulting model to the output directory.
trainer.train()

trainer.save_model(f"{model_name}-finetuned-{task}")

  0%|          | 0/2675 [12:38<?, ?it/s]
