# Fine-tuning RoBERTa with LoRA

# Dependencies


In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [2]:
# NOTE(review): the wildcard import presumably also brings in `datasets.Dataset`,
# which the later `from torch.utils.data import DataLoader, Dataset` rebinds.
# Only `load_dataset` appears to be used from `datasets` in the cells shown —
# consider importing it explicitly.
from datasets import *
from transformers import RobertaModel, RobertaTokenizer

import torch
import torch.nn.functional as F

In [3]:
from torch.utils.data import DataLoader, Dataset

# Data

The models will be fine-tuned on the [yelp_polarity](https://huggingface.co/datasets/yelp_polarity) dataset.

In [4]:
# yelp_polarity ships with a loading script. Passing trust_remote_code=True
# accepts it up front, so the cell does not stop at the interactive "[y/N]"
# prompt when the notebook is run non-interactively (Restart & Run All).
# Only enable this for dataset repositories whose code you have inspected.
ds = load_dataset('yelp_polarity', trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.35k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

The repository for yelp_polarity contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/yelp_polarity.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/166M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/38000 [00:00<?, ? examples/s]

In [5]:
# Show the available splits with their columns and row counts.
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 560000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 38000
    })
})


In [6]:
# Inspect the first training example: the review text, then its label.
first_example = ds['train'][0]
print(first_example['text'])
print(first_example['label'])

Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.
0


To save on time and memory, create a smaller subset of the full dataset.

In [7]:
# Number of examples kept from the train and test splits respectively.
TRAIN_SUBSET_SIZE = 30000
TEST_SUBSET_SIZE = 5000

In [9]:
# Shuffle each split with a fixed seed, then keep its first N rows.
train_dataset = ds['train'].shuffle(seed=31).select(range(TRAIN_SUBSET_SIZE))
test_dataset = ds['test'].shuffle(seed=31).select(range(TEST_SUBSET_SIZE))

### Tokenize

In [10]:
# Load the tokenizer that belongs to the roberta-base checkpoint.
# NOTE(review): `truncation` is normally an argument of the tokenizer call (it
# is passed again in tokenize_text), and RoBERTa's tokenizer is case-sensitive,
# so `truncation=True` / `do_lower_case=True` are presumably ignored here —
# confirm and drop them if so.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base", truncation=True, do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [11]:
# Report the tokenizer's maximum input length and vocabulary size.
print("Tokenizer max input length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

Tokenizer max input length: 512
Tokenizer vocabulary size: 50265


In [12]:
# Maximum number of tokens per review; passed as max_length to the tokenizer.
MAX_LENGTH = 288

In [13]:
def tokenize_text(batch):
  """Tokenize the "text" entry of a batch of examples.

  Every sequence is truncated and padded to exactly MAX_LENGTH tokens.
  ``padding=True`` would pad only to the longest sequence of the batch that is
  passed in, so separate ``map`` batches could end up with different lengths
  and could then not be stacked by the DataLoader's default collation.

  Args:
    batch: Mapping whose "text" entry holds the strings to tokenize.

  Returns:
    The tokenizer output (input_ids, attention_mask and token_type_ids).
  """
  return tokenizer(batch["text"],
                   padding="max_length",
                   truncation=True,
                   return_token_type_ids=True,
                   max_length=MAX_LENGTH)

In [14]:
# Tokenize both subsets; batched=True hands tokenize_text many rows per call.
tokenized_train_dataset = train_dataset.map(tokenize_text, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_text, batched=True)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [15]:
# Check which columns the training subset has after tokenization.
print(tokenized_train_dataset)

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 30000
})


In [16]:
# Check which columns the test subset has after tokenization.
print(tokenized_test_dataset)

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5000
})


In [17]:
# Return only these columns, as torch tensors, when rows are accessed.
columns=["label", "input_ids", "attention_mask", "token_type_ids"]

tokenized_train_dataset.set_format("torch", columns=columns)
tokenized_test_dataset.set_format("torch", columns=columns)

### DataSet Class

In [18]:
# Number of examples per DataLoader batch.
BATCH_SIZE = 16

In [19]:
class MyDataset(Dataset):
  """Map-style torch Dataset wrapping an indexable collection of examples.

  Args:
    dataset: Any object supporting ``dataset[index]`` and ``len(dataset)``.
    partition_key: Optional label for the split (e.g. "train"). It is only
      stored for reference and does not affect indexing.
  """

  def __init__(self, dataset, partition_key=None):
    self.dataset = dataset
    # Previously accepted but discarded; kept so the split can be identified.
    self.partition_key = partition_key

  def __getitem__(self, index):
    """Return the example stored at ``index`` in the wrapped dataset."""
    return self.dataset[index]

  def __len__(self):
    """Return the number of examples in the wrapped dataset."""
    # len() rather than .num_rows, so plain sequences work as well.
    return len(self.dataset)

In [20]:
# Wrap the tokenized subsets so they can be handed to a DataLoader.
train_data = MyDataset(tokenized_train_dataset, partition_key="train")
test_data = MyDataset(tokenized_test_dataset, partition_key="test")

### Set up DataLoaders

In [21]:
# Only the training loader shuffles; the test loader keeps the dataset order.
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE)

# RoBERTa base model
First, I'll use the [RoBERTa](https://huggingface.co/FacebookAI/roberta-base) pretrained base model, and add some classification layers on top of it.

In [22]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Custom class with RoBERTa model and fully connected layers for classification.

In [23]:
class RobertaWithClassification(torch.nn.Module):
  """RoBERTa encoder followed by a small feed-forward classification head.

  Args:
    model_name: Checkpoint passed to ``RobertaModel.from_pretrained``.
    num_labels: Number of classes, i.e. the width of the returned scores.
    dropout: Dropout probability used inside the head.
  """

  def __init__(self, model_name="roberta-base", num_labels=2, dropout=0.3):
    super().__init__()
    self.roberta = RobertaModel.from_pretrained(model_name)
    # Take the width from the checkpoint's config instead of hard-coding 768,
    # so other RoBERTa checkpoints can be used as well.
    hidden_size = self.roberta.config.hidden_size
    self.linear = torch.nn.Linear(hidden_size, hidden_size)
    self.activation = torch.nn.ReLU()
    self.dropout = torch.nn.Dropout(dropout)
    self.classifier = torch.nn.Linear(hidden_size, num_labels)

  # output of the roberta model:
  # https://huggingface.co/transformers/v3.2.0/main_classes/output.html#basemodeloutputwithpooling
  def forward(self, input_ids, attention_mask, token_type_ids):
    """Return unnormalised class scores, one row per input sequence.

    The three arguments are forwarded unchanged to the RoBERTa encoder.
    """
    output_with_pooling = self.roberta(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    # First element of the encoder output (the hidden states per the link above).
    hidden_state = output_with_pooling[0]
    # Keep only position 0 of every sequence as the sequence representation.
    pooler = hidden_state[:,0]
    pooler = self.linear(pooler)
    pooler = self.activation(pooler)
    pooler = self.dropout(pooler)
    output = self.classifier(pooler)
    return output

In [24]:
# Build the classifier; the encoder weights come from the roberta-base checkpoint.
model = RobertaWithClassification()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Move the model to the selected device (the cell also displays its structure).
model.to(device)

RobertaWithClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Layer

### Total trainable parameters

In [26]:
def count_parameters(model):
  """Return the total number of elements across the model's trainable parameters."""
  total = 0
  for param in model.parameters():
    # Parameters with requires_grad=False (frozen) are not counted.
    if param.requires_grad:
      total += param.numel()
  return total

In [27]:
# Trainable parameter count of the fully fine-tuned model, kept for the later
# comparison with the LoRA model.
base_param_count = count_parameters(model)
print(base_param_count)

125237762


### Fine-tuning

In [28]:
import time

In [29]:
# Learning rate for the optimizer and number of passes over the training set.
lr = 1e-5
EPOCHS = 3

In [30]:
def get_accuracy(y_pred, targets):
  """Return the fraction of rows whose highest-scoring class equals the target.

  Args:
    y_pred: Tensor of class scores with the classes along dim 1.
    targets: Tensor of class indices, one per row of ``y_pred``.

  Returns:
    A 0-dim tensor holding the accuracy (callers read it with ``.item()``).
  """
  # log_softmax only shifts each row by a constant, so it does not change which
  # class has the largest score; argmax on the raw scores gives the same result.
  predictions = y_pred.argmax(dim=1)
  accuracy = (predictions == targets).sum() / len(targets)
  return accuracy

In [31]:
# Adam over every parameter of the model. `loss_function` is read as a
# notebook-level name by train() and evaluate().
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
loss_function = torch.nn.CrossEntropyLoss()

In [32]:
def _format_elapsed(total_seconds):
  """Format a duration given in seconds as HH:MM:SS.ss."""
  hours, remainder = divmod(total_seconds, 3600)
  minutes, seconds = divmod(remainder, 60)
  return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)


def train(model, train_loader, epochs, optimizer):
  """Train ``model`` for ``epochs`` passes over ``train_loader``.

  Prints the loss/accuracy of the current batch about five times per epoch,
  the mean loss/accuracy and the wall-clock time of every epoch, and finally
  the average time per epoch. Relies on the notebook-level ``device``,
  ``loss_function`` and ``get_accuracy``.

  Args:
    model: Module called as
      ``model(input_ids, attention_mask=..., token_type_ids=...)``.
    train_loader: Iterable of batches with "input_ids", "attention_mask",
      "token_type_ids" and "label" entries; must support ``len()``.
    epochs: Number of passes over ``train_loader``.
    optimizer: Optimizer that updates the model's parameters.

  Returns:
    None.
  """
  num_batches = len(train_loader)
  if epochs <= 0 or num_batches == 0:
    # The per-epoch and per-run averages below would divide by zero.
    print("Nothing to train on (epochs=%s, batches=%s)." % (epochs, num_batches))
    return

  # Report about five times per epoch. Clamped to 1 because with fewer than
  # five batches the integer division yields 0 and `% 0` raises.
  interval = max(1, num_batches // 5)
  total_time = 0

  for epoch in range(epochs):
    total_train_loss = 0
    total_train_acc = 0

    start = time.time()

    model.train()
    for batch_idx, batch in enumerate(train_loader):
      optimizer.zero_grad()

      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      token_type_ids = batch["token_type_ids"].to(device)
      labels = batch["label"].to(device)

      outputs = model(input_ids,
                      attention_mask=attention_mask,
                      token_type_ids=token_type_ids)

      loss = loss_function(outputs, labels)
      acc = get_accuracy(outputs, labels)

      total_train_loss += loss.item()
      total_train_acc += acc.item()

      loss.backward()
      optimizer.step()

      if (batch_idx + 1) % interval == 0:
        print("Batch: %s/%s | Training loss: %.4f | accuracy: %.4f" % (batch_idx+1, num_batches, loss, acc))

    # Mean of the per-batch values over the epoch.
    train_loss = total_train_loss / num_batches
    train_acc = total_train_acc / num_batches

    epoch_time = time.time() - start

    print(f"Epoch: {epoch+1} train loss: {train_loss:.4f} train acc: {train_acc:.4f}")
    print("Epoch time elapsed: " + _format_elapsed(epoch_time))
    print("")

    total_time += epoch_time

  # Get the average time per epoch
  print("Average time per epoch: " + _format_elapsed(total_time / epochs))

In [33]:
# Full fine-tuning run: every parameter of the model is trainable.
train(model, train_loader, EPOCHS, optimizer)

Batch: 375/1875 | Training loss: 0.1650 | accuracy: 0.9375
Batch: 750/1875 | Training loss: 0.0386 | accuracy: 1.0000
Batch: 1125/1875 | Training loss: 0.1929 | accuracy: 0.9375
Batch: 1500/1875 | Training loss: 0.0474 | accuracy: 1.0000
Batch: 1875/1875 | Training loss: 0.0161 | accuracy: 1.0000
Epoch: 1 train loss: 0.1403 train acc: 0.9442
Epoch time elapsed: 00:23:33.51

Batch: 375/1875 | Training loss: 0.1753 | accuracy: 0.8750
Batch: 750/1875 | Training loss: 0.1083 | accuracy: 1.0000
Batch: 1125/1875 | Training loss: 0.0148 | accuracy: 1.0000
Batch: 1500/1875 | Training loss: 0.0712 | accuracy: 0.9375
Batch: 1875/1875 | Training loss: 0.0123 | accuracy: 1.0000
Epoch: 2 train loss: 0.0767 train acc: 0.9726
Epoch time elapsed: 00:23:37.15

Batch: 375/1875 | Training loss: 0.0013 | accuracy: 1.0000
Batch: 750/1875 | Training loss: 0.0634 | accuracy: 0.9375
Batch: 1125/1875 | Training loss: 0.0384 | accuracy: 1.0000
Batch: 1500/1875 | Training loss: 0.0302 | accuracy: 1.0000
Batch: 1

### Evaluation

In [34]:
def evaluate(model, test_loader):
  """Print the loss and accuracy of ``model`` on ``test_loader``.

  Runs in eval mode without gradient tracking. Prints the loss/accuracy of the
  current batch about five times during the pass, then the mean of the
  per-batch loss and accuracy. Relies on the notebook-level ``device``,
  ``loss_function`` and ``get_accuracy``.

  Args:
    model: Module called as
      ``model(input_ids, attention_mask=..., token_type_ids=...)``.
    test_loader: Iterable of batches with "input_ids", "attention_mask",
      "token_type_ids" and "label" entries; must support ``len()``.

  Returns:
    None.
  """
  num_batches = len(test_loader)
  if num_batches == 0:
    # The averages below would divide by zero.
    print("Test loader is empty; nothing to evaluate.")
    return

  # Clamped to 1: with fewer than five batches the integer division yields 0
  # and the modulo below would raise ZeroDivisionError.
  interval = max(1, num_batches // 5)

  total_test_loss = 0
  total_test_acc = 0

  model.eval()
  with torch.no_grad():
    for batch_idx, batch in enumerate(test_loader):
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      token_type_ids = batch["token_type_ids"].to(device)
      labels = batch["label"].to(device)

      outputs = model(input_ids,
                      attention_mask=attention_mask,
                      token_type_ids=token_type_ids)
      loss = loss_function(outputs, labels)
      acc = get_accuracy(outputs, labels)

      total_test_loss += loss.item()
      total_test_acc += acc.item()

      if (batch_idx + 1) % interval == 0:
        print("Batch: %s/%s | Test loss: %.4f | accuracy: %.4f" % (batch_idx+1, num_batches, loss, acc))

  # Mean of the per-batch values.
  test_loss = total_test_loss / num_batches
  test_acc = total_test_acc / num_batches

  print(f"Test loss: {test_loss:.4f} acc: {test_acc:.4f}")
  print("")

In [35]:
# Report loss and accuracy of the fully fine-tuned model on the test subset.
evaluate(model, test_loader)

Batch: 62/313 | Test loss: 0.0025 | accuracy: 1.0000
Batch: 124/313 | Test loss: 0.2105 | accuracy: 0.9375
Batch: 186/313 | Test loss: 0.0112 | accuracy: 1.0000
Batch: 248/313 | Test loss: 0.3033 | accuracy: 0.9375
Batch: 310/313 | Test loss: 0.4428 | accuracy: 0.9375
Test loss: 0.1166 acc: 0.9631



# Fine-tuning RoBERTa with LoRA Layers

### LoRA Layer

In [36]:
import math

In [39]:
class LoRALayer(torch.nn.Module):
  """Low-rank branch computing ``scaling * (x @ A^T @ B^T)``, scaling = alpha / r.

  ``A`` has shape (r, in_dim) and ``B`` has shape (out_dim, r). ``B`` starts at
  zero, so a freshly constructed layer outputs zeros.
  """

  def __init__(self, in_dim, out_dim, r, alpha):
    super().__init__()
    self.r, self.alpha = r, alpha
    self.scaling = self.alpha / self.r

    # A: Kaiming-uniform initialisation.
    down_weight = torch.empty(r, in_dim)
    torch.nn.init.kaiming_uniform_(down_weight, a=math.sqrt(5))
    self.A = torch.nn.Parameter(down_weight)

    # B: all zeros.
    self.B = torch.nn.Parameter(torch.zeros(out_dim, r))

  def forward(self, x):
    """Project ``x`` through A then B and multiply the result by the scaling."""
    low_rank = x @ self.A.t()
    projected = low_rank @ self.B.t()
    return self.scaling * projected

In [40]:
class LinearWithLoRA(torch.nn.Module):
  """Wraps an existing linear layer and adds a LoRA branch fed the same input."""

  def __init__(self, linear, r, alpha):
    super().__init__()
    self.linear = linear
    # The LoRA branch mirrors the wrapped layer's input/output widths.
    in_dim, out_dim = linear.in_features, linear.out_features
    self.lora = LoRALayer(in_dim, out_dim, r, alpha)

  def forward(self, x):
    """Return the wrapped layer's output plus the LoRA branch's output."""
    base_out = self.linear(x)
    lora_out = self.lora(x)
    return base_out + lora_out

In [41]:
# Fresh instance of the same architecture, to be adapted with LoRA.
lora_model = RobertaWithClassification()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freeze the model parameters

In [42]:
# Freeze every parameter currently in the model. This covers the encoder and
# the newly created classification head (linear, classifier), so only the LoRA
# matrices added afterwards will be trainable.
# NOTE(review): the head therefore stays at its random initialisation —
# confirm this is intended, or re-enable requires_grad for the head.
for param in lora_model.parameters():
  param.requires_grad = False

Add LoRA to the Query and Value in the Attention layers

In [43]:
from functools import partial

In [44]:
# Rank of the LoRA matrices, and the alpha used for the alpha / r scaling (= 2).
lora_r = 16
lora_alpha = lora_r * 2

# Pre-bind r and alpha so a linear layer can be wrapped with a single argument.
assign_lora = partial(LinearWithLoRA, r=lora_r, alpha=lora_alpha)

In [45]:
# Print the architecture before the LoRA wrappers are inserted.
print(lora_model)

RobertaWithClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Layer

In [46]:
# Wrap the query and value projections of every encoder layer's self-attention.
for encoder_layer in lora_model.roberta.encoder.layer:
  self_attention = encoder_layer.attention.self
  self_attention.query = assign_lora(self_attention.query)
  self_attention.value = assign_lora(self_attention.value)

Total trainable parameters with LoRA layers

In [47]:
# Compare the trainable parameter counts; the ratio is an integer (floor) division.
lora_param_count = count_parameters(lora_model)
print("Model with LoRA param count:", lora_param_count)
print("Base model param count:", base_param_count)
print(str(base_param_count // lora_param_count) + " times smaller than base model")

Model with LoRA param count: 589824
Base model param count: 125237762
212 times smaller than base model


Compared to full fine-tuning of the base model, the model with the LoRA layers has far fewer trainable parameters: 590K vs 125M.

In [48]:
# Move the LoRA model to the selected device (the cell also displays its structure).
lora_model.to(device)

RobertaWithClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=768, bias=True)
                (lora): LoRALayer()
              )
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=768, bias=True)
                (lora): LoRALayer()
              )
              (dro

### Fine-tuning

In [49]:
# Same learning rate and epoch count as the full fine-tuning run.
lr = 1e-5
EPOCHS = 3

In [50]:
# All parameters are handed to Adam; the frozen ones (requires_grad=False)
# receive no gradients. The loss is re-created here with the same settings as
# for the base model.
optimizer_lora = torch.optim.Adam(params=lora_model.parameters(), lr=lr)
loss_function = torch.nn.CrossEntropyLoss()

In [51]:
# Train the LoRA model with the same loop and data as the base model.
train(lora_model, train_loader, EPOCHS, optimizer_lora)

Batch: 375/1875 | Training loss: 0.6814 | accuracy: 0.5625
Batch: 750/1875 | Training loss: 0.6363 | accuracy: 0.6875
Batch: 1125/1875 | Training loss: 0.5035 | accuracy: 0.8125
Batch: 1500/1875 | Training loss: 0.2084 | accuracy: 0.9375
Batch: 1875/1875 | Training loss: 0.1081 | accuracy: 0.9375
Epoch: 1 train loss: 0.4040 train acc: 0.7788
Epoch time elapsed: 00:17:01.51

Batch: 375/1875 | Training loss: 0.2018 | accuracy: 0.9375
Batch: 750/1875 | Training loss: 0.1311 | accuracy: 1.0000
Batch: 1125/1875 | Training loss: 0.2974 | accuracy: 0.9375
Batch: 1500/1875 | Training loss: 0.0531 | accuracy: 1.0000
Batch: 1875/1875 | Training loss: 0.0752 | accuracy: 1.0000
Epoch: 2 train loss: 0.1509 train acc: 0.9506
Epoch time elapsed: 00:17:00.96

Batch: 375/1875 | Training loss: 0.1188 | accuracy: 0.9375
Batch: 750/1875 | Training loss: 0.1741 | accuracy: 0.9375
Batch: 1125/1875 | Training loss: 0.3269 | accuracy: 0.8750
Batch: 1500/1875 | Training loss: 0.1344 | accuracy: 0.9375
Batch: 1

### Evaluation

In [52]:
# Report loss and accuracy of the LoRA model on the same test subset.
evaluate(lora_model, test_loader)

Batch: 62/313 | Test loss: 0.0220 | accuracy: 1.0000
Batch: 124/313 | Test loss: 0.2065 | accuracy: 0.9375
Batch: 186/313 | Test loss: 0.0988 | accuracy: 0.9375
Batch: 248/313 | Test loss: 0.0507 | accuracy: 1.0000
Batch: 310/313 | Test loss: 0.2075 | accuracy: 0.8750
Test loss: 0.1288 acc: 0.9551



# Results Comparisons


Base model param count: 125237762

Model with LoRA param count: 589824, 212 times smaller than base model

**Average time per epoch**
- Without LoRA: 23 minutes 36 seconds
- With LoRA: 17 minutes 1 second

**Test set Accuracy**
- Without LoRA: 96.31%
- With LoRA: 95.51%

While training about 212x fewer parameters than full fine-tuning of the base model, the LoRA model takes about 28% less time per epoch during fine-tuning (17:01 vs 23:36), and has a test set accuracy of 95.51% compared to 96.31% for the base model, after 3 epochs of training.