# Efficiently train LLM with LoRA



In [None]:
# install Hugging Face Libraries
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q datasets accelerate evaluate bitsandbytes loralib peft --upgrade
# install additional dependencies needed for training
!pip install -q rouge-score tensorboard py7zr

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
# Later cells push to the Hub (push_to_hub=True in the training arguments and
# model.push_to_hub(...)), so the login has to happen first.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load and prepare the dataset

We're using the [samsum](https://huggingface.co/datasets/samsum) dataset, a collection of about 16k messenger-like conversations with summaries.

```Python
{
  "id": "13818513",
  "summary": "Amanda baked cookies and will bring Jerry some tomorrow.",
  "dialogue": "Amanda: I baked cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)"
}
```
We're using [flan-t5-small](https://huggingface.co/google/flan-t5-small) as the base model to which PEFT + LoRA is applied.

In [None]:
# Hugging Face Hub id of the base model that is fine-tuned
model_id = "google/flan-t5-small"
# Trainer output directory (its "logs" sub-directory receives the TensorBoard logs)
output_dir="lora-flan-t5-small"
# Local directory where the trained LoRA adapter and the tokenizer are saved
peft_model_id = "results"
# Hub user name; combined with peft_model_id to build the repo id when pushing
user_id = "pritam3355"

In [None]:
from datasets import load_dataset

# Fetch all splits of the samsum corpus from the hub
dataset = load_dataset("samsum")

# Report how many examples the train and test splits contain
for split_label, split_key in (("Train", "train"), ("Test", "test")):
    print(f"{split_label} dataset size: {len(dataset[split_key])}")



Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Train dataset size: 14732
Test dataset size: 819


## Load model & tokenizer

Here we load the model in 8-bit precision and tell Accelerate to automatically determine where to place each layer of the model, depending on the available resources:
```Python
load_in_8bit=True
device_map="auto"

```


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from transformers import BitsAndBytesConfig

# Load the base model with 8-bit weights and let Accelerate place the layers.
# The 8-bit request goes through `quantization_config`: passing `load_in_8bit=True`
# straight to `from_pretrained` is deprecated, and this notebook installs
# transformers from the main branch.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
# Tokenizer that belongs to the same base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

# Freezing the original weights

In [None]:
import torch
import torch.nn as nn

# Freeze every parameter of the base model; only the adapter weights added
# in a later cell are meant to be trained.
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
# NOTE(review): presumably required so gradients can flow through the frozen input
# embeddings while gradient checkpointing is active — confirm against the transformers docs.
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """Sequential container whose output is always converted to ``torch.float32``."""

  def forward(self, x):
    """Run the wrapped modules on ``x`` and return the result cast to float32."""
    result = super().forward(x)
    return result.to(torch.float32)
# Wrap the LM head so that whatever it outputs is returned as float32.
model.lm_head = CastOutputToFloat(model.lm_head)

# Setting up the LoRA Adapters

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    The trainable share is reported as ``0.0`` for a model without any
    parameters instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: a parameter-free model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

try:
    # Newer PEFT releases replace `prepare_model_for_int8_training` with
    # `prepare_model_for_kbit_training`. The install cell upgrades PEFT, so
    # prefer the new helper and alias it to the name used below.
    from peft import prepare_model_for_kbit_training as prepare_model_for_int8_training
except ImportError:
    # Older PEFT releases only ship the int8-specific helper.
    from peft import prepare_model_for_int8_training

config = LoraConfig(
    r=16, # rank of the LoRA update matrices; reduce this to reduce trainable parameters
    lora_alpha=32, #alpha scaling
    # target_modules=["q_proj", "v_proj"], # for LLaMa style models
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # set this for CAUSAL_LM or SEQ_2_SEQ_LM
)

# prepare int-8 model for training - will consume CUDA
model = prepare_model_for_int8_training(model)
# add LoRA adaptor
model = get_peft_model(model, config)
# report how many parameters remain trainable once the adapters are attached
print_trainable_parameters(model)

# Prepare dataset for dialogue summarization

Here, longer sequences are truncated and shorter sequences are padded.

The maximum source length and maximum target length are calculated after tokenization, based on the length of `input_ids`.

In [None]:
from datasets import concatenate_datasets
import numpy as np

# Length statistics are gathered over the train and test splits together.
train_and_test = concatenate_datasets([dataset["train"], dataset["test"]])

# --- source side: tokenized dialogue lengths ---
tokenized_inputs = train_and_test.map(
    lambda batch: tokenizer(batch["dialogue"], truncation=True),
    batched=True,
    remove_columns=["dialogue", "summary"],
)
input_lenghts = list(map(len, tokenized_inputs["input_ids"]))
# use the 85th percentile of the observed lengths as the source cap
max_source_length = int(np.percentile(input_lenghts, 85))
print(f"Max source length: {max_source_length}")


# --- target side: tokenized summary lengths ---
tokenized_targets = train_and_test.map(
    lambda batch: tokenizer(batch["summary"], truncation=True),
    batched=True,
    remove_columns=["dialogue", "summary"],
)
target_lenghts = list(map(len, tokenized_targets["input_ids"]))
# use the 90th percentile of the observed lengths as the target cap
max_target_length = int(np.percentile(target_lenghts, 90))
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/15551 [00:00<?, ? examples/s]

Max source length: 255


Map:   0%|          | 0/15551 [00:00<?, ? examples/s]

Max target length: 50


In [None]:
# tokenized_inputs = dataset["train"].map(lambda x: tokenizer(x["dialogue"], truncation=True),batched=True,
#                                         remove_columns=["dialogue", "summary"])
# input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
# # take 85 percentile of max length for better utilization
# max_source_length = int(np.percentile(input_lenghts, 85))
# print(f"Max source length: {max_source_length}")


# tokenized_targets = dataset["test"].map(lambda x: tokenizer(x["summary"], truncation=True),batched=True,
#                                         remove_columns=["dialogue", "summary"])
# target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
# # take 90 percentile of max length for better utilization
# max_target_length = int(np.percentile(target_lenghts, 90))
# print(f"Max target length: {max_target_length}")

We preprocess our dataset before training and save it to disk. You could run this step on your local machine or a CPU and upload it to the hub

In [None]:
def preprocess_function(sample,padding="max_length"):
    """Tokenize a batch of dialogues and summaries into model inputs and labels.

    Args:
        sample: Batch with a "dialogue" and a "summary" column.
        padding: Padding strategy forwarded to the tokenizer. With
            "max_length", pad positions in the labels are replaced by -100
            so they are ignored in the loss.

    Returns:
        The tokenizer output for the prefixed dialogues, extended with a
        "labels" entry holding the tokenized summaries.
    """
    # T5 expects a task prefix in front of every input
    prompts = ["summarize: " + item for item in sample["dialogue"]]
    model_inputs = tokenizer(prompts, max_length=max_source_length, padding=padding, truncation=True)

    # Targets are tokenized through the `text_target` keyword argument
    targets = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)
    label_ids = targets["input_ids"]

    if padding == "max_length":
        # Mask pad tokens with -100 so padding does not contribute to the loss
        pad_id = tokenizer.pad_token_id
        label_ids = [[-100 if token == pad_id else token for token in row] for row in label_ids]

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize every split and drop the raw text/id columns so only model inputs remain
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary", "id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# save datasets to disk for later easy loading
tokenized_dataset["train"].save_to_disk("data/train")
# the test split is stored under "data/eval"; the evaluation cell loads it from there
tokenized_dataset["test"].save_to_disk("data/eval")

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/14732 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/819 [00:00<?, ? examples/s]

We create a DataCollator that will take care of padding our inputs and labels.

In [None]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator: takes care of padding inputs and labels per batch.
# Labels are padded with label_pad_token_id, the same -100 value that
# preprocess_function writes over padded label positions.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

The last step is to define the hyperparameters (`TrainingArguments`) we want to use for our training.

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments



# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # higher learning rate
    num_train_epochs=5,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    # no intermediate checkpoints; the adapter is saved explicitly after training
    save_strategy="no",
    report_to="tensorboard",
    push_to_hub=True,
)

# Create Trainer instance (training split only; evaluation runs in a separate cell)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

Let's now train our model and run the cells below. Note that for T5, some layers are kept in `float32` for stability purposes.

In [None]:
# train model; the call returns a TrainOutput carrying the final step count,
# the training loss and runtime metrics
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.9293
1000,1.8881
1500,1.9024
2000,1.9012
2500,1.8542
3000,1.8509
3500,1.8581
4000,1.8313
4500,1.8255
5000,1.8089


TrainOutput(global_step=9210, training_loss=1.8202380682565231, metrics={'train_runtime': 3935.7685, 'train_samples_per_second': 18.716, 'train_steps_per_second': 2.34, 'total_flos': 6924203301273600.0, 'train_loss': 1.8202380682565231, 'epoch': 5.0})

In [None]:
# Save our LoRA model & tokenizer results

# Write the trained PEFT model and the tokenizer files into the local
# directory named by peft_model_id
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
# To save the underlying base model as well, call:
# trainer.model.base_model.save_pretrained(peft_model_id)

('results/tokenizer_config.json',
 'results/special_tokens_map.json',
 'results/tokenizer.json')


# Test a single example from test dataset
After the training is done, we want to evaluate and test the model using the ROUGE score. We can run inference using PEFT and transformers.

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from transformers import BitsAndBytesConfig

# Load peft config for pre-trained checkpoint etc.
config = PeftConfig.from_pretrained(peft_model_id)

# load base LLM model and tokenizer
# The 8-bit request goes through `quantization_config`; passing `load_in_8bit=True`
# straight to `from_pretrained` is deprecated.
model = AutoModelForSeq2SeqLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map={"":0},
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model on top of the base model
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
# switch to evaluation mode for inference
model.eval()

print("Peft model loaded")

Peft model loaded


In [None]:
from datasets import load_dataset
from random import randrange


# Load dataset from the hub and get a sample
dataset = load_dataset("samsum")
sample = dataset['test'][randrange(len(dataset["test"]))]

# The model was fine-tuned on inputs prefixed with "summarize: "
# (see preprocess_function), so the same prefix is applied at inference time.
prompt = "summarize: " + sample["dialogue"]
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
# with torch.inference_mode():
# Allow up to 50 new tokens (the default budget of evaluate_peft_model);
# a budget of 10 cut the generated summaries off mid-sentence.
outputs = model.generate(input_ids=input_ids, max_new_tokens=50, do_sample=True, top_p=0.9)
print(f"input sentence: \n{sample['dialogue']}\n{'---'* 20}")

print(f"summary:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")

input sentence: Richie: Pogba
Clay: Pogboom
Richie: what a s strike yoh!
Clay: was off the seat the moment he chopped the ball back to his right foot
Richie: me too dude
Clay: hope his form lasts
Richie: This season he's more mature
Clay: Yeah, Jose has his trust in him
Richie: everyone does
Clay: yeah, he really deserved to score after his first 60 minutes
Richie: reward
Clay: yeah man
Richie: cool then 
Clay: cool
------------------------------------------------------------
summary:
Pogba was off the seat when Jose


In [None]:
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm

# Metric: the "rouge" metric loaded through the `evaluate` library
metric = evaluate.load("rouge")

def evaluate_peft_model(sample,max_target_length=50):
    """Generate a summary for one tokenized sample and decode its reference.

    Args:
        sample: Mapping with an "input_ids" tensor and a "labels" tensor in
            which ignored positions are marked with -100.
        max_target_length: Maximum number of new tokens to generate.

    Returns:
        A ``(prediction, reference)`` tuple of decoded strings.
    """
    # add a batch dimension and move the input to the GPU before sampling
    batch_ids = sample["input_ids"].unsqueeze(0).cuda()
    generated = model.generate(input_ids=batch_ids, do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
    prediction = tokenizer.decode(generated[0].detach().cpu().numpy(), skip_special_tokens=True)

    # -100 cannot be decoded, so swap it back to the pad token id first
    label_ids = sample['labels']
    decodable_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    reference = tokenizer.decode(decodable_ids, skip_special_tokens=True)

    return prediction, reference

# load test dataset from disk (written earlier to "data/eval") as torch tensors
test_dataset = load_from_disk("data/eval/").with_format("torch")

# run predictions
# this can take ~45 minutes
predictions, references = [] , []
for sample in tqdm(test_dataset):
    prediction, reference = evaluate_peft_model(sample)
    predictions.append(prediction)
    references.append(reference)

# compute metric
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# print results as percentages with two decimals
# (the previous `:2f` spec set a minimum width of 2, not a precision of 2)
for rouge_name in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
    print(f"{rouge_name}: {rogue[rouge_name] * 100:.2f}%")

# Rogue1: 50.386161%
# rouge2: 24.842412%
# rougeL: 41.370130%
# rougeLsum: 41.394230%

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

100%|██████████| 819/819 [22:54<00:00,  1.68s/it]


Rogue1: 41.585117%
rouge2: 15.919712%
rougeL: 32.634900%
rougeLsum: 32.610554%


## Save LoRA model & tokenizer results

To save the base model

    trainer.model.base_model.save_pretrained(peft_model_id)

In [None]:
# Create the model card via the trainer
trainer.create_model_card()
# Push the currently loaded PEFT model to the Hub repo "<user_id>/<peft_model_id>"
model.push_to_hub(f"{user_id}/{peft_model_id}")

adapter_model.bin:   0%|          | 0.00/2.79M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pritam3355/lora-flan-t5-small/commit/0f55a310700822031d49a496690bd87cbbe259b6', commit_message='Upload model', commit_description='', oid='0f55a310700822031d49a496690bd87cbbe259b6', pr_url=None, pr_revision=None, pr_num=None)

# Inference

In [None]:
from transformers import BitsAndBytesConfig

# Build the Hub repo id. The guard keeps this cell idempotent: re-running it
# must not turn "<user>/<name>" into "<user>/<user>/<name>".
if not peft_model_id.startswith(f"{user_id}/"):
    peft_model_id = f"{user_id}/{peft_model_id}"
# Load peft config for pre-trained checkpoint etc.

config = PeftConfig.from_pretrained(peft_model_id)

# load base LLM model and tokenizer
# The 8-bit request goes through `quantization_config`; passing `load_in_8bit=True`
# straight to `from_pretrained` is deprecated.
model = AutoModelForSeq2SeqLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map={"":0},
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
model.eval()

print("Peft model loaded")

# Load dataset from the hub and get a sample
dataset = load_dataset("samsum")
sample = dataset['test'][randrange(len(dataset["test"]))]

# The model was fine-tuned on inputs prefixed with "summarize: "
# (see preprocess_function), so the same prefix is applied at inference time.
prompt = "summarize: " + sample["dialogue"]
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
# with torch.inference_mode():
# Allow up to 50 new tokens (the default budget of evaluate_peft_model);
# a budget of 10 cut the generated summaries off mid-sentence.
outputs = model.generate(input_ids=input_ids, max_new_tokens=50, do_sample=True, top_p=0.9)
print(f"input sentence: \n{sample['dialogue']}\n{'---'* 20}")

print(f"summary:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")

Downloading (…)/adapter_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/2.79M [00:00<?, ?B/s]

Peft model loaded
input sentence: Lincoln: Heeyyy ;* whats up
Fatima: I talked to Jenson, he’s not too happy ;p
Lincoln: the place sucks??
Fatima: No, the place is ok, I think, we can go there, it’s about Alene
Lincoln: typical, dont worry about it
Fatima: He thinks she may have a depression :[
Lincoln: nothin new, everyone has it, she needs a doctor then
Fatima: But she won’t go ;/
Lincoln: so she’s destroying her life fuck it its not your problem
Fatima: It is, they’re both my friends!
Lincoln: you better think what to do if they break up
Fatima: Ehh yes Ill have a problem ;//
Lincoln: both blaming each other and talking with you about it, perfect
Fatima: Alene is just troubled… She’d been through a lot…
Lincoln: everyone has their problems, the question is are ya doin sth about them
Fatima: She has problems facing it, don’t be surprised :[
Lincoln: then it is her problem
Fatima: You are so cruel at times… o.O
Lincoln: maybe, for me its just a common sense
Fatima: Why can’t everyone 