# Fine-tuning FLAN-T5-base with LoRA for Text Summarization


## Install Required Packages

In [None]:
#%pip install "peft==0.2.0"
#%pip install "transformers==4.27.1" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
#%pip install rouge-score tensorboard py7zr 
#%pip install peft.utils
#%pip install -U peft transformers torch bitsandbytes

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Building wheel for tokenizers (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [49 lines of output]
      running bdist_wheel
      running build
      running build_py
      creating build\lib.win-amd64-cpython-312\tokenizers
      copying py_src\tokenizers\__init__.py -> build\lib.win-amd64-cpython-312\tokenizers
      creating build\lib.win-amd64-cpython-312\tokenizers\models
      copying py_src\tokenizers\models\__init__.py -> build\lib.win-amd64-cpython-312\tokenizers\models
      creating build\lib.win-amd64-cpython-312\tokenizers\decoders
      copying py_src\tokenizers\decoders\__init__.py -> build\lib.win-amd64-cpython-312\tokenizers\decoders
      creating build\lib.win-amd64-cpython-312\tokenizers\normalizers
      copying py_src\tokenizers\normalizers\__init__.py -> build\lib.win-amd64-cpython-312\tokenizers\normalizers
      creating build\lib.win-amd64-cpython-312\tokenizers\pre_tokenizers
      copying py_src

Note: you may need to restart the kernel to use updated packages.


## Import Required Libraries

In [None]:
import torch
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer , Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
from sklearn.model_selection import train_test_split
import json
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm
from datasets import load_dataset 
from random import randrange



## Load and Prepare Dataset
Load dataset from JSON file


In [None]:
with open('./scientific_papers_dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

Create Dataset object

In [None]:
dataset = Dataset.from_dict({
    'article': [item['article'] for item in data],
    'summary': [item['summary'] for item in data]
})


Split dataset into train and test sets

In [None]:
dataset = dataset.train_test_split(test_size=0.2, seed=42)

Print dataset sizes

In [None]:
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

Train dataset size: 472
Test dataset size: 118


## Setup Device and Model

Check for GPU availability

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


Set model ID and load tokenizer

In [None]:
model_id="google/flan-t5-base"

# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

## Preprocess Data

Set maximum lengths for input and target

In [None]:
max_input_length = 512
max_target_length = 512

def preprocess_function(sample,padding="max_length"):
    """
    Preprocess the data by tokenizing and formatting it correctly for the model.

    Args:
        sample (dict): A dictionary containing the original data.
        padding (str): A string indicating the padding method.

    Returns:
        dict: A dictionary containing the tokenized data.
    """
    # add prefix to the input for t5
    inputs = ["summarize: " + item for item in sample["article"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)

    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["article", "summary"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# save datasets to disk
tokenized_dataset["train"].save_to_disk("../practice/lora-flan-t5-base/data/train")
tokenized_dataset["test"].save_to_disk("../practice/lora-flan-t5-base/data/eval")

Map: 100%|██████████| 472/472 [00:10<00:00, 46.07 examples/s]
Map: 100%|██████████| 118/118 [00:01<00:00, 64.23 examples/s]


Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (1/1 shards): 100%|██████████| 472/472 [00:00<00:00, 59000.76 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 118/118 [00:00<00:00, 19654.03 examples/s]


## Initialize Model with LoRA Configuration

Load base model

In [None]:
model_id = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Configure LoRA

r=16: Defines the rank of the LoRA update matrices. A lower rank means fewer trainable parameters, reducing memory usage.

lora_alpha=32: Scaling factor that helps balance the impact of LoRA weights.

target_modules=["q", "v"]: Specifies which layers to apply LoRA to:

    "q" (query) and "v" (value) layers in the attention mechanism are modified.

lora_dropout=0.05: Introduces dropout to prevent overfitting.

bias="none": Ensures that only LoRA weights are modified.

task_type=TaskType.SEQ_2_SEQ_LM: Specifies that the model is for sequence-to-sequence learning (like summarization).

In [None]:

from peft import LoraConfig, get_peft_model, TaskType
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,769,472 || all params: 249,347,328 || trainable%: 0.7096


## Setup Training Configuration

Configure data collator

In [None]:
from transformers import DataCollatorForSeq2Seq

label_pad_token_id = -100

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

Define training argument and Initialize trainer

In [None]:
output_dir="lora-flan-t5-base"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
		auto_find_batch_size=True,
    learning_rate=1e-3, # higher learning rate
    num_train_epochs=5,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    report_to="tensorboard",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
model.config.use_cache = False

 ## Train and Save Model
 Train the model

In [None]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss


TrainOutput(global_step=295, training_loss=2.222763837394068, metrics={'train_runtime': 703.4189, 'train_samples_per_second': 3.355, 'train_steps_per_second': 0.419, 'total_flos': 1628855882219520.0, 'train_loss': 2.222763837394068, 'epoch': 5.0})

Save the model

In [None]:
peft_model_id="../practice/lora-flan-t5-base/model"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

('results\\tokenizer_config.json',
 'results\\special_tokens_map.json',
 'results\\tokenizer.json')

## Load and Test Trained Model

Load the trained model

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load peft config for pre-trained checkpoint etc. 
peft_model_id = "../practice/lora-flan-t5-base/model"
config = PeftConfig.from_pretrained(peft_model_id)

# load base LLM model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path,  load_in_8bit=True,  device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
model.eval()

print("Peft model loaded")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Peft model loaded


## Evaluate Model Performance

Load ROUGE metric

In [None]:

metric = evaluate.load("rouge")

def evaluate_peft_model(sample,max_target_length=512):
    """
    Evaluate the model on a given sample.

    Args:
        sample (dict): A dictionary containing the original data.
        max_target_length (int): The maximum length of the target sequence.

    Returns:
        tuple: A tuple containing the prediction and the reference.
    """
    
    # generate summary
    outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)    
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
    # decode eval sample
    labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
    labels = tokenizer.decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    return prediction, labels

# load test dataset from distk
test_dataset = load_from_disk("../practice/lora-flan-t5-base/data/eval/").with_format("torch")

# run predictions
predictions, references = [] , []
for sample in tqdm(test_dataset):
    p,l = evaluate_peft_model(sample)
    predictions.append(p)
    references.append(l)

# compute metric 
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# print results 
print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
print(f"rouge2: {rogue['rouge2']* 100:2f}%")
print(f"rougeL: {rogue['rougeL']* 100:2f}%")
print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")




100%|██████████| 118/118 [57:09<00:00, 29.06s/it] 


Rogue1: 50.101581%
rouge2: 20.299048%
rougeL: 29.766023%
rougeLsum: 29.769431%
