# Fine-tuning FLAN-T5-large with LoRA for Text Summarization


## Import Required Libraries

In [None]:
#%pip install "peft==0.2.0"
#%pip install "transformers==4.27.1" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
#%pip install rouge-score tensorboard py7zr
#%pip install peft.utils
#%pip install -U peft transformers torch bitsandbytes

In [None]:
#!pip uninstall torch torchvision torchaudio
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
import torch
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer , Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.model_selection import train_test_split
import json
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm
from datasets import load_dataset
from random import randrange

## Load and Prepare Dataset
Load dataset from JSON file


In [None]:
# Read the raw JSON text first, then parse it; `data` is iterated below as a
# sequence of records exposing 'article' and 'summary' keys.
with open('./scientific_papers_dataset.json', mode='r', encoding='utf-8') as file:
    raw_json_text = file.read()
data = json.loads(raw_json_text)

Create Dataset object

In [None]:
# Turn the list of records into two parallel columns for `Dataset.from_dict`.
article_column = [record['article'] for record in data]
summary_column = [record['summary'] for record in data]

dataset = Dataset.from_dict({'article': article_column, 'summary': summary_column})


In [None]:
# Hold out 20% of the examples as the "test" split; the fixed seed pins the shuffle.
# NOTE: this rebinds `dataset` from a single Dataset to the split result.
dataset = dataset.train_test_split(
    test_size=0.2,
    seed=42,
)

In [None]:
# Report how many examples ended up in each split.
train_size = len(dataset['train'])
test_size = len(dataset['test'])
print(f"Train dataset size: {train_size}")
print(f"Test dataset size: {test_size}")

Train dataset size: 472
Test dataset size: 118


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Hub identifier of the checkpoint that is fine-tuned in this notebook.
model_id = "google/flan-t5-large"

# Tokenizer that belongs to the FLAN-T5-large checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Token budgets used when truncating/padding articles (inputs) and summaries (targets).
max_input_length = 1024
max_target_length = 512


def preprocess_function(sample, padding="max_length"):
    """
    Tokenize a batch of article/summary pairs into model-ready features.

    Args:
        sample (dict): Batch with an "article" list and a "summary" list.
        padding (str): Padding strategy forwarded to the tokenizer.

    Returns:
        dict: The tokenizer output for the prefixed articles, with the
            tokenized summaries stored under "labels".
    """
    # T5-style task prefix in front of every article
    prompts = ["summarize: " + article for article in sample["article"]]

    # Encoder side: articles, cut off at max_input_length
    encoded = tokenizer(prompts, max_length=max_input_length, padding=padding, truncation=True)

    # Decoder side: summaries, tokenized through `text_target`
    target_ids = tokenizer(
        text_target=sample["summary"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )["input_ids"]

    # With fixed-length padding, overwrite pad positions in the labels with -100
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        target_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in target_ids
        ]

    encoded["labels"] = target_ids
    return encoded

# Tokenize both splits in batches and drop the raw text columns afterwards.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["article", "summary"],
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

import os

# Persist each tokenized split; note the "test" split is stored under .../data/eval,
# which is the path the evaluation cell reads back.
for split_name, split_dir in (
    ("train", "lora-flan-t5-large/data/train"),
    ("test", "lora-flan-t5-large/data/eval"),
):
    # Create the directory if it doesn't exist
    os.makedirs(split_dir, exist_ok=True)
    tokenized_dataset[split_name].save_to_disk(split_dir)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/472 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/118 [00:00<?, ? examples/s]

In [None]:
# Same checkpoint as the tokenizer above; the LoRA adapters are attached to this model.
model_id = "google/flan-t5-large"

# device_map="auto" leaves weight placement to the library.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA hyperparameters: rank-16 adapters (alpha 32, dropout 0.05) on the modules
# named "q" and "v"; bias terms are left untouched.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Prepare model for training
model = get_peft_model(model, lora_config)

# print_trainable_parameters() prints its own report and returns None, so it is
# called directly rather than wrapped in print() (which printed "... None").
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 787,868,672 || trainable%: 0.5989
Trainable parameters: None


In [None]:
from transformers import DataCollatorForSeq2Seq

# Same sentinel that preprocess_function writes over padded label positions.
label_pad_token_id = -100

# Batch collator for the seq2seq trainer; padded lengths are rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [None]:
# Ask the model to make its input embeddings require gradients.
# NOTE(review): this is usually paired with gradient checkpointing on a frozen base model,
# which is not enabled in this notebook — confirm the call is actually needed here.
model.enable_input_require_grads()

In [None]:
# Root folder for everything this run writes (logs here, model/data in sibling cells).
output_dir = "lora-flan-t5-large"
log_dir = f"{output_dir}/logs"

# Training hyperparameters: 5 epochs at lr 1e-3, loss logged every 500 steps,
# no intermediate checkpoints and no external experiment tracker.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=5,
    logging_dir=log_dir,
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    report_to="none",
)

# NOTE(review): eval_dataset is supplied, but no evaluation strategy is set in
# training_args — presumably it is only used if trainer.evaluate() is called explicitly.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Turn off the generation cache on the training model; the inference cell further
# down loads a fresh model object, so this flag does not carry over to evaluation.
model.config.use_cache = False

In [None]:
# Run fine-tuning with the trainer built above; the returned TrainOutput
# (global step, average training loss, runtime metrics) is shown as the cell result.
trainer.train()

Step,Training Loss
500,1.8049
1000,1.6438
1500,1.5154
2000,1.417


TrainOutput(global_step=2360, training_loss=1.55890985262596, metrics={'train_runtime': 3213.8532, 'train_samples_per_second': 0.734, 'train_steps_per_second': 0.734, 'total_flos': 1.094692355702784e+16, 'train_loss': 1.55890985262596, 'epoch': 5.0})

In [None]:
# Directory that receives the fine-tuned PEFT model and the tokenizer files.
# The inference cell below reads the PEFT config and adapter back from this same path.
peft_model_id="lora-flan-t5-large/model"
trainer.model.save_pretrained(peft_model_id)
# Saved alongside the adapter so the folder is self-contained.
tokenizer.save_pretrained(peft_model_id)

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the adapter config saved with the LoRA weights; it records which base
# checkpoint the adapter was trained on.
peft_model_id = "lora-flan-t5-large/model"
config = PeftConfig.from_pretrained(peft_model_id)

# PeftConfig exposes the base checkpoint as `base_model_name_or_path`
# (there is no `large_model_name_or_path` attribute).
base_model_id = config.base_model_name_or_path

# Load the base model and its tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Attach the trained LoRA adapter and switch to inference mode
model = PeftModel.from_pretrained(model, peft_model_id)
model.eval()

print("Peft model loaded")



Peft model loaded


In [None]:

# ROUGE scorer from the `evaluate` library; used at the end of this cell via metric.compute(...)
metric = evaluate.load("rouge")

def evaluate_peft_model(sample,max_target_length=512):
    """
    Generate a summary for one tokenized sample and decode its reference.

    Args:
        sample (dict): One tokenized example with 1-D tensors under "input_ids"
            and "labels", and optionally "attention_mask".
        max_target_length (int): The maximum number of new tokens to generate.

    Returns:
        tuple: A tuple containing the prediction and the reference, both as
            decoded strings.
    """
    # Move inputs to wherever the model lives instead of assuming CUDA is present.
    target_device = model.device
    generate_kwargs = {"input_ids": sample["input_ids"].unsqueeze(0).to(target_device)}

    # Inputs were padded to a fixed length during preprocessing, so pass the mask
    # explicitly rather than relying on generate() to infer it from pad tokens.
    if "attention_mask" in sample:
        generate_kwargs["attention_mask"] = sample["attention_mask"].unsqueeze(0).to(target_device)

    # generate summary (nucleus sampling: results vary between runs unless the RNG is seeded)
    outputs = model.generate(
        **generate_kwargs,
        do_sample=True,
        top_p=0.9,
        max_new_tokens=max_target_length,
    )
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)

    # decode eval sample: put real pad ids back where the labels hold -100 so they can be decoded
    label_ids = np.where(sample["labels"] != -100, sample["labels"], tokenizer.pad_token_id)
    reference = tokenizer.decode(label_ids, skip_special_tokens=True)

    return prediction, reference

# load test dataset from disk
test_dataset = load_from_disk("lora-flan-t5-large/data/eval/").with_format("torch")

# evaluate_peft_model samples during generation, so seed the RNG to make the scores repeatable
torch.manual_seed(42)

# run predictions
predictions, references = [], []
for sample in tqdm(test_dataset):
    prediction, reference = evaluate_peft_model(sample)
    predictions.append(prediction)
    references.append(reference)

# compute metric
rouge_scores = metric.compute(predictions=predictions, references=references, use_stemmer=True)
rogue = rouge_scores  # previous (misspelled) name kept as an alias

# print results as percentages with two decimals
# (the old spec ":2f" set a minimum width of 2, not a precision, hence the six decimals)
for rouge_name in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
    print(f"{rouge_name}: {rouge_scores[rouge_name] * 100:.2f}%")




Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

100%|██████████| 118/118 [28:16<00:00, 14.38s/it]


Rogue1: 54.121916%
rouge2: 24.078767%
rougeL: 32.542274%
rougeLsum: 32.536183%
