# Optimized Fine-tuning FLAN-T5-large with LoRA for Text Summarization

optimized for memory efficiency on a 45GB RTX 8000 GPU, specifically for large inputs and outputs/

## Setup and Imports

In [None]:
#!pip install peft transformers datasets accelerate evaluate bitsandbytes loralib rouge-score tensorboard py7zr -q

In [4]:
#!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting scipy>=1.6.0
  Downloading scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.6/37.6 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.6.1 scipy-1.15.2 threadpoolctl-3.5.0


In [1]:
import torch
import gc
import os
import json
import numpy as np
from tqdm import tqdm
from datasets import Dataset, load_from_disk
from transformers import (
    AutoModelForSeq2SeqLM, 
    AutoTokenizer, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig
from sklearn.model_selection import train_test_split
import evaluate
from huggingface_hub import notebook_login

## Memory Configuration

In [None]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TRANSFORMERS_ENABLE_GRAD_CHECKPOINT"] = "true"

torch.manual_seed(42)
np.random.seed(42)

## Check GPU and Device

In [3]:
# Check for CUDA availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Check available GPU memory
if device == 'cuda':
    print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"Available GPU memory: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

Using device: cuda
Total GPU memory: 47.76 GB
Available GPU memory: 0.00 GB


## Hugging Face Login

In [None]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Configure Parameters
These parameters can be adjusted based on your memory constraints

In [None]:
MAX_INPUT_LENGTH = 9500  
MAX_TARGET_LENGTH = 1200  
BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 8
EPOCHS = 1.5

print(f"Using max input length: {MAX_INPUT_LENGTH}, max target length: {MAX_TARGET_LENGTH}")
print(f"Batch size: {BATCH_SIZE}, Gradient accumulation steps: {GRADIENT_ACCUMULATION_STEPS}")

Using max input length: 9500, max target length: 1200
Batch size: 1, Gradient accumulation steps: 8


## Load and Prepare Dataset

In [6]:
def load_data():
    """Load dataset from JSON file"""
    try:
        with open('./Dataset_2.json', 'r', encoding='utf-8') as file:
            data = json.load(file)
            
        dataset = Dataset.from_dict({
            'article': [item['article'] for item in data],
            'summary': [item['summary'] for item in data]
        })
        
        # Splitting dataset
        dataset = dataset.train_test_split(test_size=0.2, seed=42)
        
        print(f"Train dataset size: {len(dataset['train'])}")
        print(f"Test dataset size: {len(dataset['test'])}")
        
        return dataset
    except Exception as e:
        print(f"Error loading dataset: {e}")
        raise

In [7]:
# Load the dataset
dataset = load_data()

Train dataset size: 790
Test dataset size: 198


## Load Model and Tokenizer

In [8]:
def load_model_and_tokenizer():
    """Load model and tokenizer"""
    model_id = "google/flan-t5-large"
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    
    # Load model with 8-bit quantization to save memory
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_id,
        load_in_8bit=True,  # Use 8-bit quantization to reduce memory usage
        device_map="auto"
    )
    
    print("Model and tokenizer loaded successfully")
    return model, tokenizer

In [9]:
# Load model and tokenizer
model, tokenizer = load_model_and_tokenizer()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Model and tokenizer loaded successfully


## Preprocess and Tokenize Dataset

In [None]:
def preprocess_function(sample, tokenizer, padding="max_length"):
    """
    Preprocess the data by tokenizing and formatting it correctly for the model.
    """
    inputs = ["summarize: " + item for item in sample["article"]]

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs, 
        max_length=MAX_INPUT_LENGTH, 
        padding=padding, 
        truncation=True
    )

    # Tokenize targets
    labels = tokenizer(
        text_target=sample["summary"], 
        max_length=MAX_TARGET_LENGTH, 
        padding=padding, 
        truncation=True
    )

    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] 
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
def prepare_dataset(dataset, tokenizer):
    """Tokenize and prepare the dataset"""
    tokenized_dataset = dataset.map(
        lambda x: preprocess_function(x, tokenizer), 
        batched=True, 
        remove_columns=["article", "summary"]
    )
    
    print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
    
    # Save datasets to disk
    os.makedirs("flan-t5-lora-optimized/data/train", exist_ok=True)
    os.makedirs("flan-t5-lora-optimized/data/eval", exist_ok=True)
    
    tokenized_dataset["train"].save_to_disk("flan-t5-lora-optimized/data/train")
    tokenized_dataset["test"].save_to_disk("flan-t5-lora-optimized/data/eval")
    
    return tokenized_dataset

In [None]:
tokenized_dataset = prepare_dataset(dataset, tokenizer)

Map:   0%|          | 0/790 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/790 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/198 [00:00<?, ? examples/s]

## Set up LoRA Configuration

In [None]:
def setup_lora_config(model):
    """Set up LoRA configuration and apply it to the model"""
    # Configure LoRA for parameter-efficient fine-tuning
    lora_config = LoraConfig(
        r=16,  # Rank
        lora_alpha=32,
        target_modules=["q", "v"], 
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM
    )
    
    # Apply LoRA config to model
    model = get_peft_model(model, lora_config)
    
    # Enable input gradients for LoRA training
    model.enable_input_require_grads()
    
    model.print_trainable_parameters()
    
    return model

In [None]:
model = setup_lora_config(model)

trainable params: 4,718,592 || all params: 787,868,672 || trainable%: 0.5989


## Set up Training

In [21]:
def setup_training(model, tokenized_dataset, tokenizer, resume_from_checkpoint=None):
    """Set up training configuration and data collator"""
    label_pad_token_id = -100
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8
    )
    
    torch.cuda.empty_cache()
    gc.collect()
    
    output_dir = "flan-t5-lora-optimized"
    
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=1e-3,
        num_train_epochs=EPOCHS,
        logging_dir=f"{output_dir}/logs",
        logging_strategy="steps",
        logging_steps=100,
        save_strategy="epoch",
        save_total_limit=2,
        eval_strategy="epoch",
        eval_accumulation_steps=4,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        bf16=True,
        fp16=False if torch.cuda.is_bf16_supported() else True,
        gradient_checkpointing=True,
        report_to="tensorboard",
    )
    
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
    )
    
    model.config.use_cache = False
    
    return trainer, resume_from_checkpoint


In [23]:

from transformers.trainer_utils import get_last_checkpoint

trainer, resume_from_checkpoint = setup_training(model, tokenized_dataset, tokenizer)

# Get latest checkpoint
last_checkpoint = get_last_checkpoint("flan-t5-lora-optimized")
if last_checkpoint:
    print(f"Resuming from checkpoint: {last_checkpoint}")
    resume_from_checkpoint = last_checkpoint

trainer.train(resume_from_checkpoint=resume_from_checkpoint)


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Resuming from checkpoint: flan-t5-lora-optimized/checkpoint-99




Epoch,Training Loss,Validation Loss
1,1.8614,1.550673




TrainOutput(global_step=147, training_loss=0.5773324050059935, metrics={'train_runtime': 10494.7955, 'train_samples_per_second': 0.113, 'train_steps_per_second': 0.014, 'total_flos': 5.054225916336538e+16, 'train_loss': 0.5773324050059935, 'epoch': 1.4962025316455696})

## Save the Model

In [None]:
# Save model and tokenizer
peft_model_id = "flan-t5-lora-summarization-optimized"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

trainer.model.push_to_hub(peft_model_id)
tokenizer.push_to_hub(peft_model_id)

adapter_model.safetensors:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/KASHU101/flan-t5-lora-summarization-optimized/commit/8733cd5319f952266bd60029ccbbf381bf329bbd', commit_message='Upload tokenizer', commit_description='', oid='8733cd5319f952266bd60029ccbbf381bf329bbd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KASHU101/flan-t5-lora-summarization-optimized', endpoint='https://huggingface.co', repo_type='model', repo_id='KASHU101/flan-t5-lora-summarization-optimized'), pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
import torch
from huggingface_hub import upload_file


safetensors_path = hf_hub_download(
    repo_id="KASHU101/flan-t5-lora-summarization-optimized",
    filename="adapter_model.safetensors"
)

weights = load_file(safetensors_path)
bin_path = "adapter_model.bin"
torch.save(weights, bin_path)


upload_file(
    path_or_fileobj=bin_path,
    path_in_repo="adapter_model.bin",
    repo_id="KASHU101/flan-t5-lora-summarization-optimized",
    token="token" # hugging face token here
)

adapter_model.safetensors:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/KASHU101/flan-t5-lora-summarization-optimized/commit/251a21564c2bfe344eede08e9e3924242a048a66', commit_message='Upload adapter_model.bin with huggingface_hub', commit_description='', oid='251a21564c2bfe344eede08e9e3924242a048a66', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KASHU101/flan-t5-lora-summarization-optimized', endpoint='https://huggingface.co', repo_type='model', repo_id='KASHU101/flan-t5-lora-summarization-optimized'), pr_revision=None, pr_num=None)

## Evaluate the Model

In [39]:
def evaluate_model(model, tokenizer):
    """Evaluate the model and compute ROUGE scores"""
    print("Evaluating model...")
    metric = evaluate.load("rouge")
    
    def evaluate_peft_model(sample, max_target_length=MAX_TARGET_LENGTH):
        """Evaluate the model on a given sample."""
        # Generate summary
        with torch.no_grad():
            outputs = model.generate(
                input_ids=sample["input_ids"].unsqueeze(0).to(device), 
                do_sample=True, 
                top_p=0.9, 
                max_new_tokens=max_target_length
            )
            
        prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
        
        # Decode eval sample
        labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
        labels = tokenizer.decode(labels, skip_special_tokens=True)
        
        return prediction, labels
    
    # Load test dataset
    test_dataset = load_from_disk("flan-t5-lora-optimized/data/eval/").with_format("torch")
    
    # Run predictions on a subset to save time during evaluation
    eval_size = min(30, len(test_dataset))
    print(f"Evaluating on {eval_size} samples...")
    
    predictions, references = [], []
    for i, sample in enumerate(tqdm(test_dataset.select(range(eval_size)))):
        p, l = evaluate_peft_model(sample)
        predictions.append(p)
        references.append(l)
    
    # Compute metrics
    rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True)
    
    # Print results
    print(f"ROUGE-1: {rouge['rouge1'] * 100:.2f}%")
    print(f"ROUGE-2: {rouge['rouge2'] * 100:.2f}%")
    print(f"ROUGE-L: {rouge['rougeL'] * 100:.2f}%")
    print(f"ROUGE-Lsum: {rouge['rougeLsum'] * 100:.2f}%")
    
    return rouge

In [40]:
# Evaluate the model
rouge_scores = evaluate_model(model, tokenizer)

Evaluating model...
Evaluating on 30 samples...


100%|██████████| 30/30 [1:21:29<00:00, 162.98s/it]


ROUGE-1: 55.18%
ROUGE-2: 22.03%
ROUGE-L: 23.69%
ROUGE-Lsum: 23.65%


## Zip and Save Results

In [28]:
# Save everything as a zip file
!zip -r flan-t5-lora-optimized.zip flan-t5-lora-optimized/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: flan-t5-lora-optimized/ (stored 0%)
  adding: flan-t5-lora-optimized/data/ (stored 0%)
  adding: flan-t5-lora-optimized/data/train/ (stored 0%)
 (deflated 81%)t5-lora-optimized/data/train/data-00000-of-00001.arrow
  adding: flan-t5-lora-optimized/data/train/state.json (deflated 38%)
  adding: flan-t5-lora-optimized/data/train/dataset_info.json (deflated 69%)
  adding: flan-t5-lora-optimized/data/eval/ (stored 0%)
 (deflated 80%)t5-lora-optimized/data/eval/data-00000-of-00001.arrow
  adding: flan-t5-lora-optimized/data/eval/state.json (deflated 39%)
  adding: flan-t5-lora-optimized/data/eval/dataset_info.json (deflated 69%)
  adding: flan-t5-lora-optimized/logs/ (stored 0%)
  adding: flan-t5-lora-optimized/logs/events.out.tfevents.1741640813.a4ac8b5928f5.1154.0 (deflated 62%)
  adding: flan-t5-lora-optimized/logs/events.out.tfevents.1741640956.a4ac8b5928f5.1385.0 (deflated 62%)
  adding: flan-t5-lora-optimized/logs/events.out.tfevents.1741641042.a4ac8b5928f5.1385.1 (deflated 6

## Inference with the Fine-tuned Model

In [29]:
def load_finetuned_model(model_path):
    """Load the fine-tuned model for inference"""
    # Load base model
    base_model = AutoModelForSeq2SeqLM.from_pretrained(
        "google/flan-t5-large",
        load_in_8bit=True,
        device_map="auto"
    )
    
    # Load fine-tuned model
    model = PeftModel.from_pretrained(base_model, model_path)
    model.eval()
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
    
    return model, tokenizer

In [36]:
def generate_summary(text, model, tokenizer):
    """Generate a summary for the given text"""
    # Prepare the input
    inputs = tokenizer(
        "summarize: " + text,
        return_tensors="pt", 
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    ).to(device)
    
    # Generate summary
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            max_new_tokens=MAX_TARGET_LENGTH,
            do_sample=True,
            top_p=0.9,
            num_return_sequences=1
        )
    
    # Decode the output
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return summary

In [None]:
sample_text = """
RESEARCH PAPER TEXT
"""

In [None]:
# Load the fine-tuned model
inference_model, inference_tokenizer = load_finetuned_model("flan-t5-lora-summarization-optimized")

summary = generate_summary(sample_text, inference_model, inference_tokenizer)
print(summary)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Here's a comprehensive summary of the provided research paper, covering the requested elements: **1. Objective:** * **Main Goal:** The primary objective of this research is to provide a mathematical explanation of UNet, the widely popular deep neural network architecture for image segmentation, by showing the architecture is exactly the model it originated from. * **Significance:** The authors provide a mathematical explanation of UNet, a network created in the UNet architecture, for solving a control problem. They define a control problem describing the dynamics of a neural network and propose a control splitting algorithm to solve it. * **Significance:** The analysis provides a mathematical explanation of how a UNet network is solved. They intend to directly link UNet to a network-specific framework, providing a more comprehensive and specific explanation for this algorithm than traditional algorithms that use generic techniques such as equational methods or recursion. * **Mathematic