# PEFT Qwen 2.5 SLM


## Imports

In [1]:
!pip install \
  "datasets>=3.4.1,<4.4.0" \
  "trl>=0.18.2,<=0.24.0" \
  unsloth \
  wandb \
  rouge-score \
  bert-score \
  nltk \
  evaluate \
  tqdm

Collecting trl<=0.24.0,>=0.18.2
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Collecting unsloth
  Downloading unsloth-2025.12.6-py3-none-any.whl.metadata (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.9/65.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting unsloth_zoo>=2025.12.5 (from unsloth)
  Downloading unsloth_zoo-2025.12.5-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.2-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets<4.4.0,>=3.4.1
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=21.0.0 (from datasets<4.4.0,>=3.4.1)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_

In [2]:
import os
import random as rd

import torch
import evaluate
import nltk
import wandb
from textwrap import fill
from bert_score import score as bertscore
from datasets import load_dataset
from google.colab import drive
from peft import LoftQConfig
from transformers import TrainingArguments
from trl import SFTTrainer
from tqdm import tqdm
from unsloth import FastLanguageModel, is_bfloat16_supported, get_chat_template

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
drive.mount("/content/drive")

Mounted at /content/drive


## GPU constraints

In [4]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
0.0 GB of memory reserved.


## Loading the model and tokeniser

In [5]:
max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-1.5B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)


==((====))==  Unsloth 2025.12.6: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

## QLora Finetuning

In [7]:
loftq_config = LoftQConfig(
    loftq_bits=4,
    loftq_iter=1,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
    use_rslora = False,
    loftq_config = loftq_config,
)

Unsloth 2025.12.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


## Dataset Preprocessing

In [12]:
SYSTEM_PROMPT = (
    "You are an expert news summarization assistant. "
    "Summarize news articles faithfully and concisely "
    "using only the information explicitly stated."
)

def formatting_prompts_func(examples):
    documents = examples["document"]
    summaries = examples["summary"]

    texts = []

    for doc, summ in zip(documents, summaries):
        convo = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": (
                    "Summarize the following news article in at most 3 sentences.\n\n"
                    f"ARTICLE:\n{doc}"
                ),
            },
            {"role": "assistant", "content": summ},
        ]

        text = tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )

        texts.append(text)

    return {"text": texts}


In [13]:
from datasets import Dataset

dataset = Dataset.from_file("cnn_data.arrow")

dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=dataset.column_names
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [14]:
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_val = dataset["train"]
test_dataset = dataset["test"]

train_val = train_val.train_test_split(test_size=0.05, seed=42)
train_dataset = train_val["train"]
val_dataset   = train_val["test"]

print("Train dataset length", len(train_dataset))
print("Validation dataset length", len(val_dataset))
print("Test dataset length", len(test_dataset))
print(train_dataset[0])


Train dataset length 4275
Validation dataset length 225
Test dataset length 500
{'text': '<|im_start|>system\nYou are an expert news summarization assistant. Summarize news articles faithfully and concisely using only the information explicitly stated.<|im_end|>\n<|im_start|>user\nSummarize the following news article in at most 3 sentences.\n\nARTICLE:\n(CNN) -- The University of California San Diego has suspended a student who admitted to hanging a noose in a campus library, school officials announced Friday. "We are feeling real pain, and we will take real action," said UCSD chancellor Marye Anne Fox told reporters. "The safety of our students, faculty, and staff is my primary concern." The student, whose identity was not released, admitted Friday to police at the University of California San Diego that she hung a noose Thursday night in the library, police said. "Detectives have interviewed the student and taken a statement," UCSD police said in a release. "The investigation is ongo

In [15]:
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)
wandb: (1) Create a W&B account
wandb: (2) Use an existing W&B account
wandb: (3) Don't visualize my results
wandb: Enter your choice:

 1


wandb: You chose 'Create a W&B account'
wandb: Create an account here: https://wandb.ai/authorize?signup=true
wandb: Paste an API key from your profile and hit enter:

 ··········


wandb: No netrc file found, creating one.
wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc
wandb: Currently logged in as: marius-dragic (marius-dragic-centralesup-lec) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

## Fine-Tuning Setup

In [17]:
import wandb
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

OUTPUT_DIR = "/content/drive/MyDrive/DASCIM/qwen_summarizer_lora"

wandb.init(
    project="qwen2.5-summarization",
    name="qwen2.5-lora-unsloth",
    config={
        "model": "Qwen2.5",
        "task": "summarization",
        "lora_r": 16,
        "learning_rate": 2e-4,
        "per_device_batch_size": 1,
        "gradient_accumulation": 8,
        "effective_batch_size": 8,
        "epochs": 3,
        "optimizer": "adamw_8bit",
        "scheduler": "cosine",
        "packing": False,
    },
)


trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,  # IMPORTANT for summarization
    args=TrainingArguments(
        output_dir=OUTPUT_DIR,

        # Batching
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,

        # Training length
        num_train_epochs=3,

        # Optimization
        learning_rate=2e-4,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        optim="adamw_8bit",
        weight_decay=0.01,

        # Precision
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),

        # Logging / eval / save
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=5,

        # Best model
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,

        # Misc
        seed=42,
        report_to="wandb",
        save_safetensors=True,
    ),
)


trainer = train_on_responses_only(
    trainer,
    instruction_part="<|im_start|>user\n",
    response_part="<|im_start|>assistant\n",
)



Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/4275 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/225 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/4275 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/225 [00:00<?, ? examples/s]

## Training

In [19]:
trainer_stats = trainer.train()


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,275 | Num Epochs = 3 | Total steps = 1,605
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Step,Training Loss,Validation Loss
100,0.9917,0.969685
200,0.9309,0.928407
300,0.9518,0.906815
400,0.8766,0.892149
500,0.8922,0.876048
600,0.705,0.892992
700,0.6839,0.892005
800,0.7072,0.893838
900,0.6832,0.889031
1000,0.6794,0.876677


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Step,Training Loss,Validation Loss
100,0.9917,0.969685
200,0.9309,0.928407
300,0.9518,0.906815
400,0.8766,0.892149
500,0.8922,0.876048
600,0.705,0.892992
700,0.6839,0.892005
800,0.7072,0.893838
900,0.6832,0.889031
1000,0.6794,0.876677


KeyboardInterrupt: 

## Memory usage during fine-tuning

In [20]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

NameError: name 'trainer_stats' is not defined

## Evaluation (Rouge, Meteor, BERTScore)

In [None]:
nltk.download("wordnet")
nltk.download("omw-1.4")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

tokenizer = get_chat_template(
    tokenizer,
    chat_template="qwen-2.5",
)

In [22]:
from peft import PeftModel

LORA_CHECKPOINT = 'checkpoint-400'

model = PeftModel.from_pretrained(
    model,
    LORA_CHECKPOINT,
    torch_dtype=torch.float16,
)

model = model.merge_and_unload()




In [25]:
!pip install tqdm
from tqdm import tqdm

def generate_summary(
    document,
    max_new_tokens=128,
):
    messages = [
        {
            "role": "user",
            "content": (
                "Summarize the following news article in at most 3 sentences.\n\n"
                f"ARTICLE:\n{document}"
            ),
        }
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            do_sample=False,
        )

    decoded = tokenizer.decode(
        outputs[0][inputs.shape[-1]:],
        skip_special_tokens=True,
    )

    return decoded.strip()


MAX_SAMPLES = len(test_dataset)  # change or set to len(test_dataset)

predictions = []
references = []

for example in tqdm(test_dataset.select(range(MAX_SAMPLES)), desc="Evaluating"):
    document = example["document"]
    reference = example["summary"]

    pred = generate_summary(document)

    predictions.append(pred)
    references.append(reference)



Evaluating:   0%|          | 0/500 [00:00<?, ?it/s]


KeyError: 'document'

## Scores

In [None]:
rouge_results = rouge.compute(
    predictions=predictions,
    references=references,
)

meteor_results = meteor.compute(
    predictions=predictions,
    references=references,
)

P, R, F1 = bertscore(
    predictions,
    references,
    lang="en",
    model_type="microsoft/deberta-xlarge-mnli",
    device="cuda",
    rescale_with_baseline=True,
)

print("ROUGE scores:")
for k, v in rouge_results.items():
    print(f"{k}: {v:.4f}")

print(f"\nMETEOR: {meteor_results['meteor']:.4f}")
print("\nBERTScore:")
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall:    {R.mean().item():.4f}")
print(f"F1:        {F1.mean().item():.4f}")


## Summary Example

In [33]:
import random as rd
from textwrap import fill

def generate_summary(
    document,
    max_new_tokens=128,
):
    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert news summarization assistant. "
                "Summarize news articles faithfully and concisely "
                "using only the information explicitly stated."
            ),
        },
        {
            "role": "user",
            "content": (
                "Summarize the following news article in at most 3 sentences.\n\n"
                f"ARTICLE:\n{document}"
            ),
        },
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            do_sample=False,   # deterministic for eval
        )

    decoded = tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
    )

    # Extract only assistant answer
    if "assistant" in decoded:
        decoded = decoded.split("assistant")[-1]

    return decoded.strip()

def parse_chat_example(text):
    """
    Extrait l'article et le résumé de référence depuis le champ 'text'
    """
    # Extraire la partie ARTICLE
    article = text.split("ARTICLE:\n")[1].split("<|im_end|>")[0].strip()

    # Extraire la réponse assistant (gold summary)
    gold_summary = text.split("<|im_start|>assistant\n")[1].split("<|im_end|>")[0].strip()

    return article, gold_summary
random_example = rd.choice(test_dataset)

# Parse dataset text
article, gold_summary = parse_chat_example(random_example["text"])

# Generate model summary
model_summary = generate_summary(article)

print("=" * 100)
print("📰 ARTICLE:\n")
print(fill(article, 100))

print("\n" + "=" * 100)
print("✍️ GOLD SUMMARY:\n")
print(fill(gold_summary, 100))

print("\n" + "=" * 100)
print("🤖 MODEL SUMMARY (LoRA Fine-Tuned):\n")
print(fill(model_summary, 100))
print("=" * 100)



📰 ARTICLE:

(CNN) -- A former Alabama university professor accused of gunning down three colleagues in February
was indicted in Massachusetts on Wednesday in the 1986 shooting death of her brother. Amy Bishop was
charged with first-degree murder in the killing of her brother, Seth Bishop, Norfolk District
Attorney William Keating said. The brother's death originally was ruled an accident. His death came
under renewed scrutiny after Amy Bishop was arrested February 12 in a shooting rampage at a biology
faculty meeting at the University of Alabama-Huntsville. At the time of the 1986 killing, Bishop,
who was 21, told authorities she had asked for her 18-year-old brother's help unloading a shotgun
when it accidentally discharged. Keating acknowledged Wednesday mistakes in handling the case,
saying "jobs weren't done, responsibilities were not met and justice was not served." "Three
individuals who were killed in Alabama may not have been" had Bishop been charged in her brother's
death, Kea

In [35]:
print(generate_summary(train_dataset[0]["text"][:1000]))


!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


In [43]:
# ===============================
# Load Qwen 2.5 + LoRA (checkpoint-400)
# ===============================

import torch
import random
from textwrap import fill

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from peft import PeftModel

# Paths
BASE_MODEL = "unsloth/Qwen2.5-1.5B"
LORA_CHECKPOINT = "/content/drive/MyDrive/DASCIM/qwen_summarizer_lora/checkpoint-1000"

# Model loading
max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Load LoRA adapter
model = PeftModel.from_pretrained(
    model,
    LORA_CHECKPOINT,
    torch_dtype=torch.float16,
)
# Apply Qwen chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="qwen-2.5",
)

# Enable fast inference
FastLanguageModel.for_inference(model)

model.eval()

# ===============================
# Helper: parse test dataset example
# ===============================

def parse_chat_example(text):
    article = text.split("ARTICLE:\n")[1].split("<|im_end|>")[0].strip()
    gold_summary = text.split("<|im_start|>assistant\n")[1].split("<|im_end|>")[0].strip()
    return article, gold_summary


# ===============================
# Generate summary
# ===============================

def generate_summary(document, max_new_tokens=1024):
    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert news summarization assistant. "
                "Summarize news articles faithfully and concisely "
                "using only the information explicitly stated."
            ),
        },
        {
            "role": "user",
            "content": (
                "Summarize the following news article in at most 3 sentences.\n\n"
                f"ARTICLE:\n{document}"
            ),
        },
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            use_cache=True,
            eos_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "assistant" in decoded:
        decoded = decoded.split("assistant")[-1]

    return decoded.strip()


# ===============================
# Qualitative evaluation on test set
# ===============================

example = random.choice(test_dataset)

article, gold_summary = parse_chat_example(example["text"])
model_summary = generate_summary(article)

print("=" * 100)
print("📰 ARTICLE:\n")
print(fill(article, 100))

print("\n" + "=" * 100)
print("✍️ GOLD SUMMARY:\n")
print(fill(gold_summary, 100))

print("\n" + "=" * 100)
print("🤖 MODEL SUMMARY (Qwen 2.5 + LoRA checkpoint-400):\n")
print(fill(model_summary, 100))
print("=" * 100)


==((====))==  Unsloth 2025.12.6: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
📰 ARTICLE:

(CNN) -- Former Illinois congressman Dan Rostenkowski, who rose through the ranks of Chicago's
rough-and-tumble political scene to become one of the most powerful men on Capitol Hill, has died,
according to the office of Chicago Alderman Richard Mell. He was 82. He died in Wisconsin after an
extended illness, Mell's office said. Rostenkowski first entered Congress in 1959, during the second
half of the Eisenhower administration. Known for his booming voice and reputation as a power broker,
he became chairman of the tax-writin