In [1]:
import os
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig
from trl.trainer.sft_trainer import SFTTrainer
from tqdm import tqdm
import torch
import time
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_name = "neil-code/dialogsum-test"
dataset = load_dataset(dataset_name)

Using the latest cached version of the dataset since neil-code/dialogsum-test couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /root/.cache/huggingface/datasets/neil-code___dialogsum-test/default/0.0.0/f0524dd2e0267dc8102109ce1b14bae8f97976d3 (last modified on Fri Sep  5 15:13:05 2025).


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1999
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 499
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 499
    })
})

In [4]:
dataset['train'][0]

{'id': 'train_0',
 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'summary': "Mr. Smith'

In [5]:
def create_prompt_formats(sample):
    """
    Format various fields of the sample ('insturction', 'output')
    :param sample: input data
    """
    INTRO_BLURB = "Instruct: Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "Input: Please Summarize the below conversation."
    RESPONSE_KEY = "Output:"

    blurb = f"{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}"
    input_context = f"{sample['dialogue']}" if sample['dialogue'] else None
    response = f"{RESPONSE_KEY}\n{sample['summary']}"

    parts = [part for part in [blurb, instruction, input_context, response] if part]
    formatted_prompt = "\n\n".join(parts)

    sample["formatted_prompt"] = formatted_prompt
    
    return sample

In [6]:
print(create_prompt_formats(dataset['train'][0])["formatted_prompt"])

Instruct: Below is an instruction that describes a task. Write a response that appropriately completes the request.

Input: Please Summarize the below conversation.

#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?
#Person2#: I found it would be a good idea to get a check-up.
#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.
#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?
#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.
#Person2#: Ok.
#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?
#Person2#: Yes.
#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.
#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.
#Person1#: Well, we have classes and some medication

In [7]:
compute_dtype = getattr(torch, 'float16')
print(compute_dtype)

quant_config = BitsAndBytesConfig(
    load_in_4bit=True, # use 4-bit quantization to load the model
    bnb_4bit_quant_type="nf4", # type of quantization
    bnb_4bit_compute_type=compute_dtype, # computation datatype
    bnb_4bit_use_double_quant=True, # use double quantization
)

torch.float16


In [8]:
model_path = "model_cache/LLM-Research/Meta-Llama-3-8B-Instruct/"
original_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto", # {"model params": "device"} "" means all params on device 0 (GPU 0)
    quantization_config=quant_config,
)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:12<00:00,  3.25s/it]


In [9]:
original_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((409

In [10]:
!nvidia-smi

Fri Sep  5 17:08:38 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.04             Driver Version: 570.124.04     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:D1:00.0 Off |                  N/A |
| 30%   29C    P8             28W /  350W |    7571MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [11]:
from numpy import add


tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False, # use faster version of tokenizer
    trust_remote_code=True, # some models have custom code, so we need to trust it
    padding_side="right", # pad on the right side
    add_eos_token=True, # add end of sequence token
    add_bos_token=True, # add beginning of sequence token
)
tokenizer.pad_token_id = tokenizer.eos_token_id # Llama does not have a pad token, so we use eos token as pad token


In [12]:
eval_tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    add_bos_token=True,
    trust_remote_code=True,
    use_fast=False,
)
eval_tokenizer.pad_token_id = eval_tokenizer.eos_token_id

def gen(model, prompt, maxlen=100, sample=True):
    toks = eval_tokenizer(prompt, return_tensors="pt") # get the tensors of tokens of prompt
    res = model.generate(**toks.to("cuda"), # unpack the tensors and send to GPU
                         max_new_tokens=maxlen, # maximum new tokens to generate
                         do_sample=sample, # whether to sample or use greedy decoding, sample=True will return 
                         num_return_sequences=1, # number of sequences to return, when sample=False, sequences cannot > 1
                         temperature=0.7,
                         top_p=0.95,
                         num_beams=1 # beam search
                         ).to("cpu") # return a list of sequences
    
    return eval_tokenizer.batch_decode(res, skip_special_tokens=True) # decode the tokens to text, skip special tokens

In [13]:
%%time
### Test
from transformers import set_seed

seed = 42
set_seed(seed)
index = 10

prompt = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

formatted_prompt = f"Instruct: Summarize the following conversation.\nInput: {prompt}\nOutput:\n"
res = gen(original_model, formatted_prompt, 100, True)

output = res[0].split('Output:\n')[1] # only keep output
dash_line = '-'.join('' for _ in range(100))
print(dash_line)
print(f"Input Prompt:\n{formatted_prompt}")
print(dash_line)
print(f"Baseline Human Summary:\n{summary}\n")
print(dash_line)
print(f"Model Generated Summary:\n{output}\n")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


---------------------------------------------------------------------------------------------------
Input Prompt:
Instruct: Summarize the following conversation.
Input: #Person1#: Happy Birthday, this is for you, Brian.
#Person2#: I'm so happy you remember, please come in and enjoy the party. Everyone's here, I'm sure you have a good time.
#Person1#: Brian, may I have a pleasure to have a dance with you?
#Person2#: Ok.
#Person1#: This is really wonderful party.
#Person2#: Yes, you are always popular with everyone. and you look very pretty today.
#Person1#: Thanks, that's very kind of you to say. I hope my necklace goes with my dress, and they both make me look good I feel.
#Person2#: You look great, you are absolutely glowing.
#Person1#: Thanks, this is a fine party. We should have a drink together to celebrate your birthday
Output:

---------------------------------------------------------------------------------------------------
Baseline Human Summary:
#Person1# attends Brian's birt

In [14]:
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ['n_positions', 'max_position_embeddings', 'seq_length']:
        max_length = getattr(model.config, length_setting, None)
        if max_length is not None:
            print(f"Max length found in config: {max_length}")
            break
    if max_length is None:
        print("Max length not found in config, defaulting to 1024")
        max_length = 1024
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    
    return tokenizer(batch["formatted_prompt"],
                    truncation=True,
                    max_length=max_length)


In [15]:
from functools import  partial

def preprocess_dataset(tokenizer: AutoTokenizer,
                       max_length: int,
                       seed, dataset):
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=['id', 'topic', 'dialogue', 'summary']
    )
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length) # remove length of tokens > max_length
    dataset = dataset.shuffle(seed=seed)
    print("Preprocessing done.")

    return dataset


In [16]:
max_length = get_max_length(original_model)
print(f"Using max length: {max_length}")
train_dataset = preprocess_dataset(tokenizer, max_length, seed, dataset['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length, seed, dataset['test'])
train_dataset

Max length found in config: 8192
Using max length: 8192
Preprocessing dataset...
Preprocessing done.
Preprocessing dataset...
Preprocessing done.


Dataset({
    features: ['formatted_prompt', 'input_ids', 'attention_mask'],
    num_rows: 1999
})

In [17]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
    r=32, # rank
    lora_alpha=16, #scale
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], # target modules to apply LoRA
    bias="none", # frozen bias term
    lora_dropout=0.01, # dropout rate
    task_type="CAUSAL_LM"
)

original_model.gradient_checkpointing_enable()
original_model = prepare_model_for_kbit_training(original_model) # disable some model features to save memory
peft_model = get_peft_model(original_model, peft_config)


In [18]:

output_dir = './peft_dialogue-summary-training/final_checkpoint'
import transformers

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    max_steps=2000,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=100,
    logging_dir='./logs',
    save_strategy="steps", # save model every save_steps
    save_steps=100,
    eval_strategy="steps", # evaluate model every eval_steps
    eval_steps=100,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none", # disable reporting to any third-party service
    overwrite_output_dir=True,
    group_by_length=True, # group samples of similar length together to minimize padding
)
peft_model.config.use_cache = False # disable cache for training
peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [19]:
peft_trainer.train()

Step,Training Loss,Validation Loss
100,1.4762,1.421479
200,1.3619,1.403316
300,1.3626,1.396567
400,1.3499,1.387654
500,1.36,1.395997
600,1.2788,1.392403
700,1.2977,1.379158
800,1.3008,1.378497
900,1.359,1.373578
1000,1.3575,1.373097


TrainOutput(global_step=2000, training_loss=1.3399334754943848, metrics={'train_runtime': 4848.9829, 'train_samples_per_second': 0.412, 'train_steps_per_second': 0.412, 'total_flos': 2.357729896889549e+16, 'train_loss': 1.3399334754943848, 'epoch': 1.0005002501250626})

In [20]:
def print_number_of_trainable_parameters(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()

    return f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params}%"


print(print_number_of_trainable_parameters(peft_model))

trainable params: 27262976 || all params: 4567863296 || trainable%: 0.5968430803057028%


In [19]:
### Free memory
del original_model
del peft_trainer
torch.cuda.empty_cache()

In [20]:
model_path = "model_cache/LLM-Research/Meta-Llama-3-8B-Instruct/"
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto", # {"model params": "device"} "" means all params on device 0 (GPU 0)
    quantization_config=quant_config,
)

from peft import PeftModel
# Load the fine-tuned model
ft_model = PeftModel.from_pretrained(
    base_model,
    "./peft_dialogue-summary-training/final_checkpoint/checkpoint-500",
    torch_dtype=torch.float16,
    is_trainable=False, # set to False to disable training
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.33s/it]


In [21]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fri Sep  5 17:10:12 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.04             Driver Version: 570.124.04     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:D1:00.0 Off |                  N/A |
| 30%   35C    P2            114W /  350W |   15709MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [24]:
%%time

index = 10
prompt = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

formatted_prompt = f"Instruct: Summarize the following conversation.\nInput: {prompt}\nOutput:\n"
res = gen(ft_model, formatted_prompt, 100, True)

output = res[0].split('Output:\n')[1]
dash_line = '-'.join('' for _ in range(100))
print(dash_line)
print(f"Input Prompt:\n{formatted_prompt}")
print(dash_line)
print(f"Baseline Human Summary:\n{summary}\n")
print(dash_line)
print(f"PEFT Model Generated Summary:\n{output}\n")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


---------------------------------------------------------------------------------------------------
Input Prompt:
Instruct: Summarize the following conversation.
Input: #Person1#: Happy Birthday, this is for you, Brian.
#Person2#: I'm so happy you remember, please come in and enjoy the party. Everyone's here, I'm sure you have a good time.
#Person1#: Brian, may I have a pleasure to have a dance with you?
#Person2#: Ok.
#Person1#: This is really wonderful party.
#Person2#: Yes, you are always popular with everyone. and you look very pretty today.
#Person1#: Thanks, that's very kind of you to say. I hope my necklace goes with my dress, and they both make me look good I feel.
#Person2#: You look great, you are absolutely glowing.
#Person1#: Thanks, this is a fine party. We should have a drink together to celebrate your birthday
Output:

---------------------------------------------------------------------------------------------------
Baseline Human Summary:
#Person1# attends Brian's birt