In [1]:
%%capture
%pip install -U transformers
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes
%pip install -U scipy
%pip install -U wandb

# Imports

# Fine-tuning Llama 3.2 3B Pre-Trained Model

In [3]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import torch
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig, setup_chat_format

2025-05-01 01:30:54.544251: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746063055.015804      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746063055.144529      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda


In [5]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HFToken")
wb_token = user_secrets.get_secret("WandB")

In [6]:
login(token = hf_token)

In [7]:
wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3.2 on Summarization Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmazen-soliman[0m ([33mmazen-m-soliman[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
base_model = "meta-llama/Llama-3.2-3B-Instruct"
new_model = "llama-3.2-3b-Summarization-Bot"
dataset_name = "abisee/cnn_dailymail"

In [9]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [10]:
# QLoRa config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage="uint8",
    bnb_4bit_compute_dtype=torch.float16
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    use_safetensors=True,
    # bnb_4bit_quant_type='nf4',
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    trust_remote_code=True,
    # device_map="auto",
    device_map="balanced",
    attn_implementation=attn_implementation,
) 

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model.config.use_cache = False

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [11]:
n = model.num_parameters()
print(f"Params: {n}, VRAM ≈ {n*4/1e9:.2f} GB")

Params: 3212749824, VRAM ≈ 12.85 GB


In [12]:
model.config.quantization_config.to_dict()

{'quant_method': <QuantizationMethod.BITS_AND_BYTES: 'bitsandbytes'>,
 '_load_in_8bit': False,
 '_load_in_4bit': True,
 'llm_int8_threshold': 6.0,
 'llm_int8_skip_modules': None,
 'llm_int8_enable_fp32_cpu_offload': False,
 'llm_int8_has_fp16_weight': False,
 'bnb_4bit_quant_type': 'nf4',
 'bnb_4bit_use_double_quant': True,
 'bnb_4bit_compute_dtype': 'float16',
 'bnb_4bit_quant_storage': 'uint8',
 'load_in_4bit': True,
 'load_in_8bit': False}

In [13]:
DEFAULT_SYSTEM_PROMPT = """
Below is a paragraph on a topic. Write a summary of the paragraph.
""".strip()

def generate_training_prompt(
    conversation: str, summary: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    return f"""### Instruction: {system_prompt}

            ### Input:
            {conversation.strip()}
             
            ### Response:
            {summary}
    """.strip()

def create_paragraph_text(data_point):
    return data_point["article"]

def generate_text(data_point):
    summary = data_point["highlights"]
    paragraph_text = create_paragraph_text(data_point)
    return {
        "article": paragraph_text,
        "summary": summary,
        "text": generate_training_prompt(paragraph_text, summary),
    }

# Example usage with a new dataset format
example_data_point = {
    "id": "train_0",
    "article": "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today? #Person2#: I found it would...",
    "highlights": "Mr. Smith's getting a check-up, and Doctor Hawkins advises him to have one every year. Hawkins'll gi...",
    "topic": "get a check-up"
}

example = generate_text(example_data_point)
print(example["text"])


### Instruction: Below is a paragraph on a topic. Write a summary of the paragraph.

            ### Input:
            #Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today? #Person2#: I found it would...
             
            ### Response:
            Mr. Smith's getting a check-up, and Doctor Hawkins advises him to have one every year. Hawkins'll gi...


In [14]:
dataset = load_dataset(dataset_name, '1.0.0', split="train", keep_in_memory=True).shuffle(seed=65).select(range(2000))

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/256M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [15]:
from datasets import Dataset

def process_dataset(data: Dataset) -> Dataset:
    """
    This function processes the dataset to include only the necessary columns.
    """
    # First, apply generate_text to each record in the dataset
    processed_data = data.map(generate_text)

    # Then, remove unnecessary columns
    columns_to_remove = [col for col in processed_data.column_names if col not in ["article", "summary", "text"]]
    return processed_data.remove_columns(columns_to_remove)

In [16]:
# Process the entire dataset
processed_dataset = process_dataset(dataset)

# Split the processed dataset into train, validation, and test sets
train_dataset = processed_dataset.shuffle(seed=42).select(range(0, int(0.8 * len(processed_dataset))))
validation_dataset = processed_dataset.shuffle(seed=42).select(range(int(0.8 * len(processed_dataset)), int(0.9 * len(processed_dataset))))
test_dataset = processed_dataset.shuffle(seed=42).select(range(int(0.9 * len(processed_dataset)), len(processed_dataset)))

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [111]:
train_dataset[0]

{'article': "By . Ryan Gorman . PUBLISHED: . 18:14 EST, 16 August 2013 . | . UPDATED: . 18:15 EST, 16 August 2013 . Having not heard from Goldman Sachs after applying online, Michael Penn took matters into his own hands by handing out coffee and doughnuts August 8 on the sidewalk outside the bank's lower Manhattan headquarters. Facing the end of school and having no job offers, Penn, 23, decided he had to do something to stand out. So he set up a table on the sidewalk outside Goldman Sachs advertising a website with his resume and contact information - all because he wants to work for the famous investment bank. ‘It is the pinnacle, it’s everything we . learn about in school, it is the top,’ Penn told Fox Business when . asked why he wanted to work for Goldman Sachs. Resourceful: Having not heard from Goldman Sachs after applying online, Michael Penn took his message directly to them - handing out coffee and doughnuts on the sidewalk outside the bank's lower Manhattan headquarters . Th

In [17]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [18]:
# LoRA config
lora_alpha = 32
lora_dropout = 0.05
lora_r = 16

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    # target_modules=modules
)

In [19]:
def generate_prompt(
    conversation: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    return f"""### Instruction: {system_prompt}


### Input:
{conversation.strip()}


### Response:
""".strip()

def summarize(model, text: str):
    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
    inputs_length = len(inputs["input_ids"][0])
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.0001)
    return tokenizer.decode(outputs[0][inputs_length:], skip_special_tokens=True)

def generate_summaries(model, dataset, tokenizer, num_samples=5):
    summaries = []
    for i, example in enumerate(dataset):
        if i >= num_samples:
            break
        print(i)
        prompt = generate_prompt(example['article'])
        summary = summarize(model, prompt)
        summaries.append({'article': example['article'], 'generated_summary': summary})
    return summaries


In [24]:
import pandas as pd 

# Generate summaries before fine-tuning
original_summaries = generate_summaries(model, test_dataset, tokenizer, num_samples=5)

# Convert to DataFrame and log to W&B
df_original = pd.DataFrame(original_summaries)
wandb.log({"original_summaries": wandb.Table(dataframe=df_original)})

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


1


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


2


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


3


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


4




In [32]:
sft_config = SFTConfig(
    output_dir="./out/",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=5,
    
    eval_strategy="steps",
    eval_steps=100,              
    save_strategy="steps",
    save_steps=100, 
    
    learning_rate=1e-4,
    warmup_ratio=0.05,
    max_grad_norm=0.3,
    
    fp16=True,
    group_by_length=True,
    
    report_to="wandb",
    
    save_safetensors=True,
    logging_steps=1,
    lr_scheduler_type="cosine",
    seed=42,
    load_best_model_at_end=True,
    push_to_hub=True,
    dataset_text_field="text",
    max_length=512,
    save_total_limit=1,
)

In [116]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    peft_config=peft_config,
    processing_class=tokenizer,
    args=sft_config,
)

Converting train dataset to ChatML:   0%|          | 0/1600 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/200 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [123]:
trainer.train()

Step,Training Loss,Validation Loss
100,1.9469,2.274068
200,1.8997,2.252434
300,1.8583,2.245346
400,1.7866,2.236853
500,1.9283,2.235565
600,1.832,2.233439
700,1.8779,2.235057
800,1.7864,2.23349
900,1.869,2.232645
1000,1.8562,2.234315


TrainOutput(global_step=1000, training_loss=2.1924550327062606, metrics={'train_runtime': 5754.6011, 'train_samples_per_second': 1.39, 'train_steps_per_second': 0.174, 'total_flos': 6.790213068745114e+16, 'train_loss': 2.1924550327062606})

In [21]:
def preprocess(batch):
    # tokenize the article text
    tokens = tokenizer(batch["article"], truncation=True, padding="max_length")
    # tokenize the summary as labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["summary"], truncation=True, padding="max_length")
    tokens["labels"] = labels["input_ids"]
    return tokens

tokenized_test = test_dataset.map(preprocess, batched=True, remove_columns=["article","summary"])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]



In [51]:
# 1. Load your best checkpoint
model = AutoModelForCausalLM.from_pretrained(
    "/kaggle/input/llama-3b-instruct-finetuned/transformers/default/1",
    local_files_only=True,
)  
# 2. Instantiate the Trainer with your eval dataset
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=tokenized_test,
    peft_config=peft_config,
    processing_class=tokenizer,
    args=sft_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Converting train dataset to ChatML:   0%|          | 0/1600 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [52]:
eval_results = trainer.evaluate()





In [53]:
print(eval_results)

{'eval_loss': 2.4500718116760254, 'eval_runtime': 245.5286, 'eval_samples_per_second': 0.815, 'eval_steps_per_second': 0.407}


In [125]:
import shutil

shutil.make_archive('models', 'zip', '/kaggle/working/out/')

'/kaggle/working/models.zip'

In [126]:
from IPython.display import FileLink
FileLink(r'models.zip')

In [146]:
DEFAULT_SYSTEM_PROMPT = """
Given the text wrapped between the `<input>` tags below, generate a concise summary that:
- Captures the main ideas and essential details
- Does **not** include any information not present in the input
- Uses complete sentences in paragraph form and paraphrase it if needed
- Is no longer than 100 words""".strip()

def generate_prompt(
    conversation: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    return f"""
### Role:
    You are an expert summarizer. 

    
### Instruction: {system_prompt}

### Input:
<input>
{conversation.strip()}
</input>

### Response:
""".strip()

def summarize(model, text: str):
    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
    inputs_length = len(inputs["input_ids"][0])
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=128, temperature=0.7)
    return tokenizer.decode(outputs[0][inputs_length:], skip_special_tokens=True)

def generate_response(model, text, tokenizer):
    prompt = generate_prompt(text)
    summary = summarize(model, prompt)
    return summary

In [151]:
# Replace 'input_text' with your actual input
input_text = """
FLAN is the instruction-tuned version of LaMDA-PT. The instruction tuning pipeline mixes all datasets and randomly samples from each dataset as described.
To balance the different sizes of datasets, the number of training examples per dataset is limited to 30k and the examples-proportional mixing scheme from T5 is followed with a mixing rate maximum of 3k.
All models are fine-tuned for 30k gradient steps with a batch size of 8,192 tokens using the Adafactor Optimizer with a learning rate of 3e-5.
The input and target sequence lengths used in finetuning are 1024 and 256, respectively.
Packing is used to combine multiple training examples into a single sequence, separating inputs from targets using a special EOS token.
This instruction tuning takes around 60 hours on a TPUv3 with 128 cores.
"""

# Tokenize the input
summary = generate_response(model, input_text, tokenizer)

print(summary)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 
The instruction-tuned version of LaMDA-PT mixes all datasets and randomly samples from each dataset as described. The number of training examples per dataset is limited to 30k and the examples-proportional mixing scheme from T5 is followed with a mixing rate maximum of 3k. All models are fine-tuned for 30k gradient steps with a batch size of 8,192 tokens using the Adafactor Optimizer with a learning rate of 3e-5. The input and target sequence lengths used in finetuning are 1024 and 256, respectively. This instruction tuning takes around 60


In [153]:
# Replace 'input_text' with your actual input
input_text = """
As shown Figure 2, Our model begins by embedding both the context and question token sequences and adding positional embedding. Then, it builds padding masks to ignore padded positions during attention. The model splits its transformer layers into two halves: the “pre-cross”
layers which represented by the first encoder layer that apply standard self-attention and
feed-forward blocks separately to the context and question, enriching each with intra-sequence
context. Next, a cross-attention layer lets each context token attend over the question representations (followed by dropout and layer normalization), injecting question-aware information into
the context. After cross-attention, the “post-cross” layers represented as the second encoder
layer again perform self-attention and feed-forward processing on the context alone, refining
these integrated representations. Finally, a small three-layer feed-forward head, layer normalization, and dropout—projects each contextualized token into a two-dimensional logit space,
from which the model slices out start and end logits for span prediction."""

# Tokenize the input
summary = generate_response(model, input_text, tokenizer)

print(summary)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 
Our model begins by embedding both the context and question token sequences and adding positional embedding. Then, it builds padding masks to ignore padded positions during attention. Next, a cross-attention layer lets each context token attend over the question representations (followed by dropout and layer normalization), injecting question-aware information into the context. Finally, a small three-layer feed-forward head projects each contextualized token into a two-dimensional logit space, from which the model slices out start and end logits for span prediction.
