In [1]:
pip install datasets transformers accelerate peft

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
pip install rouge_score

[0mNote: you may need to restart the kernel to use updated packages.


In [7]:
import json
from dataclasses import dataclass
from typing import Dict, List

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    set_seed,
)
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
from rouge_score import rouge_scorer

In [5]:
DATA_PATH = "prompt_data.jsonl"
BASE_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUT_DIR = "./prompt_lora_ckpt"

SEED = 42
MAX_LEN = 256
MAX_NEW_TOKENS = 80
EPOCHS = 3

set_seed(SEED)

In [8]:
def debug(title, value=None):
    print(f"\n DEBUG: {title}")
    if value is not None:
        print(value)
    print("=" * 70)

In [9]:
debug("CONFIG", {
    "DATA_PATH": DATA_PATH,
    "BASE_MODEL_NAME": BASE_MODEL_NAME,
    "OUT_DIR": OUT_DIR,
    "SEED": SEED,
    "MAX_LEN": MAX_LEN,
    "MAX_NEW_TOKENS": MAX_NEW_TOKENS,
    "EPOCHS": EPOCHS
})



 DEBUG: CONFIG
{'DATA_PATH': 'prompt_data.jsonl', 'BASE_MODEL_NAME': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0', 'OUT_DIR': './prompt_lora_ckpt', 'SEED': 42, 'MAX_LEN': 256, 'MAX_NEW_TOKENS': 80, 'EPOCHS': 3}


In [10]:
ds = load_dataset("json", data_files=DATA_PATH, split="train")
debug("Total samples", len(ds))
debug("Sample[0] raw", ds[0])



 DEBUG: Total samples
61

 DEBUG: Sample[0] raw
{'input': 'Make a prompt that summarizes an academic paper for a conference reviewer.', 'output': 'Summarize the paper. Use 3 bullet points only. Make sure it is under 150 words.'}


In [14]:
ds = ds.shuffle(seed=SEED)
debug("After shuffle sample[0]", ds[0])


 DEBUG: After shuffle sample[0]
{'input': 'Make a prompt that checks if a sentence is too long.', 'output': 'Check the sentence length. Explain if it is too long. Make sure to rewrite it shorter.'}


In [15]:
test_size = 10
split = ds.train_test_split(test_size=test_size, seed=SEED)
train_ds_raw = split["train"]
test_ds_raw = split["test"]

In [16]:
debug("Train size", len(train_ds_raw))
debug("Test size", len(test_ds_raw))
debug("test sample", test_ds_raw[0])


 DEBUG: Train size
51

 DEBUG: Test size
10

 DEBUG: test sample
{'input': 'Make a prompt that extracts pros and cons.', 'output': 'Extract pros and cons. Make sure to output two bullet lists: Pros and Cons.'}


In [17]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [18]:
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME)
base_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rot

In [19]:
def format_prompt(inp: str) -> str:
    return f"### Instruction:\n{inp}\n\n### Response:\n"

In [20]:
def preprocess(example):
    inp = example["input"]
    out = example["output"]

    prompt = format_prompt(inp)
    full_text = prompt + out

    prompt_ids = tokenizer(prompt, truncation=True, max_length=MAX_LEN)["input_ids"]

    full_enc = tokenizer(
        full_text,
        truncation=True,
        max_length=MAX_LEN,
        padding=False,
    )

    input_ids = full_enc["input_ids"]
    attention_mask = full_enc["attention_mask"]

    labels = [-100] * len(input_ids)
    prompt_len = min(len(prompt_ids), len(input_ids))
    labels[prompt_len:] = input_ids[prompt_len:]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


In [21]:
tmp = preprocess(train_ds_raw[0])

In [22]:
debug("Formatted prompt example", format_prompt(train_ds_raw[0]["input"]))


 DEBUG: Formatted prompt example
### Instruction:
Make a prompt that checks if a summary kept the main idea.

### Response:



In [23]:
debug("Full text example", format_prompt(train_ds_raw[0]["input"]) + train_ds_raw[0]["output"])


 DEBUG: Full text example
### Instruction:
Make a prompt that checks if a summary kept the main idea.

### Response:
Check if the summary kept the main idea. Explain your judgement shortly.


In [24]:
debug("input_ids[:40]", tmp["input_ids"][:40])


 DEBUG: input_ids[:40]
[1, 835, 2799, 4080, 29901, 13, 9984, 263, 9508, 393, 12747, 565, 263, 15837, 8126, 278, 1667, 2969, 29889, 13, 13, 2277, 29937, 13291, 29901, 13, 5596, 565, 278, 15837, 8126, 278, 1667, 2969, 29889, 12027, 7420, 596, 6577, 29887]


In [25]:
debug("labels[:40]", tmp["labels"][:40])


 DEBUG: labels[:40]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 5596, 565, 278, 15837, 8126, 278, 1667, 2969, 29889, 12027, 7420, 596, 6577, 29887]


In [26]:
train_ds = train_ds_raw.map(preprocess, remove_columns=train_ds_raw.column_names)
test_ds = test_ds_raw.map(preprocess, remove_columns=test_ds_raw.column_names)

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [27]:
@dataclass
class DataCollatorForCausalLMWithLabels:
    tokenizer: AutoTokenizer
    label_pad_token_id: int = -100

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
     
        labels = [f["labels"] for f in features]
        for f in features:
            f.pop("labels")

        batch = self.tokenizer.pad(
            features,
            padding=True,          # pad to longest
            return_tensors="pt",
        )

        max_len = batch["input_ids"].shape[1]
        padded_labels = []
        for lab in labels:
            pad_len = max_len - len(lab)
            if pad_len > 0:
                lab = lab + [self.label_pad_token_id] * pad_len
            else:
                lab = lab[:max_len]  # safety trim
            padded_labels.append(lab)

        batch["labels"] = torch.tensor(padded_labels, dtype=torch.long)

        return batch


In [28]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)

In [29]:
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.5701


In [39]:
args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    logging_steps=5,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    fp16=torch.cuda.is_available(),
    report_to="none",
    save_total_limit=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


In [40]:
collator = DataCollatorForCausalLMWithLabels(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=collator,
)


  trainer = Trainer(


In [41]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.7351,2.003084
2,2.0044,1.601101
3,1.2889,1.510775


TrainOutput(global_step=21, training_loss=1.8340472550619216, metrics={'train_runtime': 16.9348, 'train_samples_per_second': 9.035, 'train_steps_per_second': 1.24, 'total_flos': 47467647135744.0, 'train_loss': 1.8340472550619216, 'epoch': 3.0})

In [42]:
model.save_pretrained(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)

('./prompt_lora_ckpt/tokenizer_config.json',
 './prompt_lora_ckpt/special_tokens_map.json',
 './prompt_lora_ckpt/chat_template.jinja',
 './prompt_lora_ckpt/tokenizer.json')

In [43]:
vanilla_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME)
vanilla_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rot

In [45]:
trained_base = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME)
trained_model = PeftModel.from_pretrained(trained_base, OUT_DIR)
trained_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
 

In [46]:
@torch.no_grad()
def generate_for_input(model, inp: str):
    prompt = format_prompt(inp)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LEN
    )

    out = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False, 
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    if "### Response:" in decoded:
        decoded = decoded.split("### Response:")[-1].strip()
    return decoded



In [47]:
results = []

for i, ex in enumerate(test_ds_raw):
    inp = ex["input"]
    gold = ex["output"]

    vanilla_pred = generate_for_input(vanilla_model, inp)
    trained_pred = generate_for_input(trained_model, inp)

    results.append({
        "input": inp,
        "gold": gold,
        "vanilla_pred": vanilla_pred,
        "trained_pred": trained_pred
    })

    print(f"\n[Test {i+1}]")
    print("INPUT:", inp)
    print("GOLD:", gold)
    print("VANILLA:", vanilla_pred)
    print("TRAINED:", trained_pred)
    print("-" * 80)


[Test 1]
INPUT: Make a prompt that extracts pros and cons.
GOLD: Extract pros and cons. Make sure to output two bullet lists: Pros and Cons.
VANILLA: Pros:
- The app is user-friendly and easy to navigate.
- The app provides a variety of features, including weather forecasts, news updates, and sports scores.
- The app is available on both iOS and Android platforms, making it accessible to a wider audience.

Cons:
- The app may not be suitable for users who are not familiar with the
TRAINED: Extract the pros and cons. Make sure to highlight them. Make sure to keep it short. Make sure to make it clear. Make sure to make it short. Make sure to make it clear. Make sure to make it short. Make sure to make it clear. Make sure to make it short. Make sure to make it clear. Make sure to make it short. Make sure to make
--------------------------------------------------------------------------------

[Test 2]
INPUT: Make a prompt that explains a Python function line by line.
GOLD: Explain the Py

In [48]:
scorer = rouge_scorer_R = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

In [57]:

def rouge_f1(pred, ref):
    s = scorer.score(ref, pred)
    return s["rouge1"].fmeasure, s["rougeL"].fmeasure


In [60]:
import numpy as np
import json
import random
random_constant_prompt = random.choice(train_ds_raw)['output']

In [61]:

print(f"DEBUG: Randomly Selected Constant Prompt is: '{random_constant_prompt}'")

rand_r1_scores = []
rand_rl_scores = []

# Calculate ROUGE for this static baseline using test_ds_raw
for ex in test_ds_raw:
    gold = ex["output"]
    pred = random_constant_prompt  # Static prediction (Randomly chosen once)
    
    r1, rl = rouge_f1(pred, gold)
    rand_r1_scores.append(r1)
    rand_rl_scores.append(rl)

# ==========================================
# 2. VANILLA & TRAINED CALCULATION
# ==========================================

van_r1s, van_rls = [], []
trn_r1s, trn_rls = [], []

# Calculate ROUGE for models from your existing 'results' list
for r in results:
    # Vanilla
    vr1, vrl = rouge_f1(r["vanilla_pred"], r["gold"])
    van_r1s.append(vr1)
    van_rls.append(vrl)
    
    # Trained
    tr1, trl = rouge_f1(r["trained_pred"], r["gold"])
    trn_r1s.append(tr1)
    trn_rls.append(trl)


DEBUG: Randomly Selected Constant Prompt is: 'Find unclear parts of the definition. Explain them in one short sentence. Rewrite it clearly.'


In [62]:

print("\n" + "="*60)
print("FINAL ROUGE SCORE COMPARISON (ALL MODELS)")
print("="*60)
print(f"{'Model':<20} | {'ROUGE-1':<10} | {'ROUGE-L':<10}")
print("-" * 59)

# Helper to print row
def print_row(name, r1_scores, rl_scores):
    avg_r1 = np.mean(r1_scores)
    avg_rl = np.mean(rl_scores)
    print(f"{name:<20} | {avg_r1:.4f}     | {avg_rl:.4f}")

print_row("Randomly Chosen Unified",  rand_r1_scores, rand_rl_scores)
print_row("Vanilla Model",    van_r1s,       van_rls)
print_row("Trained Model",    trn_r1s,       trn_rls)

print("="*60)


FINAL ROUGE SCORE COMPARISON (ALL MODELS)
Model                | ROUGE-1    | ROUGE-L   
-----------------------------------------------------------
Randomly Chosen Unified | 0.1958     | 0.1707
Vanilla Model        | 0.1255     | 0.1115
Trained Model        | 0.2089     | 0.2041


In [63]:

filename = "test_generations_with_rouge.jsonl"
print(f"\nSaving results to {filename}...")

with open(filename, "w", encoding="utf-8") as f:
    for i, r in enumerate(results):
        # Add Vanilla Scores
        r["vanilla_rouge1"] = van_r1s[i]
        r["vanilla_rougeL"] = van_rls[i]
        
        # Add Trained Scores
        r["trained_rouge1"] = trn_r1s[i]
        r["trained_rougeL"] = trn_rls[i]
        
        # Add Random Baseline Data & Scores
        r["random_pred"]   = random_constant_prompt
        r["random_rouge1"] = rand_r1_scores[i]
        r["random_rougeL"] = rand_rl_scores[i]
        
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print("DONE.")


Saving results to test_generations_with_rouge.jsonl...
DONE.
