In [3]:
pip install datasets transformers accelerate peft

Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Downloading peft-0.18.0-py3-none-any.whl (556 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m8.1 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.18.0
[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting absl-py (from rouge_score)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting nltk (from rouge_score)
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m11.2 MB/s[0m  [33m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (pyproject.toml) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24987 sha256=c53185c44c3915f44143531117db5d5b0c6bdda5166a86a3286cd1efa610b240
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461

In [10]:
DATA_PATH = "prompt_data.jsonl"
BASE_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUT_DIR = "./prompt_lora_ckpt"

SEED = 42
MAX_LEN = 256
MAX_NEW_TOKENS = 80
EPOCHS = 3

set_seed(SEED)

In [12]:
debug("CONFIG", {
    "DATA_PATH": DATA_PATH,
    "BASE_MODEL_NAME": BASE_MODEL_NAME,
    "OUT_DIR": OUT_DIR,
    "SEED": SEED,
    "MAX_LEN": MAX_LEN,
    "MAX_NEW_TOKENS": MAX_NEW_TOKENS,
    "EPOCHS": EPOCHS
})



 DEBUG: CONFIG
{'DATA_PATH': 'prompt_data.jsonl', 'BASE_MODEL_NAME': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0', 'OUT_DIR': './prompt_lora_ckpt', 'SEED': 42, 'MAX_LEN': 256, 'MAX_NEW_TOKENS': 80, 'EPOCHS': 3}


In [14]:
ds = load_dataset("json", data_files=DATA_PATH, split="train")
debug("Total samples", len(ds))
debug("Sample[0] raw", ds[0])



 DEBUG: Total samples
61

 DEBUG: Sample[0] raw
{'input': 'Make a prompt that summarizes an academic paper for a conference reviewer.', 'output': 'Summarize the paper. Use 3 bullet points only. Make sure it is under 150 words.'}


In [15]:
ds = ds.shuffle(seed=SEED)
debug("After shuffle sample[0]", ds[0])



 DEBUG: After shuffle sample[0]
{'input': 'Make a prompt that summarizes a technical blog post.', 'output': 'Summarize the blog. Use 3 bullet points only. Make sure to keep it under 100 words.'}


In [17]:
debug("Train size", len(train_ds_raw))
debug("Test size", len(test_ds_raw))
debug("test sample", test_ds_raw[0])


 DEBUG: Train size
51

 DEBUG: Test size
10

 DEBUG: test sample
{'input': 'Make a prompt that checks if a sentence is too long.', 'output': 'Check the sentence length. Explain if it is too long. Make sure to rewrite it shorter.'}


In [18]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [20]:
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME)
base_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rot

In [21]:
def format_prompt(inp: str) -> str:
    return f"### Instruction:\n{inp}\n\n### Response:\n"

In [22]:
def preprocess(example):
    inp = example["input"]
    out = example["output"]

    prompt = format_prompt(inp)
    full_text = prompt + out

    prompt_ids = tokenizer(prompt, truncation=True, max_length=MAX_LEN)["input_ids"]

    full_enc = tokenizer(
        full_text,
        truncation=True,
        max_length=MAX_LEN,
        padding=False,
    )

    input_ids = full_enc["input_ids"]
    attention_mask = full_enc["attention_mask"]

    labels = [-100] * len(input_ids)
    prompt_len = min(len(prompt_ids), len(input_ids))
    labels[prompt_len:] = input_ids[prompt_len:]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


In [23]:
tmp = preprocess(train_ds_raw[0])

In [24]:
debug("Formatted prompt example", format_prompt(train_ds_raw[0]["input"]))


 DEBUG: Formatted prompt example
### Instruction:
Make a prompt that explains a machine learning concept in simple words.

### Response:



In [25]:
debug("Full text example", format_prompt(train_ds_raw[0]["input"]) + train_ds_raw[0]["output"])


 DEBUG: Full text example
### Instruction:
Make a prompt that explains a machine learning concept in simple words.

### Response:
Explain the concept in simple words. Make sure it is under 120 words. Keep it clear.


In [26]:
debug("input_ids[:40]", tmp["input_ids"][:40])


 DEBUG: input_ids[:40]
[1, 835, 2799, 4080, 29901, 13, 9984, 263, 9508, 393, 18568, 263, 4933, 6509, 6964, 297, 2560, 3838, 29889, 13, 13, 2277, 29937, 13291, 29901, 13, 9544, 7420, 278, 6964, 297, 2560, 3838, 29889, 8561, 1854, 372, 338, 1090, 29871]


In [27]:
debug("labels[:40]", tmp["labels"][:40])


 DEBUG: labels[:40]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 9544, 7420, 278, 6964, 297, 2560, 3838, 29889, 8561, 1854, 372, 338, 1090, 29871]


In [28]:
train_ds = train_ds_raw.map(preprocess, remove_columns=train_ds_raw.column_names)
test_ds = test_ds_raw.map(preprocess, remove_columns=test_ds_raw.column_names)

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [40]:
@dataclass
class DataCollatorForCausalLMWithLabels:
    tokenizer: AutoTokenizer
    label_pad_token_id: int = -100

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
     
        labels = [f["labels"] for f in features]
        for f in features:
            f.pop("labels")

        batch = self.tokenizer.pad(
            features,
            padding=True,          # pad to longest
            return_tensors="pt",
        )

        max_len = batch["input_ids"].shape[1]
        padded_labels = []
        for lab in labels:
            pad_len = max_len - len(lab)
            if pad_len > 0:
                lab = lab + [self.label_pad_token_id] * pad_len
            else:
                lab = lab[:max_len]  # safety trim
            padded_labels.append(lab)

        batch["labels"] = torch.tensor(padded_labels, dtype=torch.long)

        return batch


In [41]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)

In [42]:
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()



trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.5701


In [43]:
args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    logging_steps=5,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    fp16=torch.cuda.is_available(),
    report_to="none",
    save_total_limit=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


In [44]:
collator = DataCollatorForCausalLMWithLabels(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=collator,
)


  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [45]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.8723,1.955878
2,2.0057,1.514791
3,1.2828,1.425319


TrainOutput(global_step=21, training_loss=1.8826042129879905, metrics={'train_runtime': 31.8787, 'train_samples_per_second': 4.799, 'train_steps_per_second': 0.659, 'total_flos': 47542586191872.0, 'train_loss': 1.8826042129879905, 'epoch': 3.0})

In [46]:
model.save_pretrained(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)

('./prompt_lora_ckpt/tokenizer_config.json',
 './prompt_lora_ckpt/special_tokens_map.json',
 './prompt_lora_ckpt/chat_template.jinja',
 './prompt_lora_ckpt/tokenizer.json')

In [48]:
vanilla_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME)
vanilla_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rot

In [49]:
trained_base = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME)
trained_model = PeftModel.from_pretrained(trained_base, OUT_DIR)
trained_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
 

In [52]:
@torch.no_grad()
def generate_for_input(model, inp: str):
    prompt = format_prompt(inp)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LEN
    )

    out = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False, 
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    if "### Response:" in decoded:
        decoded = decoded.split("### Response:")[-1].strip()
    return decoded



In [53]:
results = []

for i, ex in enumerate(test_ds_raw):
    inp = ex["input"]
    gold = ex["output"]

    vanilla_pred = generate_for_input(vanilla_model, inp)
    trained_pred = generate_for_input(trained_model, inp)

    results.append({
        "input": inp,
        "gold": gold,
        "vanilla_pred": vanilla_pred,
        "trained_pred": trained_pred
    })

    print(f"\n[Test {i+1}]")
    print("INPUT:", inp)
    print("GOLD:", gold)
    print("VANILLA:", vanilla_pred)
    print("TRAINED:", trained_pred)
    print("-" * 80)


[Test 1]
INPUT: Make a prompt that checks if a sentence is too long.
GOLD: Check the sentence length. Explain if it is too long. Make sure to rewrite it shorter.
VANILLA: ```
Please enter a sentence:
I love you more than anything in this world.
```

### Output:
```
The sentence you entered is too long. Please enter a sentence that is less than 100 characters long. ```

### Explanation:
The prompt checks if the entered sentence is longer than 100 characters. If it
TRAINED: Check if the sentence is too long. Explain why. Make sure to use bullet points. Make sure to use bullet points. Make sure to use bullet points. Make sure to use bullet points. Make sure to use bullet points. Make sure to use bullet points. Make sure to use bullet points. Make sure to use bullet points. Make sure to use bullet points. Make sure to use bullet
--------------------------------------------------------------------------------

[Test 2]
INPUT: Make a prompt that summarizes a model architecture.
GOLD: Summar

In [54]:
scorer = rouge_scorer_R = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

In [59]:
import numpy as np

In [57]:

def rouge_f1(pred, ref):
    s = scorer.score(ref, pred)
    return s["rouge1"].fmeasure, s["rougeL"].fmeasure


In [60]:

van_r1s, van_rls = [], []
trn_r1s, trn_rls = [], []

for r in results:
    vr1, vrl = rouge_f1(r["vanilla_pred"], r["gold"])
    tr1, trl = rouge_f1(r["trained_pred"], r["gold"])

    van_r1s.append(vr1); van_rls.append(vrl)
    trn_r1s.append(tr1); trn_rls.append(trl)

# overall mean (you can also print std if you want)
print("\n===== OVERALL ROUGE (mean over test set) =====")
print(f"Vanilla  ROUGE-1: {np.mean(van_r1s):.4f}")
print(f"Vanilla  ROUGE-L: {np.mean(van_rls):.4f}")
print(f"Trained  ROUGE-1: {np.mean(trn_r1s):.4f}")
print(f"Trained  ROUGE-L: {np.mean(trn_rls):.4f}")


===== OVERALL ROUGE (mean over test set) =====
Vanilla  ROUGE-1: 0.1562
Vanilla  ROUGE-L: 0.1347
Trained  ROUGE-1: 0.1990
Trained  ROUGE-L: 0.1872


In [61]:
with open("test_generations_with_rouge.jsonl", "w", encoding="utf-8") as f:
    for i, r in enumerate(results):
        r["vanilla_rouge1"] = van_r1s[i]
        r["vanilla_rougeL"] = van_rls[i]
        r["trained_rouge1"] = trn_r1s[i]
        r["trained_rougeL"] = trn_rls[i]
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

debug("Saved file", "test_generations_with_rouge.jsonl")
print("DONE.")


 DEBUG: Saved file
test_generations_with_rouge.jsonl
DONE.
