In [None]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from tqdm import tqdm
import evaluate
from transformers import StoppingCriteria, StoppingCriteriaList
from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training

import warnings
warnings.filterwarnings("ignore")

In [2]:
model_name = "EleutherAI/pythia-410m"

In [10]:
class StopOnKeywords(StoppingCriteria):
    def __init__(self, tokenizer, stop_phrases, input_len):
        super().__init__()
        self.tokenizer = tokenizer
        self.stop_phrases = stop_phrases
        self.input_len = input_len

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        generated_text = self.tokenizer.decode(input_ids[0][self.input_len:], skip_special_tokens=True)
        for stop_phrase in self.stop_phrases:
            if stop_phrase in generated_text:
                return True
        return False

stop_phrases = ["### Response:", "\n\n"]

In [4]:
jsonl_path = "data/datascience_1000_multistep.jsonl"
data = []
with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

# Use a small sample for quick evaluation
data = data[:100]

# Base Model

In [7]:
#Load model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

base_model = prepare_model_for_kbit_training(base_model)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

In [11]:
predictions = []
references = []

for sample in tqdm(data):
    instruction = sample["instruction"]
    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
    reference = sample["output"]

    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
    input_len = inputs["input_ids"].shape[1]
    stopping_criteria = StoppingCriteriaList([
    StopOnKeywords(tokenizer, stop_phrases, input_len)
    ])
    with torch.no_grad():
        outputs = base_model.generate(**inputs, max_new_tokens=192, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id, stopping_criteria=stopping_criteria)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Optional: Strip input prompt from decoded output if it's included
    response = response.replace(prompt, "").strip()

    predictions.append(response)
    references.append(reference)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [08:33<00:00,  5.13s/it]


In [13]:
rouge = evaluate.load("rouge")
rouge_results = rouge.compute(predictions=predictions, references=references)

In [14]:
print("\n=== ROUGE Scores ===")
for k, v in rouge_results.items():
    print(f"{k}: {v:.4f}")


=== ROUGE Scores ===
rouge1: 0.0384
rouge2: 0.0097
rougeL: 0.0345
rougeLsum: 0.0363


# FINE TUNED MODEL

In [21]:
model_path = "pythia-lora-final/"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")



GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): lora.Linear(
            (base_layer): Linear(in_features=1024, out_features=3072, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1024, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=3072, bias=False)


In [22]:
model.eval()
predictions = []
references = []

for sample in tqdm(data):
    instruction = sample["instruction"]
    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
    reference = sample["output"]

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_len = inputs["input_ids"].shape[1]
    stopping_criteria = StoppingCriteriaList([
    StopOnKeywords(tokenizer, stop_phrases, input_len)
    ])
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=192, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id, stopping_criteria=stopping_criteria)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Optional: Strip input prompt from decoded output if it's included
    response = response.replace(prompt, "").strip()

    predictions.append(response)
    references.append(reference)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [21:22<00:00, 12.82s/it]


In [23]:
rouge = evaluate.load("rouge")
rouge_results = rouge.compute(predictions=predictions, references=references)

In [24]:
print("\n=== ROUGE Scores ===")
for k, v in rouge_results.items():
    print(f"{k}: {v:.4f}")


=== ROUGE Scores ===
rouge1: 0.7734
rouge2: 0.6597
rougeL: 0.7448
rougeLsum: 0.7697


# Fine Tuned V2

In [25]:
model_path = "pythia-lora-V2/"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
model.eval()

predictions = []
references = []

for sample in tqdm(data):
    instruction = sample["instruction"]
    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
    reference = sample["output"]

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_len = inputs["input_ids"].shape[1]
    stopping_criteria = StoppingCriteriaList([
    StopOnKeywords(tokenizer, stop_phrases, input_len)
    ])
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=192, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id, stopping_criteria=stopping_criteria)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Optional: Strip input prompt from decoded output if it's included
    response = response.replace(prompt, "").strip()

    predictions.append(response)
    references.append(reference)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [20:46<00:00, 12.46s/it]


In [26]:
rouge = evaluate.load("rouge")
rouge_results = rouge.compute(predictions=predictions, references=references)

In [27]:
print("\n=== ROUGE Scores ===")
for k, v in rouge_results.items():
    print(f"{k}: {v:.4f}")


=== ROUGE Scores ===
rouge1: 0.8757
rouge2: 0.8170
rougeL: 0.8608
rougeLsum: 0.8749
