In [None]:
from datasets import load_dataset

# Read the evaluation set from a local CSV file. Requesting the "train" split
# directly returns the dataset itself instead of a dict of splits.
eval_dataset = load_dataset("csv", data_files="data/eval_dataset.csv", split="train")

from ast import literal_eval

def parse_prompt(example):
    """Decode the stringified ``prompt`` field of one dataset row.

    The CSV stores each prompt as the text of a Python literal (expected to be
    a list of chat-message dicts). The field is replaced in place with the
    decoded value.

    Args:
        example: Mapping for a single row; must contain a ``"prompt"`` key.

    Returns:
        The same ``example`` object, with ``example["prompt"]`` decoded. Rows
        whose prompt cannot be decoded get an empty list. A prompt that is
        already a list is returned untouched, so applying this function twice
        does not erase previously decoded prompts.
    """
    raw_prompt = example["prompt"]
    if isinstance(raw_prompt, list):
        # Already decoded (e.g. the map was run twice); keep it as is.
        return example
    try:
        example["prompt"] = literal_eval(raw_prompt)  # str -> list[dict]
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the failure modes literal_eval documents for bad input are
        # treated as "unparseable"; anything else (e.g. KeyboardInterrupt)
        # propagates instead of being silently swallowed.
        example["prompt"] = []
    return example

# Run parse_prompt over the dataset so each row's "prompt" is decoded from its
# string form; rows that fail to decode end up with an empty list.
eval_dataset = eval_dataset.map(parse_prompt)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Location of the base checkpoint (placeholder). It is reused for the tokenizer
# here and for the vLLM engine created further down in this cell.
base_model_name = "<MODEL_PATH>"

# Only the tokenizer is loaded through transformers; it is used to render chat
# prompts into plain prompt strings before generation.
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# vLLM engine for the base checkpoint. enable_lora=True is set so the same
# engine can later generate with a LoRA adapter attached via lora_request.
base_model = LLM(model=base_model_name, enable_lora=True)


In [None]:
# Decoding settings shared by the base and the LoRA generation runs:
# temperature 0 (greedy decoding), at most 6144 generated tokens, fixed seed.
sampling_params = SamplingParams(temperature=0, max_tokens=6144, seed=42)


In [None]:
# Dataset actually evaluated; currently the full evaluation set.
sampled_dataset = eval_dataset

# Render every chat-style prompt into a single prompt string, ending with the
# generation prompt. Order is preserved, so input_list[i] belongs to
# sampled_dataset[i]; the evaluation loops rely on that alignment.
input_list = [
    base_tokenizer.apply_chat_template(
        example["prompt"], tokenize=False, add_generation_prompt=True
    )
    for example in sampled_dataset
]

In [None]:
# Show the first rendered prompt as a quick sanity check of the chat template.
input_list[0]

In [None]:
import re

# Captures the text between "\boxed{" and the first following "}" (non-greedy),
# so content containing nested braces is cut at the first closing brace.
BOXED_RE  = re.compile(r"\\boxed\{(.*?)\}")
# Matches "Option", whitespace, then a run of digits (case-insensitive);
# group 1 is the digits.
OPTION_RE = re.compile(r"Option\s+([0-9]+)", re.I)

def extract_boxed(text: str) -> str:
    """
    Return the stripped contents of the first complete \\boxed{...} in text.

    Braces are matched by depth, so nested groups such as
    \\boxed{\\frac{1}{2}} are returned whole rather than being cut at the
    first closing brace. If a \\boxed{ is never closed, the next occurrence
    is tried. Returns '' when no complete box exists.
    """
    marker = "\\boxed{"
    start = text.find(marker)
    while start != -1:
        body_start = start + len(marker)
        depth = 1  # the opening brace of the marker itself
        for idx in range(body_start, len(text)):
            ch = text[idx]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return text[body_start:idx].strip()
        # This box was never closed; look for a later, well-formed one.
        start = text.find(marker, body_start)
    return ""


def extract_unboxed(text: str) -> str:
    """
    Fallback extractor for responses without a boxed answer.

    Searches for the first 'Option <digits>' mention and returns it
    normalized as 'Option <digits>'. Returns '' when there is none.
    """
    hit = OPTION_RE.search(text)
    if hit is None:
        return ""
    return "Option " + hit.group(1)

def same_option(gold: str, pred: str) -> bool:
    """
    Tell whether gold and pred refer to the same option number.

    Both strings are searched for their first 'Option <digits>' mention.
    Returns True only if both contain one and the digit strings are equal;
    always returns a real bool (never None or a match object).
    """
    gold_match = OPTION_RE.search(gold)
    pred_match = OPTION_RE.search(pred)
    if gold_match is None or pred_match is None:
        return False
    return gold_match.group(1) == pred_match.group(1)

def is_correct(answer, resp):
    """
    Decide whether a model response matches the reference answer.

    The predicted answer is the first boxed expression in resp, or, when no
    boxed content is found, the first 'Option <n>' mention. It counts as
    correct if it equals the stripped reference exactly or names the same
    option number.

    Args:
        answer: Reference answer; non-string values are converted with str(),
            and None is treated as an empty reference.
        resp: Generated response text.

    Returns:
        True if the extracted prediction matches, False otherwise. A response
        with no extractable answer is never correct, even if the reference
        is empty.
    """
    gold = "" if answer is None else str(answer).strip()

    # Boxed answer takes priority; the bare "Option X" scan is only a fallback.
    parsed = extract_boxed(resp) or extract_unboxed(resp)
    if not parsed:
        return False
    return bool(parsed == gold or same_option(gold, parsed))


In [None]:
from tqdm import tqdm
import json

base_results = []

# Set this to a previously saved results JSON to reuse it instead of
# regenerating with the base model.
prev_base_result_path = ""
if prev_base_result_path:
    with open(prev_base_result_path, "r", encoding="utf-8") as f:
        base_results = json.load(f)
    # Later cells pair these results with the LoRA results by position and
    # report accuracy against len(sampled_dataset), so a size mismatch would
    # silently skew the comparison.
    if len(base_results) != len(sampled_dataset):
        print(
            f"Warning: loaded {len(base_results)} cached base results, "
            f"but the dataset has {len(sampled_dataset)} examples."
        )
else:
    print("Evaluating base model...")

    base_outputs = base_model.generate(input_list, sampling_params)

    # Output i is scored against the answer of dataset row i.
    for i, output in enumerate(base_outputs):
        answer = sampled_dataset[i]["answer"]
        base_response = output.outputs[0].text

        base_results.append({
            "answer": answer,
            "correct": is_correct(answer, base_response),
            "prompt": output.prompt,
            "response": base_response,
        })

    # Persist the base results so a later run can load them via
    # prev_base_result_path.
    with open("<RESULT_DIR>", "w", encoding="utf-8") as f:
        json.dump(base_results, f, ensure_ascii=False, indent=4)
        



In [None]:
from peft import PeftModel, PeftConfig
# Location of the LoRA adapter (placeholder); handed to vLLM through a
# LoRARequest in the LoRA evaluation cell.
lora_model_name = "<LORA_MATH>"  # PEFT adapter

In [None]:
print("Evaluating LoRA model...")

lora_results = []

# Generate with the adapter attached to the shared engine. The adapter id must
# be a positive integer: vLLM rejects ids below 1, so 0 cannot be used here.
lora_request = LoRARequest("<LORA_NAME>", 1, lora_model_name)
lora_outputs = base_model.generate(
    input_list,
    sampling_params,
    lora_request=lora_request,
)

# Output i is scored against the answer of dataset row i.
for i, output in enumerate(lora_outputs):
    answer = sampled_dataset[i]["answer"]
    lora_response = output.outputs[0].text

    lora_results.append({
        "answer": answer,
        "correct": is_correct(answer, lora_response),
        "prompt": output.prompt,
        "response": lora_response,
    })
    

In [None]:
# Number of correct answers per model (True counts as 1 when summed).
correct_base = sum(entry["correct"] for entry in base_results)
correct_lora = sum(entry["correct"] for entry in lora_results)

# Pair the two result lists by position; answer and prompt are taken from the
# base entry. zip stops at the shorter list.
comparison_results = [
    {
        "answer": base_res["answer"],
        "base_correct": base_res["correct"],
        "lora_correct": lora_res["correct"],
        "prompt": base_res["prompt"],
        "base_response": base_res["response"],
        "lora_response": lora_res["response"],
    }
    for base_res, lora_res in zip(base_results, lora_results)
]

import json
# NOTE(review): the base-results cell writes to the same placeholder path;
# confirm the real paths differ, otherwise this overwrites that file.
with open("<RESULT_DIR>", "w", encoding="utf-8") as f:
    json.dump(comparison_results, f, ensure_ascii=False, indent=4)

# Report correct counts out of the dataset size.
print(f"Base model accuracy: {correct_base}/{len(sampled_dataset)}")
print(f"LoRA model accuracy: {correct_lora}/{len(sampled_dataset)}")