# Evaluation


In [57]:
import os
import json
from tqdm import tqdm as notebook_tqdm
from datasets import load_dataset
from unsloth import FastLanguageModel
import torch
import pandas as pd
from openai import OpenAI
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

import multiprocessing
# Thread count handed to llama.cpp further down: one per CPU reported by the OS.
n_threads = multiprocessing.cpu_count()

# OpenAI chat model used to grade every candidate answer.
JUDGE_MODEL = "gpt-4o-mini"

# Fail fast, before any model is loaded, when the judge has no credentials.
# A variable that is set but empty is treated the same as a missing one.
if os.getenv("OPENAI_API_KEY", "") == "":
    raise ValueError("OPENAI_API_KEY is not set")

# No key is passed explicitly; the client reads OPENAI_API_KEY from the environment.
openai_client = OpenAI()

# Registry of the models to evaluate, keyed by a short label used in logs and results.
# Each entry provides:
#   local_path - GGUF file to prefer when it exists on disk ("" means "no local copy")
#   hf_repo    - Hugging Face repository used as a fallback
#   hf_file    - GGUF file name inside that repository
#   revision   - branch/tag/commit passed to the download (None = repository default)
#
# Local paths are relative to the notebook's working directory. They previously
# started with "..models/" (missing slash), which could never match a file.
# NOTE(review): assumes the exported GGUFs live in a `models/` directory one
# level above the notebook - confirm against the repository layout.
MODEL_CONFIGS = {
    "base_1b": {
        "local_path": "",
        "hf_repo": "unsloth/Llama-3.2-1B-Instruct-GGUF",
        "hf_file": "Llama-3.2-1B-Instruct-Q4_K_M.gguf",
        "revision": None,
    },
    "base_3b": {
        "local_path": "",
        "hf_repo": "unsloth/Llama-3.2-3B-Instruct-GGUF",
        "hf_file": "Llama-3.2-3B-Instruct-Q4_K_M.gguf",
        "revision": None,
    },
    "1b_qlora": {
        "local_path": "../models/gguf_1B_QLORA/Llama-3.2-1B-Instruct.Q4_K_M.gguf",
        "hf_repo": "jacobbista/llama3-1b-finetome",
        "hf_file": "Llama-3.2-1B-Instruct.Q4_K_M.gguf",
        "revision": "1B_QLoRA_N1000",
    },
    "1b_lora": {
        "local_path": "../models/gguf_1B_LORA/Llama-3.2-1B-Instruct.Q4_K_M.gguf",
        "hf_repo": "jacobbista/llama3-1b-finetome",
        "hf_file": "Llama-3.2-1B-Instruct.Q4_K_M.gguf",
        "revision": "1B_LoRA_N1000",
    },
    "3b_qlora": {
        "local_path": "../models/gguf_3B_QLORA/Llama-3.2-3B-Instruct.Q4_K_M.gguf",
        "hf_repo": "jacobbista/llama3-3b-finetome",
        "hf_file": "Llama-3.2-3B-Instruct.Q4_K_M.gguf",
        "revision": "3B_QLoRA_N1000",
    },
}

# Context window (in tokens) given to every loaded model.
MAX_SEQ_LENGTH = 2048
# Upper bound on the number of tokens generated per answer.
MAX_NEW_TOKENS = 256


## Build Eval set


In [58]:
# Load FineTome
raw = load_dataset("mlabonne/FineTome-100k", split="train")

# We used the first 1000 samples for training,
# so take the 20 examples immediately after them.
EVAL_START = 1000
EVAL_SIZE = 20

eval_raw = raw.select(range(EVAL_START, EVAL_START + EVAL_SIZE))


def _first_exchange(convo):
    """Return ``(instruction, reference)`` from the first two usable turns.

    A leading turn tagged ``"from": "system"`` is skipped so that it is not
    mistaken for the user instruction (which would also shift the user
    message into the reference slot). Turns without a ``"from"`` key are
    handled exactly as before: turn 0 is the instruction, turn 1 the reference.
    NOTE(review): the ``from``/``system`` tagging follows the ShareGPT-style
    schema described on the dataset card - confirm against the data.

    Raises:
        IndexError: fewer than two usable turns.
        KeyError: a turn has no ``"value"`` field.
    """
    turns = list(convo)
    if turns and turns[0].get("from") == "system":
        turns = turns[1:]
    return turns[0]["value"], turns[1]["value"]


eval_examples = []
for idx, convo in enumerate(eval_raw["conversations"]):
    try:
        user_msg, asst_msg = _first_exchange(convo)
    except (IndexError, KeyError):
        # Malformed or single-turn conversation: nothing to evaluate against.
        continue

    eval_examples.append({
        # idx is the position inside the eval slice, so ids keep gaps when a
        # conversation is skipped.
        "id": f"E{idx}",
        "instruction": user_msg,
        "reference": asst_msg,
    })


def trunc(d, n=80):
    """Return a copy of dict `d` whose string values longer than `n` characters
    are cut to `n` characters followed by '...'; other values are kept as-is."""
    return {k: (v[:n]+'...' if isinstance(v, str) and len(v)>n else v) for k,v in d.items()}


# An empty eval set would make every later average meaningless; stop here instead.
assert eval_examples, "no usable conversations in the selected FineTome slice"

len(eval_examples), trunc(eval_examples[0])

(20,
 {'id': 'E0',
  'instruction': 'How do astronomers measure the distance to stars within our galaxy?',
  'reference': 'Stellar parallax is a geometric method that relies on the principle of triangula...'})

## Helper functions


### Load GGUF model


In [59]:
def load_gguf_model(model_key: str) -> Llama:
    """Load the GGUF model registered under `model_key` for CPU inference.

    The entry's ``local_path`` is used when it names an existing file;
    otherwise the file is fetched with ``hf_hub_download`` using the entry's
    ``hf_repo``, ``hf_file`` and ``revision``.

    Args:
        model_key: key into ``MODEL_CONFIGS``.

    Returns:
        A ``Llama`` instance with an ``MAX_SEQ_LENGTH``-token context, no GPU
        layers, ``n_threads`` threads and a fixed seed.

    Raises:
        KeyError: `model_key` is not present in ``MODEL_CONFIGS``.
    """
    cfg = MODEL_CONFIGS[model_key]

    # Prefer a local copy. The path must be non-empty and point at a regular
    # file: a missing/None entry or a directory falls through to the download
    # instead of failing later inside the loader.
    local_path = cfg.get("local_path")
    if local_path and os.path.isfile(local_path):
        model_path = local_path
        print(f"[{model_key}] âœ“ Using local GGUF: {model_path}")
    else:
        # Fallback to HF
        print(f"[{model_key}] â†ª Local GGUF not found, downloading from HF...")
        model_path = hf_hub_download(
            repo_id  = cfg["hf_repo"],
            filename = cfg["hf_file"],
            revision = cfg.get("revision"),
            # tqdm_class=notebook_tqdm, # not present in v0.36.0
        )
        print(f"[{model_key}] âœ“ Downloaded from HF: {model_path}")

    llm = Llama(
        model_path=model_path,
        n_ctx=MAX_SEQ_LENGTH,
        n_gpu_layers=0,            # CPU-only
        n_threads=n_threads,       # threads for generation
        n_threads_batch=n_threads, # threads for prompt processing
        verbose=False,
        seed=47,
    )
    print(f"[{model_key}] Using {n_threads} CPU threads")
    return llm


In [60]:
DEFAULT_PERSONA_PROMPT = "You are a helpful AI assistant."


def build_prompt(instruction: str, persona_prompt: str = DEFAULT_PERSONA_PROMPT) -> str:
    """Assemble the raw chat prompt for a single-turn request.

    The result is a system turn holding `persona_prompt`, a user turn holding
    `instruction`, and an open assistant header that the model continues from.
    Each turn is framed with the ``<|start_header_id|>``, ``<|end_header_id|>``
    and ``<|eot_id|>`` markers.
    """
    system_turn = f"<|start_header_id|>system<|end_header_id|>\n\n{persona_prompt}<|eot_id|>"
    user_turn = f"<|start_header_id|>user<|end_header_id|>\n\n{instruction}<|eot_id|>"
    # Left unterminated on purpose: generation starts right after this header.
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    return "".join((system_turn, user_turn, assistant_header))


def generate_answer(llm: Llama, instruction: str, max_new_tokens: int = MAX_NEW_TOKENS) -> str:
    """Generate the model's answer to `instruction`.

    The instruction is wrapped with `build_prompt` (default persona) and
    passed to `llm`; generation stops at ``<|eot_id|>`` or after
    `max_new_tokens` tokens. Returns the text of the first choice with
    surrounding whitespace stripped.
    """
    sampling = dict(
        max_tokens=max_new_tokens,
        stop=["<|eot_id|>"],
        temperature=0.0,  # determinism
        top_p=0.9,
        echo=False,
    )
    completion = llm(build_prompt(instruction), **sampling)

    # Called this way, llama_cpp returns the generated text under choices[i]["text"].
    first_choice = completion["choices"][0]
    return first_choice["text"].strip()


## LLM-as-a-judge


In [61]:
def judge_answer(instruction: str, reference: str, candidate: str) -> dict:
    """Ask the judge model to grade `candidate` against `reference`.

    Sends the grading rubric as the system message and the instruction,
    reference and candidate as the user message to ``JUDGE_MODEL`` through
    ``openai_client``, requesting a JSON object. Returns the parsed JSON;
    the rubric asks for the fields ``score`` (int 1-5) and ``explanation``.
    """
    rubric = "".join([
        "You are an expert evaluator for instruction-following models. ",
        "Given an instruction, a reference answer, and a candidate answer, ",
        "you must output a JSON object with two fields:\n",
        '{ "score": <int 1-5>, "explanation": "<brief reason>" }.\n',
        "Score meanings:\n",
        "1 = totally wrong or irrelevant,\n",
        "3 = partially correct but incomplete or sloppy,\n",
        "5 = very close to or better than the reference.",
    ])

    # The indentation inside this literal is part of the text sent to the judge.
    grading_request = f"""
        Instruction:
        {instruction}

        Reference answer:
        {reference}

        Candidate answer:
        {candidate}
    """

    chat = [
        {"role": "system", "content": rubric},
        {"role": "user", "content": grading_request},
    ]
    completion = openai_client.chat.completions.create(
        model=JUDGE_MODEL,
        messages=chat,
        temperature=0.0,
        response_format={"type": "json_object"},
    )

    verdict_json = completion.choices[0].message.content
    return json.loads(verdict_json)

## Evaluation loop


In [62]:
# One row per (model, example) pair; reset on every run so re-executing the
# cell does not append duplicates.
results = []

TOTAL_EX = len(eval_examples)

# Models are evaluated one at a time so that only one is held in memory.
for model_key in MODEL_CONFIGS:
    print(f"\nðŸš€ Evaluating model: {model_key} ...")
    llm = load_gguf_model(model_key)

    scores = []

    for i, ex in enumerate(eval_examples, start=1):
        instr = ex["instruction"]
        ref   = ex["reference"]

        # "\r" keeps the progress on a single line that is overwritten each step.
        print(f"  â†’ [{i}/{TOTAL_EX}] Testing sample {ex['id']} ...", end="\r")

        cand = generate_answer(llm, instr)
        judge = judge_answer(instr, ref, cand)

        # A judge reply without a "score" field is counted as 0.
        score = judge.get("score", 0)
        scores.append(score)

        results.append({
            "model": model_key,
            "id": ex["id"],
            "instruction": instr,
            "reference": ref,
            "candidate": cand,
            "score": score,
            "explanation": judge.get("explanation", ""),
        })

    # With an empty eval set there is nothing to average; report NaN instead
    # of raising ZeroDivisionError.
    avg_score = sum(scores) / len(scores) if scores else float("nan")
    print(f"  âœ“ Finished {model_key} â€” Avg Score: {avg_score:.2f}")

    del llm  # free RAM



ðŸš€ Evaluating model: base_1b ...
[base_1b] â†ª Local GGUF not found, downloading from HF...
[base_1b] âœ“ Downloaded from HF: /home/jacobbista/.cache/huggingface/hub/models--unsloth--Llama-3.2-1B-Instruct-GGUF/snapshots/b69aef112e9f895e6f98d7ae0949f72ff09aa401/Llama-3.2-1B-Instruct-Q4_K_M.gguf


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


[base_1b] Using 22 CPU threads
  âœ“ Finished base_1b â€” Avg Score: 3.60

ðŸš€ Evaluating model: base_3b ...
[base_3b] â†ª Local GGUF not found, downloading from HF...
[base_3b] âœ“ Downloaded from HF: /home/jacobbista/.cache/huggingface/hub/models--unsloth--Llama-3.2-3B-Instruct-GGUF/snapshots/e7d0997e49c9cb00d88b4c1a6a16aa894b0bbc31/Llama-3.2-3B-Instruct-Q4_K_M.gguf


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


[base_3b] Using 22 CPU threads
  âœ“ Finished base_3b â€” Avg Score: 3.60

ðŸš€ Evaluating model: 1b_qlora ...
[1b_qlora] â†ª Local GGUF not found, downloading from HF...
[1b_qlora] âœ“ Downloaded from HF: /home/jacobbista/.cache/huggingface/hub/models--jacobbista--llama3-1b-finetome/snapshots/64aabacdf31fb5bb5cbb7fd3a7b5960c8ceed172/Llama-3.2-1B-Instruct.Q4_K_M.gguf


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


[1b_qlora] Using 22 CPU threads
  âœ“ Finished 1b_qlora â€” Avg Score: 3.35

ðŸš€ Evaluating model: 1b_lora ...
[1b_lora] â†ª Local GGUF not found, downloading from HF...
[1b_lora] âœ“ Downloaded from HF: /home/jacobbista/.cache/huggingface/hub/models--jacobbista--llama3-1b-finetome/snapshots/2abda01f62312cc4de50aa0a7d00ee42fa4deaad/Llama-3.2-1B-Instruct.Q4_K_M.gguf


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


[1b_lora] Using 22 CPU threads
  âœ“ Finished 1b_lora â€” Avg Score: 3.15

ðŸš€ Evaluating model: 3b_qlora ...
[3b_qlora] â†ª Local GGUF not found, downloading from HF...
[3b_qlora] âœ“ Downloaded from HF: /home/jacobbista/.cache/huggingface/hub/models--jacobbista--llama3-3b-finetome/snapshots/bf32d407ffa24299ba965ece4fdb2e1cb785daad/Llama-3.2-3B-Instruct.Q4_K_M.gguf


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


[3b_qlora] Using 22 CPU threads
  âœ“ Finished 3b_qlora â€” Avg Score: 3.65


### Save Locally


In [63]:
# Persist every per-sample judgement so the scores can be inspected later
# without re-running the evaluation.
RESULTS_CSV = "evaluation_results.csv"

df = pd.DataFrame(data=results)
df.to_csv(RESULTS_CSV, index=False)

# Preview of the first rows.
df.head()

Unnamed: 0,model,id,instruction,reference,candidate,score,explanation
0,base_1b,E0,How do astronomers measure the distance to sta...,Stellar parallax is a geometric method that re...,Measuring the distance to stars within our gal...,4,The candidate answer provides a good overview ...
1,base_1b,E1,Write a program in Python to determine if a gi...,Here's a program in Python that checks if a gi...,**Palindrome Checker Program**\n==============...,3,The candidate answer uses a two-pointer techni...
2,base_1b,E2,How can I write a Python program to find the n...,You can achieve this by following these steps ...,You can use the `count()` method in Python to ...,5,The candidate answer provides a clear and effi...
3,base_1b,E3,How can I reverse a string using Python?,"To reverse a given string in Python, you can u...",You can reverse a string in Python using the `...,5,The candidate answer provides a correct and va...
4,base_1b,E4,Write Python code to solve the task:\nLittle P...,Step 1: The given code contains several unrel...,Here is a Python solution for the problem:\n\n...,5,The candidate answer correctly implements the ...


## Data aggregation


In [64]:
# Mean judge score per model, best model first.
mean_score_by_model = df.groupby("model")["score"].mean()
mean_score_by_model.sort_values(ascending=False)

model
3b_qlora    3.65
base_3b     3.60
base_1b     3.60
1b_qlora    3.35
1b_lora     3.15
Name: score, dtype: float64