# 5.0 - Evaluate finetuned models

In [1]:
import os
import json
import sys
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))
from evaluation_function import evaluate_model

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub id of the base checkpoint and the suffix identifying the PEFT variant to evaluate.
BASE_MODEL = "Qwen/Qwen3-0.6B"
PEFT_SUFFIX = "LoRA_qkvo"

# The adapter directory lives under ../models and is named after the last
# component of the base model id plus the PEFT suffix.
MODEL_ID = os.path.join(
    os.getcwd(),
    '..',
    'models',
    f"orange_qa_finetuned_{BASE_MODEL.split('/')[-1]}_{PEFT_SUFFIX}",
)

# Load the base causal LM in float16 first ...
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# ... then wrap it with the PEFT weights stored in MODEL_ID.
model = PeftModel.from_pretrained(base_model, MODEL_ID)

# The tokenizer is read from the same directory as the adapter; left padding
# is requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    padding_side="left",
    trust_remote_code=True,
)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
# Location of the held-out test split, relative to the notebook's working directory.
TESTDATA_FILE = os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_test.jsonl')

# Read with an explicit encoding so loading does not depend on the platform default.
with open(TESTDATA_FILE, "r", encoding="utf-8") as f:
    raw_text = f.read()

# The file carries a .jsonl extension but was originally parsed as one JSON
# document. Accept both layouts: try a single document first and fall back to
# one JSON object per non-empty line (JSON Lines), which json.load() rejects.
try:
    test_dataset = json.loads(raw_text)
except json.JSONDecodeError:
    test_dataset = [json.loads(line) for line in raw_text.splitlines() if line.strip()]

# Last expression of the cell: its return value is displayed as the cell output.
evaluate_model(model, tokenizer, test_dataset)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


(np.float64(66.0), np.float64(3.349626844888845))

## Manual QA

In [4]:
from transformers import pipeline

# Text-generation pipeline built on the model and tokenizer loaded in the cells above
qa_pipeline = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
)

def manual_qa(question):
    """
    Ask the model a free-form question through ``qa_pipeline`` and print the result.

    The question is wrapped in a two-message conversation (a generic system
    prompt plus the user question suffixed with " /no_think") and decoded
    greedily with up to 1000 new tokens.

    Args:
        question: The question text to send to the model.

    Returns:
        The generated answer with surrounding whitespace removed. When the
        output contains "</think>", only the text after its first occurrence
        is returned.
    """
    # Create the conversation format expected by the model
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question + " /no_think"}
    ]

    # Greedy decoding. temperature/top_p/top_k only take effect when sampling,
    # so passing them together with do_sample=False changes nothing and merely
    # triggers "generation flags are not valid" warnings; they are left out.
    response = qa_pipeline(
        messages,
        max_new_tokens=1000,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        return_full_text=False
    )

    # Extract the generated text
    generated_text = response[0]['generated_text'].strip()

    # Separate the reasoning from the final answer. partition() always yields
    # three parts, so a missing or repeated "</think>" can neither leave
    # `thinking` undefined nor break tuple unpacking.
    thinking, closing_tag, answer = generated_text.partition("</think>")
    if closing_tag:
        thinking = thinking.replace("<think>", "").strip()
        answer = answer.strip()
    else:
        thinking, answer = "", generated_text

    print(f"Question: {question}")
    if thinking:
        print(f"Thinking: {thinking}")
    print(f"Answer: {answer}")
    print("-" * 50)

    return answer

def manual_mcq(test_data, row_id):
    """
    Run the model on one multiple-choice test item and print prediction vs. reference.

    Args:
        test_data: Indexable collection of test items. Each item is a mapping
            with a "messages" list holding at least three entries that each
            have "role" and "content" keys; entries 0 and 1 form the prompt
            and the content of entry 2 is printed as the correct answer.
        row_id: Index of the item in ``test_data`` to evaluate.

    Returns:
        The decoded model output with surrounding whitespace removed. When the
        output contains "</think>", only the text after its first occurrence
        is returned.
    """
    item = test_data[row_id]
    prompt_messages = item["messages"][:2]
    reference = item["messages"][2]["content"]

    system_user_messages = [
        {"role": prompt_messages[0]["role"], "content": prompt_messages[0]["content"]},
        {"role": prompt_messages[1]["role"], "content": prompt_messages[1]["content"] + "\n /no_think"}
    ]

    # `enable_thinking` is the chat-template switch for the reasoning block.
    # The previous spelling `enable_tracking` is not a template variable, so
    # it was silently ignored.
    text = tokenizer.apply_chat_template(
        system_user_messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Greedy decoding without gradient tracking.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation; keep only the newly generated tokens.
    input_length = inputs["input_ids"].shape[1]
    generated_token_ids = outputs[0][input_length:]
    decoded_text = tokenizer.decode(generated_token_ids, skip_special_tokens=True).strip()

    # Separate the reasoning from the final answer. partition() always yields
    # three parts, so a repeated "</think>" cannot break tuple unpacking.
    thinking, closing_tag, answer = decoded_text.partition("</think>")
    if closing_tag:
        thinking = thinking.replace("<think>", "").strip()
        answer = answer.strip()
    else:
        thinking, answer = "", decoded_text

    print(f"Question: {item['messages'][1]['content']}")
    if thinking:
        print(f"Thinking: {thinking}")
    print(f"Prediction: {answer}")
    print(f"Correct: {reference}")
    print("-" * 50)

    return answer

# Manual spot check: print a banner, then run a single test item through the model
print("Manual Q&A Session", "-" * 50, sep="\n")

# Free-form alternative to the multiple-choice check below:
#manual_qa("Name a few widgets from Orange Data Mining tool.")
manual_mcq(test_dataset, 13)

Device set to use mps


Manual Q&A Session
--------------------------------------------------
Question: Answer the following question based on your knowledge of the Orange Data Mining software.
Make sure you answer the question with a single letter corresponding to the correct answer.

Question: Is the Split widget typically used with survey data?

Answers:
A: Yes.
B: Only for image data.
C: Only for time series.
D: No.

Prediction: A
Correct: A
--------------------------------------------------


'A'