# 3.0 - Evaluate models on the test set

In [None]:
import os
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np

# 1. Configuration
#
# MODEL_CONFIG is handed to the result store/load helpers later in the
# notebook, so every setting of the run lives in this single dict.
MODEL_CONFIG = {
    # Hugging Face model id of the checkpoint under evaluation.
    "base_model": "Qwen/Qwen3-1.7B",
    # True if fine-tuning, False if base_model
    "finetuning": False,
    # True if DoRA, False if LoRA
    "use_dora": True,
    "n_epochs": 3,
    "lora_r": 8,
    "lora_alpha": 16,
    "lr": 2e-4,
    "batch_size": 4,
    "lora_projections": ["q_proj", "v_proj"],
    "lora_dropout": 0.05,
    "new_tokens_path": None,
    "new_tokens_init": "random",
    "new_tokens_train": True,
}

# Both test sets sit in the same folder, resolved relative to the directory
# the notebook is executed from.
TESTDATA_DIR = os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset')
TESTDATA_MCQ_FILE = os.path.join(TESTDATA_DIR, 'orange_qa_MCQ_test.jsonl')
TESTDATA_MCQ_CON_FILE = os.path.join(TESTDATA_DIR, 'orange_qa_MCQ-con_test.jsonl')

# Short model name = the part of the model id after the last "/".
model_name = MODEL_CONFIG['base_model'].rsplit('/', 1)[-1]
MODEL_CONFIG.update(model_name=model_name)
print("Model Configuration:", model_name)

# 2. Load Model & Tokenizer
base_model_id = MODEL_CONFIG['base_model']
print(f"Loading {MODEL_CONFIG['base_model']}...")

# Tokenizer and weights are fetched from the same model id.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# NOTE(review): do_sample is a generation setting; passing it to
# from_pretrained may not change the generation config that is actually used
# at generation time (e.g. when the checkpoint ships its own
# generation_config.json). Confirm evaluate_model decodes deterministically.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    dtype=torch.float16,   # half precision to save memory
    device_map="auto",     # auto-selects GPU or CPU
    do_sample=False,
)

Model Configuration: Qwen3-1.7B
Loading Qwen/Qwen3-1.7B...


Fetching 2 files: 100%|██████████| 2/2 [00:07<00:00,  3.96s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]


In [None]:
# 3. Load Data
def _read_json_or_jsonl(path):
    """Load a dataset file that is either one JSON document or JSON Lines.

    The test files carry a ``.jsonl`` extension, but ``json.load`` can only
    parse a single JSON document and fails with "Extra data" on a file that
    holds one record per line. The whole-document parse is tried first so
    files that loaded before still load to the same value; only when that
    fails is the content parsed line by line.

    Args:
        path: Path of the file to read (decoded as UTF-8).

    Returns:
        The parsed JSON document, or a list with one parsed object per
        non-blank line when the file is in JSON Lines format.

    Raises:
        ValueError: If the file contains no records at all.
        json.JSONDecodeError: If a line of a JSON Lines file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Not a single document: treat every non-blank line as one record.
        records = [json.loads(line) for line in text.splitlines() if line.strip()]

    if not records:
        # An empty test set would make the accuracy computation meaningless.
        raise ValueError(f"No records found in {path}")
    return records


test_mcq_dataset = _read_json_or_jsonl(TESTDATA_MCQ_FILE)
test_mcq_con_dataset = _read_json_or_jsonl(TESTDATA_MCQ_CON_FILE)

In [None]:
import sys

# Put the parent of the working directory on the import path so that the
# `src` package can be imported from this notebook.
project_root = os.path.join(os.getcwd(), '..')
sys.path.append(project_root)

from src.evaluation_function import evaluate_model
from src.store_load_results import store_results, load_results

# Each call returns a pair that is unpacked below as (accuracy, standard error):
# first for the MCQ test set, then for the MCQ-con test set.
(acc_mcq, SE_mcq) = evaluate_model(model, tokenizer, test_mcq_dataset)
(acc_mcq_con, SE_mcq_con) = evaluate_model(model, tokenizer, test_mcq_con_dataset)

In [None]:
# Store the four metrics together with the run configuration.
# NOTE(review): "accuracy_con" lacks the "MCQ_" infix used by
# "standard_error_MCQ_con"; the key is kept as-is because previously stored
# results use it — confirm before renaming.
store_results(
    {
        "accuracy_MCQ": acc_mcq,
        "standard_error_MCQ": SE_mcq,
        "accuracy_con": acc_mcq_con,
        "standard_error_MCQ_con": SE_mcq_con,
    },
    MODEL_CONFIG,
)

In [None]:
# Look up results by MODEL_CONFIG (presumably the record written by
# store_results — confirm in src.store_load_results). As the last expression
# of the cell, the returned value is displayed as the cell output.
load_results(MODEL_CONFIG)

{'base_model': 'Qwen/Qwen3-1.7B',
 'finetuning': False,
 'use_dora': True,
 'lora_r': 8,
 'lora_alpha': 16,
 'lr': 0.0002,
 'batch_size': 4,
 'lora_projections': ['q_proj', 'v_proj'],
 'model_name': 'Qwen3-1.7B',
 'accuracy_MCQ': 67.0,
 'standard_error_MCQ': 3.3249060137092594,
 'accuracy_con': 25.5,
 'standard_error_MCQ_con': 3.0820042180373473}