# 5.1 - Evaluate finetuned models over each checkpoint (1 for epoch)

In [6]:
import os
import json
import sys
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))
from evaluation_function import evaluate_model
from utils import define_model_name
from store_load_results import store_results, load_results

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)
from peft import PeftModel

In [7]:
TRAINDATA_FILE = os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_train.jsonl')
TESTDATA_MCQ_FILE = os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_MCQ_test.jsonl')
TESTDATA_MCQ_CON_FILE = os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_MCQ-con_test.jsonl')

In [8]:
LORA_PROJECTIONS = "qvko"
PROJECTIONS = {
    "q": "q_proj",
    "k": "k_proj",
    "v": "v_proj",
    "o": "o_proj",
    "g": "gate_proj",
    "d": "down_proj",
    "u": "up_proj"
}
projections = [PROJECTIONS[p] for p in list(LORA_PROJECTIONS)]

MODEL_CONFIG = {
    "base_model": "Qwen/Qwen3-0.6B",
    "finetuning": True,
    "use_dora": False,
    "n_epochs": 20,
    "lora_r": 8,
    "lora_alpha": 16,
    "lr": 0.0001,
    "batch_size": 16,
    "lora_projections": projections,
    "lora_dropout": 0.05,
    "new_tokens_path":  None,
    "new_tokens_init": None,
    "new_tokens_train": None,
    "wandb_project": None,  # wandb project name
}
model_name, OUTPUT_DIR = define_model_name(MODEL_CONFIG)
OUTPUT_DIR = OUTPUT_DIR.replace("notebooks/models", "notebooks/../models")  # adjust for model name

Model Configuration: Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20__


In [9]:
def evaluate_model_on_both_datasets(model_config, output_dir, checkpoint_id:int=3640, epoch_id:int=20):
    MODEL_CONFIG = model_config.copy()
    OUTPUT_DIR = os.path.join(output_dir, f"checkpoint-{checkpoint_id}")
    MODEL_CONFIG['n_epochs'] = epoch_id
    MODEL_CONFIG['model_name'] = model_name.replace("ep20", f"ep20({epoch_id})")

    try:
        results_dir = os.path.join(os.getcwd(), '..', 'results')
        with open(os.path.join(results_dir, f'{MODEL_CONFIG["model_name"]}_results.json'), 'r') as f:
            return json.load(f)
        print("Results already exist:", results)
        return
    except Exception as e:
        print("No existing results found. Proceeding with evaluation.", str(e))

    # 11. Load model and tokenizer
    if MODEL_CONFIG['finetuning']:
        tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
        # Load base model first, then load PEFT adapter
        base_model = AutoModelForCausalLM.from_pretrained(
            MODEL_CONFIG['base_model'], 
            device_map="auto", 
            torch_dtype=torch.float16
        )
        base_model.resize_token_embeddings(len(tokenizer))
        model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
    else:
        model = AutoModelForCausalLM.from_pretrained(MODEL_CONFIG['base_model'], device_map="auto", torch_dtype=torch.float16)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG['base_model'], trust_remote_code=True)

    # 12. Evaluate and store results
    with open(TESTDATA_MCQ_FILE, "r") as f:
        test_mcq_dataset = json.load(f)
    accuracy_mcq, se_mcq = evaluate_model(model, tokenizer, test_mcq_dataset, batch_size=MODEL_CONFIG['batch_size'])

    with open(TESTDATA_MCQ_CON_FILE, "r") as f:
        test_mcq_con_dataset = json.load(f)
    accuracy_mcq_con, se_mcq_con = evaluate_model(model, tokenizer, test_mcq_con_dataset, batch_size=MODEL_CONFIG['batch_size'])
    results = {
        "accuracy_mcq": accuracy_mcq,
        "se_mcq": se_mcq,
        "accuracy_mcq_con": accuracy_mcq_con,
        "se_mcq_con": se_mcq_con,
    }
    print("Evaluation Results:", results)
    results_dir = os.path.join(os.getcwd(), '..', 'results')
    os.makedirs(results_dir, exist_ok=True)

    with open(os.path.join(results_dir, f'{MODEL_CONFIG["model_name"]}_results.json'), 'w') as f:
        json.dump(MODEL_CONFIG | results, f, indent=4)
    print("Results stored successfully.")

In [10]:
checkpoints = {
    i: 182*i for i in range(1, 21)
}
for epoch, checkpoint in checkpoints.items():
    print(f"Evaluating checkpoint {checkpoint} for epoch {epoch}")
    evaluate_model_on_both_datasets(MODEL_CONFIG, OUTPUT_DIR, checkpoint_id=checkpoint, epoch_id=epoch)

Evaluating checkpoint 182 for epoch 1
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(1)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:15<00:00,  1.23s/it]
Evaluating model: 100%|██████████| 13/13 [00:31<00:00,  2.39s/it]


Evaluation Results: {'accuracy_mcq': np.float64(48.0), 'se_mcq': np.float64(3.5327043465311387), 'accuracy_mcq_con': np.float64(16.0), 'se_mcq_con': np.float64(2.592296279363144)}
Results stored successfully.
Evaluating checkpoint 364 for epoch 2
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(2)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:15<00:00,  1.23s/it]
Evaluating model: 100%|██████████| 13/13 [00:38<00:00,  2.96s/it]


Evaluation Results: {'accuracy_mcq': np.float64(62.0), 'se_mcq': np.float64(3.4322004603461025), 'accuracy_mcq_con': np.float64(10.5), 'se_mcq_con': np.float64(2.1676600286945367)}
Results stored successfully.
Evaluating checkpoint 546 for epoch 3
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(3)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:14<00:00,  1.12s/it]
Evaluating model: 100%|██████████| 13/13 [00:30<00:00,  2.35s/it]


Evaluation Results: {'accuracy_mcq': np.float64(55.5), 'se_mcq': np.float64(3.514078826662828), 'accuracy_mcq_con': np.float64(11.5), 'se_mcq_con': np.float64(2.255825791146116)}
Results stored successfully.
Evaluating checkpoint 728 for epoch 4
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(4)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:23<00:00,  1.83s/it]
Evaluating model: 100%|██████████| 13/13 [00:53<00:00,  4.14s/it]


Evaluation Results: {'accuracy_mcq': np.float64(57.5), 'se_mcq': np.float64(3.4955328635273903), 'accuracy_mcq_con': np.float64(13.0), 'se_mcq_con': np.float64(2.378024390118823)}
Results stored successfully.
Evaluating checkpoint 910 for epoch 5
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(5)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:18<00:00,  1.43s/it]
Evaluating model: 100%|██████████| 13/13 [00:40<00:00,  3.13s/it]


Evaluation Results: {'accuracy_mcq': np.float64(63.5), 'se_mcq': np.float64(3.40422531569225), 'accuracy_mcq_con': np.float64(15.5), 'se_mcq_con': np.float64(2.5590525590538387)}
Results stored successfully.
Evaluating checkpoint 1092 for epoch 6
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(6)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:35<00:00,  2.74s/it]
Evaluating model: 100%|██████████| 13/13 [01:06<00:00,  5.15s/it]


Evaluation Results: {'accuracy_mcq': np.float64(61.0), 'se_mcq': np.float64(3.448912872196107), 'accuracy_mcq_con': np.float64(16.0), 'se_mcq_con': np.float64(2.592296279363144)}
Results stored successfully.
Evaluating checkpoint 1274 for epoch 7
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(7)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:31<00:00,  2.38s/it]
Evaluating model: 100%|██████████| 13/13 [00:53<00:00,  4.08s/it]


Evaluation Results: {'accuracy_mcq': np.float64(55.5), 'se_mcq': np.float64(3.514078826662828), 'accuracy_mcq_con': np.float64(13.5), 'se_mcq_con': np.float64(2.4163505540380514)}
Results stored successfully.
Evaluating checkpoint 1456 for epoch 8
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(8)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:16<00:00,  1.28s/it]
Evaluating model: 100%|██████████| 13/13 [00:46<00:00,  3.56s/it]


Evaluation Results: {'accuracy_mcq': np.float64(53.0), 'se_mcq': np.float64(3.529164207004259), 'accuracy_mcq_con': np.float64(12.0), 'se_mcq_con': np.float64(2.2978250586152114)}
Results stored successfully.
Evaluating checkpoint 1638 for epoch 9
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(9)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:25<00:00,  2.00s/it]
Evaluating model: 100%|██████████| 13/13 [00:43<00:00,  3.31s/it]


Evaluation Results: {'accuracy_mcq': np.float64(50.0), 'se_mcq': np.float64(3.5355339059327373), 'accuracy_mcq_con': np.float64(20.0), 'se_mcq_con': np.float64(2.82842712474619)}
Results stored successfully.
Evaluating checkpoint 1820 for epoch 10
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(10)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:38<00:00,  2.95s/it]
Evaluating model: 100%|██████████| 13/13 [01:04<00:00,  4.99s/it]


Evaluation Results: {'accuracy_mcq': np.float64(44.5), 'se_mcq': np.float64(3.514078826662828), 'accuracy_mcq_con': np.float64(11.0), 'se_mcq_con': np.float64(2.2124646889837587)}
Results stored successfully.
Evaluating checkpoint 2002 for epoch 11
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(11)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:34<00:00,  2.67s/it]
Evaluating model: 100%|██████████| 13/13 [00:53<00:00,  4.13s/it]


Evaluation Results: {'accuracy_mcq': np.float64(37.0), 'se_mcq': np.float64(3.413942003022312), 'accuracy_mcq_con': np.float64(14.5), 'se_mcq_con': np.float64(2.489728900904675)}
Results stored successfully.
Evaluating checkpoint 2184 for epoch 12
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(12)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:38<00:00,  2.97s/it]
Evaluating model: 100%|██████████| 13/13 [01:03<00:00,  4.86s/it]


Evaluation Results: {'accuracy_mcq': np.float64(38.0), 'se_mcq': np.float64(3.4322004603461025), 'accuracy_mcq_con': np.float64(10.5), 'se_mcq_con': np.float64(2.1676600286945367)}
Results stored successfully.
Evaluating checkpoint 2366 for epoch 13
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(13)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:29<00:00,  2.24s/it]
Evaluating model: 100%|██████████| 13/13 [00:55<00:00,  4.28s/it]


Evaluation Results: {'accuracy_mcq': np.float64(46.0), 'se_mcq': np.float64(3.524202037341219), 'accuracy_mcq_con': np.float64(11.0), 'se_mcq_con': np.float64(2.2124646889837587)}
Results stored successfully.
Evaluating checkpoint 2548 for epoch 14
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(14)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:28<00:00,  2.23s/it]
Evaluating model: 100%|██████████| 13/13 [00:56<00:00,  4.34s/it]


Evaluation Results: {'accuracy_mcq': np.float64(38.5), 'se_mcq': np.float64(3.440748465087211), 'accuracy_mcq_con': np.float64(12.0), 'se_mcq_con': np.float64(2.2978250586152114)}
Results stored successfully.
Evaluating checkpoint 2730 for epoch 15
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(15)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:21<00:00,  1.69s/it]
Evaluating model: 100%|██████████| 13/13 [00:56<00:00,  4.33s/it]


Evaluation Results: {'accuracy_mcq': np.float64(39.0), 'se_mcq': np.float64(3.448912872196107), 'accuracy_mcq_con': np.float64(13.5), 'se_mcq_con': np.float64(2.4163505540380514)}
Results stored successfully.
Evaluating checkpoint 2912 for epoch 16
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(16)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:21<00:00,  1.65s/it]
Evaluating model: 100%|██████████| 13/13 [00:54<00:00,  4.21s/it]


Evaluation Results: {'accuracy_mcq': np.float64(38.5), 'se_mcq': np.float64(3.440748465087211), 'accuracy_mcq_con': np.float64(14.0), 'se_mcq_con': np.float64(2.453568829277059)}
Results stored successfully.
Evaluating checkpoint 3094 for epoch 17
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(17)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:27<00:00,  2.13s/it]
Evaluating model: 100%|██████████| 13/13 [00:44<00:00,  3.40s/it]


Evaluation Results: {'accuracy_mcq': np.float64(36.0), 'se_mcq': np.float64(3.394112549695428), 'accuracy_mcq_con': np.float64(12.5), 'se_mcq_con': np.float64(2.3385358667337135)}
Results stored successfully.
Evaluating checkpoint 3276 for epoch 18
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(18)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:19<00:00,  1.54s/it]
Evaluating model: 100%|██████████| 13/13 [00:47<00:00,  3.62s/it]


Evaluation Results: {'accuracy_mcq': np.float64(37.0), 'se_mcq': np.float64(3.413942003022312), 'accuracy_mcq_con': np.float64(15.5), 'se_mcq_con': np.float64(2.5590525590538387)}
Results stored successfully.
Evaluating checkpoint 3458 for epoch 19
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(19)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:22<00:00,  1.76s/it]
Evaluating model: 100%|██████████| 13/13 [00:40<00:00,  3.10s/it]


Evaluation Results: {'accuracy_mcq': np.float64(38.5), 'se_mcq': np.float64(3.440748465087211), 'accuracy_mcq_con': np.float64(15.0), 'se_mcq_con': np.float64(2.5248762345905194)}
Results stored successfully.
Evaluating checkpoint 3640 for epoch 20
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_LoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(20)___results.json'


Evaluating model: 100%|██████████| 13/13 [00:23<00:00,  1.81s/it]
Evaluating model: 100%|██████████| 13/13 [00:46<00:00,  3.60s/it]

Evaluation Results: {'accuracy_mcq': np.float64(40.5), 'se_mcq': np.float64(3.4711309396218404), 'accuracy_mcq_con': np.float64(14.0), 'se_mcq_con': np.float64(2.453568829277059)}
Results stored successfully.



