# 5.1 - Evaluate finetuned models over each checkpoint (1 for epoch)

In [1]:
import os
import json
import sys
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))
from evaluation_function import evaluate_model
from utils import define_model_name
from store_load_results import store_results, load_results

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
TRAINDATA_FILE = os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_train.jsonl')
TESTDATA_MCQ_FILE = os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_MCQ_test.jsonl')
TESTDATA_MCQ_CON_FILE = os.path.join(os.getcwd(), '..', 'data', 'train_test_dataset', 'orange_qa_MCQ-con_test.jsonl')

In [3]:
LORA_PROJECTIONS = "qvko"
PROJECTIONS = {
    "q": "q_proj",
    "k": "k_proj",
    "v": "v_proj",
    "o": "o_proj",
    "g": "gate_proj",
    "d": "down_proj",
    "u": "up_proj"
}
projections = [PROJECTIONS[p] for p in list(LORA_PROJECTIONS)]

MODEL_CONFIG = {
    "base_model": "Qwen/Qwen3-0.6B",
    "finetuning": True,
    "use_dora": True,
    "n_epochs": 20,
    "lora_r": 8,
    "lora_alpha": 16,
    "lr": 0.0001,
    "batch_size": 16,
    "lora_projections": projections,
    "lora_dropout": 0.05,
    "new_tokens_path":  None,
    "new_tokens_init": None,
    "new_tokens_train": None,
    "wandb_project": None,  # wandb project name
}
model_name, OUTPUT_DIR = define_model_name(MODEL_CONFIG)
OUTPUT_DIR = OUTPUT_DIR.replace("notebooks/models", "notebooks/../models")  # adjust for model name

Model Configuration: Qwen3-0.6B_DoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20__


In [4]:
def evaluate_model_on_both_datasets(model_config, output_dir, checkpoint_id:int=3640, epoch_id:int=20):
    MODEL_CONFIG = model_config.copy()
    OUTPUT_DIR = os.path.join(output_dir, f"checkpoint-{checkpoint_id}")
    MODEL_CONFIG['n_epochs'] = epoch_id
    MODEL_CONFIG['model_name'] = model_name.replace("ep20", f"ep20({epoch_id})")

    try:
        results_dir = os.path.join(os.getcwd(), '..', 'results')
        with open(os.path.join(results_dir, f'{MODEL_CONFIG["model_name"]}_results.json'), 'r') as f:
            return json.load(f)
        print("Results already exist:", results)
        return
    except Exception as e:
        print("No existing results found. Proceeding with evaluation.", str(e))

    # 11. Load model and tokenizer
    if MODEL_CONFIG['finetuning']:
        tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
        # Load base model first, then load PEFT adapter
        base_model = AutoModelForCausalLM.from_pretrained(
            MODEL_CONFIG['base_model'], 
            device_map="auto", 
            torch_dtype=torch.float16
        )
        base_model.resize_token_embeddings(len(tokenizer))
        model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
    else:
        model = AutoModelForCausalLM.from_pretrained(MODEL_CONFIG['base_model'], device_map="auto", torch_dtype=torch.float16)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG['base_model'], trust_remote_code=True)

    # 12. Evaluate and store results
    with open(TESTDATA_MCQ_FILE, "r") as f:
        test_mcq_dataset = json.load(f)
    accuracy_mcq, se_mcq = evaluate_model(model, tokenizer, test_mcq_dataset, batch_size=MODEL_CONFIG['batch_size'])

    with open(TESTDATA_MCQ_CON_FILE, "r") as f:
        test_mcq_con_dataset = json.load(f)
    accuracy_mcq_con, se_mcq_con = evaluate_model(model, tokenizer, test_mcq_con_dataset, batch_size=MODEL_CONFIG['batch_size'])
    results = {
        "accuracy_mcq": accuracy_mcq,
        "se_mcq": se_mcq,
        "accuracy_mcq_con": accuracy_mcq_con,
        "se_mcq_con": se_mcq_con,
    }
    print("Evaluation Results:", results)
    results_dir = os.path.join(os.getcwd(), '..', 'results')
    os.makedirs(results_dir, exist_ok=True)

    with open(os.path.join(results_dir, f'{MODEL_CONFIG["model_name"]}_results.json'), 'w') as f:
        json.dump(MODEL_CONFIG | results, f, indent=4)
    print("Results stored successfully.")

In [None]:
checkpoints = {
    i: 182*i for i in range(1, 21)
}
for epoch, checkpoint in checkpoints.items():
    print(f"Evaluating checkpoint {checkpoint} for epoch {epoch}")
    evaluate_model_on_both_datasets(MODEL_CONFIG, OUTPUT_DIR, checkpoint_id=checkpoint, epoch_id=epoch)

Evaluating checkpoint 182 for epoch 1
Evaluating checkpoint 364 for epoch 2
Evaluating checkpoint 546 for epoch 3
Evaluating checkpoint 728 for epoch 4
Evaluating checkpoint 910 for epoch 5
Evaluating checkpoint 1092 for epoch 6
Evaluating checkpoint 1274 for epoch 7
No existing results found. Proceeding with evaluation. [Errno 2] No such file or directory: '/root/dev/Tutorials/LoRA-finetuning-tutorial/notebooks/../results/Qwen3-0.6B_DoRA_qvko_r8_alpha16_drop0.05_proj(qvko)_bs16_lr0.0001_ep20(7)___results.json'


`torch_dtype` is deprecated! Use `dtype` instead!
Evaluating model:   0%|          | 0/13 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating model:  38%|███▊      | 5/13 [01:40<04:17, 32.18s/it]