In [4]:
import torch, math
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from datasets import load_from_disk
from torch.utils.data import DataLoader

# --- Constants ---
BASE_MODEL = "google/gemma-7b"
ADAPTER_PATH = "/home/liorkob/M.Sc/thesis/gemma_output/clm_lora_gpt_facts/my_gpt_dataset"
DATASET_PATH = "/home/liorkob/M.Sc/thesis/data/hf_datasets/my_gpt_dataset"

# --- Load dataset ---
dataset = load_from_disk(DATASET_PATH)
test_dataset = dataset["test"]

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token  # just in case

# --- Tokenization ---
def tokenize(example):
    enc = tokenizer(example["text"], truncation=True, max_length=512, padding="max_length")
    enc["labels"] = enc["input_ids"].copy()
    return enc

test_tokenized = test_dataset.map(tokenize, batched=True, remove_columns=["text"])

# --- DataLoader ---
def collate_fn(batch):
    return tokenizer.pad(batch, return_tensors="pt", padding=True)

loader = DataLoader(test_tokenized, batch_size=1, collate_fn=collate_fn)

# --- Eval function ---
def evaluate_model(model, label):
    model.eval()
    losses = []

    for batch in tqdm(loader, desc=f"Evaluating {label}"):
        input_ids = batch["input_ids"].to(model.device)
        attention_mask = batch["attention_mask"].to(model.device)
        labels = batch["labels"].to(model.device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            losses.append(outputs.loss.item())

    avg_loss = sum(losses) / len(losses)
    perplexity = math.exp(avg_loss)
    print(f"\n📊 {label} — Loss: {avg_loss:.4f} | Perplexity: {perplexity:.2f}")
    return avg_loss, perplexity

# --- Load base model ---
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto", torch_dtype=torch.float16)
evaluate_model(base_model, "Base Model")

# --- Load fine-tuned model ---
ft_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto", torch_dtype=torch.float16)
ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_PATH)
evaluate_model(ft_model, "Fine-Tuned Model")


Map: 100%|██████████| 299/299 [00:00<00:00, 2275.05 examples/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.11it/s]
Some parameters are on the meta device because they were offloaded to the cpu.
Evaluating Base Model:   0%|          | 0/299 [00:00<?, ?it/s]You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Evaluating Base Model: 100%|██████████| 299/299 [03:14<00:00,  1.54it/s]



📊 Base Model — Loss: 5.5136 | Perplexity: 248.03


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.32s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Some parameters are on the meta device because they were offloaded to the cpu.
Evaluating Fine-Tuned Model: 100%|██████████| 299/299 [03:04<00:00,  1.62it/s]


📊 Fine-Tuned Model — Loss: 5.2122 | Perplexity: 183.50





(5.212235548424482, 183.50383169063318)