In [1]:
from datasets import load_dataset
import torch
import os
import pandas as pd
import torch
import random
import numpy as np

# Fix all seeds
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed_all(seed)
pd.options.display.max_colwidth = None


In [2]:
trained_folder = "./fine-tuned/lora_model_osloth_commonsense_qa"
ds = load_dataset("tau/commonsense_qa")

In [3]:
ds['validation']['choices'][0]

{'label': ['A', 'B', 'C', 'D', 'E'],
 'text': ['bank', 'library', 'department store', 'mall', 'new york']}

In [4]:
import json
import re
import torch

# Assuming you have installed 'unsloth' and 'trl'
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

chat_prompt = """
### Instruction:
{}

### Question:
{}

### Choices:
{}

### Response:
{}"""

def get_letter_from_response(text):
    # Simple approach: search for an isolated letter A-D
    match = re.search(r"\b[ABCDE]\b", text, re.IGNORECASE)
    if match:
        return match.group(0).upper()
    return None

def evaluate_model(model, tokenizer, dataset, device="cuda", model_name="Model"):
    correct = 0
    total = len(dataset['id'])
    invalid_responses = 0
    instruction = "Answer the multiple-choice question below based on the provided context."
    for i in range(total):
        question = dataset['question'][i]
        choices = dataset['choices'][i]['text']
        labels = dataset['choices'][i]['label']
        options_str = "\n".join(f"{label}: {text}" for label, text in zip(labels, choices))
        true_answer = dataset['answerKey'][i]

        prompt = chat_prompt.format(instruction, question, options_str, "")

        inputs = tokenizer([prompt], return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )

        # Decode
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        if "### Response:" in decoded:
            model_response = decoded.split("### Response:")[-1].strip()
        else:
            model_response = decoded

        predicted_letter = get_letter_from_response(model_response)

        if predicted_letter is None:
            invalid_responses += 1
        elif predicted_letter == true_answer:
            correct += 1

    accuracy = correct / total
    print(f"[{model_name}] Accuracy: {accuracy:.2%}  ({correct}/{total})")
    print(f"[{model_name}] Invalid Responses: {invalid_responses}/{total} ({invalid_responses/total:.2%})")
    # print(f"[{model_name}] Accuracy: {accuracy:.2%}  ({correct}/{total})")
    return accuracy



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
# import argparse
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset

def load_dataset_validation(dataset_name, split):
    """Load the validation split of a dataset."""
    print(f"Loading dataset: {dataset_name} [{split}]")
    dataset = load_dataset(dataset_name, split=split)
    return dataset

def load_model(model_name, max_seq_length=2048, dtype=None, load_in_4bit=True, device="cuda"):
    """Load a model and prepare it for inference."""
    print(f"Loading model: {model_name}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)
    model.to(device)
    return model, tokenizer

def main():
    # Load test data
    validation_data = load_dataset_validation("tau/commonsense_qa", "validation")
    validation_data = validation_data[:100]

    device = "cuda" if torch.cuda.is_available() else "cpu"

    print("Loading original model...")
    original_model_name = "llama-3-8b-bnb-4bit"

    max_seq_length = 2048
    dtype = None
    load_in_4bit = True

    original_model, original_tokenizer = load_model(
        model_name=original_model_name,
        max_seq_length=max_seq_length,
        load_in_4bit=load_in_4bit,
        device=device,
    )
    FastLanguageModel.for_inference(original_model)
    original_model.to(device)

    print("\nEvaluating original model...")
    evaluate_model(original_model, original_tokenizer, validation_data, device=device, model_name="Original Model")

    # ----------------------------------------------------------------------
    # 2) Load the FINE-TUNED model
    # ----------------------------------------------------------------------
    # This folder "lora_model_osloth" is whatever you saved from your fine-tuning.
    # Make sure it matches your actual path.

    print("\nLoading fine-tuned model (LoRA)...")
    fine_tuned_model_name = "lora_model_osloth_commonsense_qa"

    finetune_model, finetune_tokenizer = FastLanguageModel.from_pretrained(
        model_name=fine_tuned_model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(finetune_model)
    finetune_model.to(device)

    print("\nEvaluating fine-tuned model...")
    evaluate_model(finetune_model, finetune_tokenizer, validation_data, device=device, model_name="Fine-Tuned Model")

if __name__ == "__main__":
    main()

Loading dataset: tau/commonsense_qa [validation]
Loading original model...
Loading model: llama-3-8b-bnb-4bit


FileNotFoundError: llama-3-8b-bnb-4bit/*.json (invalid repository id)