In [1]:
import json
import re
import torch

# Assuming you have installed 'unsloth' and 'trl'
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# File that contains the 100 multiple-choice questions
TEST_FILE = "psychology_test_multiple_choice.json"

# Define a multiple-choice chat prompt
# The prompt instructs the model to respond with a single correct letter.
chat_prompt = """\
### Instruction:
You are a helpful AI assistant. Read the multiple choice question and options below. 
Provide only the single best answer (A, B, C, or D). Just giving me an alphabet is enough.
Example is this.
Response: A, B, C, or D

### Input:
Question: {question}
Options:
{options_str}

### Response:
"""

def load_test_data(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)

def get_letter_from_response(text):
    # Simple approach: search for an isolated letter A-D
    match = re.search(r"\b[ABCD]\b", text, re.IGNORECASE)
    if match:
        return match.group(0).upper()
    return None

def evaluate_model(model, tokenizer, test_data, device="cuda", model_name="Model"):
    """
    Evaluate the model on multiple-choice items.
    Returns the accuracy (fraction correct).
    """
    correct = 0
    total = len(test_data)
    invalid_responses = 0
    
    for item in test_data:
        q = item["question"]
        opts = "\n".join(item["options"])
        true_answer = item["answer"].upper()

        # Construct the prompt
        prompt = chat_prompt.format(question=q, options_str=opts)

        # Tokenize
        inputs = tokenizer([prompt], return_tensors="pt").to(device)

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )

        # Decode
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        if "### Response:" in decoded:
            model_response = decoded.split("### Response:")[-1].strip()
        else:
            model_response = decoded

        predicted_letter = get_letter_from_response(model_response)

        if predicted_letter is None:
            invalid_responses += 1  # Count invalid responses
        elif predicted_letter == true_answer:
            correct += 1

    accuracy = correct / total
    print(f"[{model_name}] Accuracy: {accuracy:.2%}  ({correct}/{total})")
    print(f"[{model_name}] Invalid Responses: {invalid_responses}/{total} ({invalid_responses/total:.2%})")
    # print(f"[{model_name}] Accuracy: {accuracy:.2%}  ({correct}/{total})")
    return accuracy



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
def main():
    # Load test data
    test_data = load_test_data(TEST_FILE)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    print("Loading original model...")
    original_model_name = "unsloth/Phi-4"

    max_seq_length = 2048
    dtype = None
    load_in_4bit = True
################## 
    orig_model, orig_tokenizer = FastLanguageModel.from_pretrained(
        model_name=original_model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # Prepare it for inference (disable training settings)
    FastLanguageModel.for_inference(orig_model)
    orig_model.to(device)

    # Evaluate original model
    print("Evaluating original model...")
    evaluate_model(orig_model, orig_tokenizer, test_data, device=device, model_name="Original Model")
    
################
    # ----------------------------------------------------------------------
    # 2) Load the FINE-TUNED model
    # ----------------------------------------------------------------------
    # This folder "lora_model_osloth" is whatever you saved from your fine-tuning.
    # Make sure it matches your actual path.
    # print("\nLoading fine-tuned model (LoRA)...")
    # fine_tuned_model_name = "lora_model_osloth_psychology"

    # finetune_model, finetune_tokenizer = FastLanguageModel.from_pretrained(
    #     model_name=fine_tuned_model_name,
    #     max_seq_length=max_seq_length,
    #     dtype=dtype,
    #     load_in_4bit=load_in_4bit,
    # )

    # # Prepare for inference
    # FastLanguageModel.for_inference(finetune_model)
    # finetune_model.to(device)

    # # Evaluate fine-tuned model
    # print("Evaluating fine-tuned model...")
    # evaluate_model(finetune_model, finetune_tokenizer, test_data, device=device, model_name="Fine-Tuned Model")
####################################
if __name__ == "__main__":
    main()

Loading original model...
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.352 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating original model...
[Original Model] Accuracy: 99.00%  (99/100)
[Original Model] Invalid Responses: 0/100 (0.00%)


### Original

In [3]:
test_data = load_test_data(TEST_FILE)
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading original model...")
original_model_name = "unsloth/llama-3-8b-bnb-4bit"
max_seq_length = 2048
dtype = None
load_in_4bit = True

orig_model, orig_tokenizer = FastLanguageModel.from_pretrained(
    model_name=original_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

FastLanguageModel.for_inference(orig_model)
orig_model.to(device)

Loading original model...
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.48.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.581 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [19]:
for item in test_data[:30]:
    q = item["question"]
    opts = "\n".join(item["options"])
    true_answer = item["answer"].upper()

    # Construct the prompt
    prompt = chat_prompt.format(question=q, options_str=opts)

    # Tokenize
    inputs = orig_tokenizer([prompt], return_tensors="pt").to(device)

    # Generate
    with torch.no_grad():
        outputs = orig_model.generate(
            **inputs,
            max_new_tokens=64,
            use_cache=True
        )
    # Decode
    decoded = orig_tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "### Response:" in decoded:
        model_response = decoded.split("### Response:")[-1].strip()
    else:
        model_response = decoded

    predicted_letter = get_letter_from_response(model_response)

    if predicted_letter != true_answer:
        print("==========================")
        print(f"Correct Answer: {true_answer}")
        print(f"Predicted letter: {predicted_letter}")
        # print(f"Decoded Text: {decoded}" )

Correct Answer: C
Predicted letter: D
Correct Answer: C
Predicted letter: A
Correct Answer: B
Predicted letter: A
Correct Answer: B
Predicted letter: A
Correct Answer: D
Predicted letter: A
Correct Answer: B
Predicted letter: D
Correct Answer: C
Predicted letter: A
Correct Answer: D
Predicted letter: C


In [None]:
print("Evaluating original model...")
evaluate_model(orig_model, orig_tokenizer, test_data, device=device, model_name="Original Model")

### Fine-tuned 

In [20]:
print("\nLoading fine-tuned model (LoRA)...")
fine_tuned_model_name = "lora_model_osloth"

finetune_model, finetune_tokenizer = FastLanguageModel.from_pretrained(
    model_name=fine_tuned_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Prepare for inference
FastLanguageModel.for_inference(finetune_model)
finetune_model.to(device)


Loading fine-tuned model (LoRA)...
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.352 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [23]:
for item in test_data[:30]:
    q = item["question"]
    opts = "\n".join(item["options"])
    true_answer = item["answer"].upper()

    # Construct the prompt
    prompt = chat_prompt.format(question=q, options_str=opts)

    # Tokenize
    inputs = finetune_tokenizer([prompt], return_tensors="pt").to(device)

    # Generate
    with torch.no_grad():
        outputs = finetune_model.generate(
            **inputs,
            max_new_tokens=64,
            use_cache=True
        )
    # Decode
    decoded = finetune_tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "### Response:" in decoded:
        model_response = decoded.split("### Response:")[-1].strip()
    else:
        model_response = decoded

    predicted_letter = get_letter_from_response(model_response)

    if predicted_letter != true_answer:
        print("==========================")
        print(f"Correct Answer: {true_answer}")
        print(f"Predicted letter: {predicted_letter}")
        # print(f"Decoded Text: {decoded}" )

Correct Answer: C
Predicted letter: A
Correct Answer: D
Predicted letter: A
Correct Answer: B
Predicted letter: A
Correct Answer: B
Predicted letter: A
Correct Answer: C
Predicted letter: B
Correct Answer: D
Predicted letter: A
Correct Answer: D
Predicted letter: None
Correct Answer: D
Predicted letter: A


In [None]:
# Evaluate fine-tuned model
print("Evaluating fine-tuned model...")
evaluate_model(finetune_model, finetune_tokenizer, test_data, device=device, model_name="Fine-Tuned Model")