In [5]:
from datasets import load_dataset
import torch
import os
import pandas as pd
import torch
import random
import numpy as np

# Reproducibility: feed one shared seed to every random number generator
# used in this notebook (Python stdlib, NumPy, and PyTorch on CPU and GPU).
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Display setting: never truncate long text cells when pandas renders a frame.
pd.set_option("display.max_colwidth", None)


In [6]:
# Load the SocialIQA dataset from the Hugging Face Hub. No `split` argument is
# passed here, unlike `load_dataset_validation` further down.
# NOTE(review): `trust_remote_code=True` presumably allows the dataset
# repository's own loading script to run locally - confirm the source is trusted.
# NOTE(review): `socialiqa` is not referenced by the other cells visible here;
# `main()` loads the validation split on its own.
socialiqa = load_dataset("allenai/social_i_qa", trust_remote_code=True)

In [7]:
import json
import re
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# Prompt template with seven positional `{}` slots. `evaluate_model` fills them
# via `chat_prompt.format(...)` in this order:
#   instruction, context, question, choice A, choice B, choice C, response.
# At evaluation time the response slot is filled with "" so the prompt ends
# right after the "### Response:" header.
# Note that the literal begins with a newline (the text starts on the line
# after the opening quotes).
chat_prompt = """
### Instruction:
{}

### Context:
{}

### Question:
{}

### Choices:
A: {}
B: {}
C: {}

### Response:
{}"""

def get_letter_from_response(text):
    """Extract the chosen option letter (A, B or C) from a model response.

    Two passes are made, from most to least reliable:

    1. If the whole (stripped) response is just the letter, optionally
       decorated like ``(b)``, ``B.``, ``c:`` or ``A)``, it is accepted
       case-insensitively.
    2. Otherwise the first *uppercase* stand-alone ``A``/``B``/``C`` in the text
       is used. The free-text pass is deliberately case-sensitive: a
       case-insensitive search would treat the English article "a" as the
       answer "A" (e.g. "this is a hard one, but B" would be scored as "A").

    Args:
        text: The model's response text. ``None`` or an empty string is allowed.

    Returns:
        ``'A'``, ``'B'`` or ``'C'``, or ``None`` when no option letter is found.
    """
    if not text:
        return None

    candidate = text.strip()

    # Pass 1: the response consists solely of the letter (any case).
    sole_letter = re.fullmatch(r"\(?([ABC])\)?[.:)]?", candidate, re.IGNORECASE)
    if sole_letter:
        return sole_letter.group(1).upper()

    # Pass 2: first isolated uppercase letter inside a longer response.
    # Known limitation: a sentence starting with the capitalised article "A"
    # is still read as option A.
    match = re.search(r"\b[ABC]\b", candidate)
    if match:
        return match.group(0)
    return None

def get_true_answer(label):
    """Translate a 1-based answer label into its option letter.

    The label is coerced with ``int()`` first, so both ``"2"`` and ``2`` work.

    Returns:
        ``'A'`` for 1, ``'B'`` for 2, ``'C'`` for 3, and ``None`` for any other
        integer value.
    """
    letters = "ABC"
    position = int(label)
    if 1 <= position <= len(letters):
        return letters[position - 1]
    return None

def evaluate_model(model, tokenizer, dataset, device="cuda", model_name="Model"):
    """Measure multiple-choice accuracy of ``model`` on ``dataset``.

    For every row a prompt is built from ``chat_prompt``, the model generates
    up to 64 new tokens, an option letter is parsed from the generated text
    with ``get_letter_from_response`` and compared with the gold letter from
    ``get_true_answer``. A summary (accuracy and the share of responses with
    no parsable letter) is printed at the end.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...,
            use_cache=...)``.
        tokenizer: Callable as ``tokenizer([prompt], return_tensors="pt")``
            whose result supports ``.to(device)`` and ``["input_ids"]``; must
            also provide ``decode(..., skip_special_tokens=True)``.
        dataset: Sequence-like object supporting ``len()`` and integer
            indexing; each row is a mapping with the keys ``context``,
            ``question``, ``answerA``, ``answerB``, ``answerC`` and ``label``.
        device: Device the tokenized inputs are moved to.
        model_name: Label used as a prefix in the printed summary.

    Returns:
        Accuracy as a float in ``[0, 1]``. Rows whose response contains no
        parsable letter count as wrong. An empty dataset yields ``0.0``.
    """
    correct = 0
    total = len(dataset)
    invalid_responses = 0
    # NOTE(review): the wording (including the spelling "emphathetic") is kept
    # verbatim because it is part of what the model sees; presumably the
    # fine-tuned model was trained with this exact text - confirm before editing.
    instruction = "Answer the multiple-choice question below based on the provided context. Be emphathetic. Your response must be a single letter: A, B, or C."

    for i in range(total):
        # Fetch the row once instead of re-indexing the dataset for every field.
        row = dataset[i]
        true_answer = get_true_answer(row['label'])

        prompt = chat_prompt.format(
            instruction,
            row['context'],
            row['question'],
            row['answerA'],
            row['answerB'],
            row['answerC'],
            "",
        )
        inputs = tokenizer([prompt], return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )

        # A causal LM returns prompt + continuation. Decode only the newly
        # generated tokens so the prompt's own "A:" / "B:" / "C:" lines can
        # never be mistaken for the model's answer.
        generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # If the model keeps going and invents another "### ..." section,
        # keep only the text that precedes it: that is the actual answer.
        model_response = generated_text.split("###", 1)[0].strip()

        predicted_letter = get_letter_from_response(model_response)

        if predicted_letter is None:
            invalid_responses += 1
        elif predicted_letter == true_answer:
            correct += 1

    # Guard against an empty dataset, which would otherwise divide by zero.
    if total:
        accuracy = correct / total
        invalid_rate = invalid_responses / total
    else:
        accuracy = 0.0
        invalid_rate = 0.0

    print(f"[{model_name}] Accuracy: {accuracy:.2%} ({correct}/{total})")
    print(f"[{model_name}] Invalid Responses: {invalid_responses}/{total} ({invalid_rate:.2%})")
    return accuracy


In [None]:
# import argparse
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset

def load_dataset_validation(dataset_name, split):
    """Announce and load one split of a dataset.

    Despite the function name, whichever ``split`` the caller passes is
    forwarded to ``load_dataset`` unchanged.

    Args:
        dataset_name: Dataset identifier handed to ``load_dataset``.
        split: Split name handed to ``load_dataset`` as ``split=``.

    Returns:
        Whatever ``load_dataset`` returns for that dataset and split.
    """
    print(f"Loading dataset: {dataset_name} [{split}]")
    return load_dataset(dataset_name, split=split)

def load_model(model_name, max_seq_length=2048, dtype=None, load_in_4bit=True, device="cuda"):
    """Load a model/tokenizer pair through Unsloth and switch it to inference mode.

    Args:
        model_name: Name or path forwarded to ``FastLanguageModel.from_pretrained``.
        max_seq_length: Forwarded unchanged as ``max_seq_length``.
        dtype: Forwarded unchanged as ``dtype``.
        load_in_4bit: Forwarded unchanged as ``load_in_4bit``.
        device: Target passed to ``model.to(...)`` after loading.

    Returns:
        The ``(model, tokenizer)`` tuple.
    """
    print(f"Loading model: {model_name}")

    pretrained_kwargs = {
        "model_name": model_name,
        "max_seq_length": max_seq_length,
        "dtype": dtype,
        "load_in_4bit": load_in_4bit,
    }
    model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_kwargs)

    FastLanguageModel.for_inference(model)
    # NOTE(review): some transformers/bitsandbytes versions may reject `.to()`
    # on a 4-bit quantized model - confirm against the installed versions.
    model.to(device)
    return model, tokenizer

def main(
    num_samples=100,
    fine_tuned_model_name="phi_lora_social_qa_64",
    evaluate_original=False,
    original_model_name="unsloth/Phi-4",
):
    """Evaluate the fine-tuned (LoRA) model on the SocialIQA validation split.

    Calling ``main()`` with no arguments reproduces the previous behaviour:
    the first 100 validation rows are scored with the fine-tuned model only.

    Args:
        num_samples: Number of leading validation rows to evaluate. It is
            capped at the size of the split; ``None`` evaluates the full split.
        fine_tuned_model_name: Name or path of the fine-tuned model to load.
        evaluate_original: When true, the base model is loaded and scored first
            so the two accuracies can be compared.
        original_model_name: Name or path of the base model; only used when
            ``evaluate_original`` is true.

    Returns:
        Dict mapping the printed model label to the accuracy returned by
        ``evaluate_model``.
    """
    validation_data = load_dataset_validation("allenai/social_i_qa", "validation")

    if num_samples is not None:
        # Cap the subset so a split shorter than `num_samples` cannot raise.
        subset_size = min(num_samples, len(validation_data))
        validation_data = validation_data.select(range(subset_size))

    device = "cuda" if torch.cuda.is_available() else "cpu"
    max_seq_length = 2048
    dtype = None
    load_in_4bit = True

    results = {}

    # ----------------------------------------------------------------------
    # 1) Optionally evaluate the ORIGINAL model as a baseline
    # ----------------------------------------------------------------------
    if evaluate_original:
        print("Loading original model...")
        original_model, original_tokenizer = load_model(
            model_name=original_model_name,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
            device=device,
        )

        print("\nEvaluating original model...")
        results["Original Model"] = evaluate_model(
            original_model,
            original_tokenizer,
            validation_data,
            device=device,
            model_name="Original Model",
        )

        # Drop the baseline before loading the next model so both do not have
        # to sit in GPU memory at the same time.
        del original_model, original_tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # ----------------------------------------------------------------------
    # 2) Load and evaluate the FINE-TUNED model
    # ----------------------------------------------------------------------
    print("\nLoading fine-tuned model (LoRA)...")
    # `load_model` performs the same from_pretrained / for_inference / .to(device)
    # sequence that used to be duplicated inline here.
    finetune_model, finetune_tokenizer = load_model(
        model_name=fine_tuned_model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        device=device,
    )

    print("\nEvaluating fine-tuned model...")
    results["Fine-Tuned Model"] = evaluate_model(
        finetune_model,
        finetune_tokenizer,
        validation_data,
        device=device,
        model_name="Fine-Tuned Model",
    )
    return results


if __name__ == "__main__":
    main()

Loading dataset: allenai/social_i_qa [validation]

Loading fine-tuned model (LoRA)...
==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.352 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]