In [2]:
from datasets import load_dataset
import torch
import os
import pandas as pd
import torch
import random
import numpy as np

# Fix all seeds
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed_all(seed)
pd.options.display.max_colwidth = None


In [3]:
# trained_folder = "./fine-tuned/lora_model_osloth_commonsense_qa"
ds = load_dataset("tau/commonsense_qa")

In [4]:
import json
import re
import torch

# Assuming you have installed 'unsloth' and 'trl'
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

chat_prompt = """
### Instruction:
{}

### Question:
{}

### Choices:
{}

### Response:
{}"""

def load_dataset_validation(dataset_name, split):
    """Load the validation split of a dataset."""
    print(f"Loading dataset: {dataset_name} [{split}]")
    dataset = load_dataset(dataset_name, split=split)
    return dataset

def load_model(model_name, max_seq_length=2048, dtype=None, load_in_4bit=True, device="cuda"):
    """Load a model and prepare it for inference."""
    print(f"Loading model: {model_name}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)
    model.to(device)
    return model, tokenizer

def get_letter_from_response(text):
    # Simple approach: search for an isolated letter A-D
    match = re.search(r"\b[ABCDE]\b", text, re.IGNORECASE)
    if match:
        return match.group(0).upper()
    return None

def evaluate_model(model, tokenizer, dataset, device="cuda", model_name="Model"):
    correct = 0
    total = len(dataset['id'])
    invalid_responses = 0
    instruction = "Answer the multiple-choice question below based on the provided context. Your response has to be A, B, C, D, or E"
    for i in range(total):
        question = dataset['question'][i]
        choices = dataset['choices'][i]['text']
        labels = dataset['choices'][i]['label']
        options_str = "\n".join(f"{label}: {text}" for label, text in zip(labels, choices))
        true_answer = dataset['answerKey'][i]
        prompt = chat_prompt.format(instruction, question, options_str, "")
        inputs = tokenizer([prompt], return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )

        # Decode
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        if "### Response:" in decoded:
            model_response = decoded.split("### Response:")[-1].strip()
        else:
            model_response = decoded
        # print(f"Model response: {model_response}")
        predicted_letter = get_letter_from_response(model_response)

        if predicted_letter is None:
            invalid_responses += 1
        elif predicted_letter == true_answer:
            correct += 1
        # print(f"predicted answer is : {predicted_letter}, true_answer: {true_answer}")

    accuracy = correct / total
    print(f"[{model_name}] Accuracy: {accuracy:.2%}  ({correct}/{total})")
    print(f"[{model_name}] Invalid Responses: {invalid_responses}/{total} ({invalid_responses/total:.2%})")
    # print(f"[{model_name}] Accuracy: {accuracy:.2%}  ({correct}/{total})")
    return accuracy




def create_logic_df(model, tokenizer, dataset, device="cuda", model_name="Model"):
    results = []
    
    instruction = (
        "Read the following multiple-choice question and its context carefully and answer with explanation"
    )
    
    for i in range(len(dataset['id'])):
        question = dataset['question'][i]
        choices = dataset['choices'][i]['text']
        labels = dataset['choices'][i]['label']
        options_str = "\n".join(f"{label}: {text}" for label, text in zip(labels, choices))
        prompt = chat_prompt.format(instruction, question, options_str, "")
        
        inputs = tokenizer([prompt], return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )
        
        # Decode model response
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        if "### Response:" in decoded:
            model_response = decoded.split("### Response:")[-1].strip()
        else:
            model_response = decoded
        
        # Extract predicted letter and reasoning
        predicted_letter = get_letter_from_response(model_response)
        reasoning = model_response.replace(predicted_letter, "").strip() if predicted_letter else model_response
        
        # Append results
        results.append({
            "id": dataset['id'][i],
            "question": question,
            "answer": predicted_letter
        })
    
    # Create DataFrame
    df = pd.DataFrame(results)
    return df





🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# import argparse
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset


# Load test data
validation_data = load_dataset_validation("tau/commonsense_qa", "validation")
validation_data = validation_data[:100]

device = "cuda" if torch.cuda.is_available() else "cpu"
max_seq_length = 2048
dtype = None
load_in_4bit = True

print("\nLoading fine-tuned model (LoRA)...")
# fine_tuned_model_name = "phi_lora_commonsense_qa"
fine_tuned_model_name = "phi_lora_social_qa"

finetune_model, finetune_tokenizer = FastLanguageModel.from_pretrained(
    model_name=fine_tuned_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(finetune_model)
finetune_model.to(device)

print("\nCreating Df with a fine-tuned model...")
logic_df = create_logic_df(finetune_model, finetune_tokenizer, validation_data, device="cuda", model_name="Fine-Tuned Model")


Loading dataset: tau/commonsense_qa [validation]

Loading fine-tuned model (LoRA)...
==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.352 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Unsloth: Will load phi_lora_commonsense_qa as a legacy tokenizer.
Unsloth 2025.2.12 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.



Creating Df with a fine-tuned model...


In [6]:
logic_df

Unnamed: 0,id,question,answer
0,1afa02df02c908a558b4036e80242fac,"A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?",A
1,a7ab086045575bb497933726e4e6ad28,What do people aim to do at work?,A
2,b8c0a4703079cf661d7261a60a1bcbff,Where would you find magazines along side many other printed works?,B
3,e68fb2448fd74e402aae9982aa76e527,Where are you likely to find a hamburger?,A
4,2435de612dd69f2012b9e40d6af4ce38,James was looking for a good place to buy farmland. Where might he look?,A
...,...,...,...
95,5169f7ae0781b15161551de3a189ebef,What do you want someone to do when you illustrate point?,C
96,ef22ef7aeec70aaa688720f805c1cf38,Billy set aside a block of time for having fun after work. Why might he do this?,B
97,514310637fb43a252bfadc8cbf79b277,"The man in the white suit was very lazy. He did nothing useful. Meanwhile, the ban in the blue had put in effort and was very what?",D
98,9370b2b0897b796dec4a40f107854c8d,What would you be unable to do if you have too much greed?,B


In [8]:
logic_df.to_csv("csvs/phi_common_logic_q.csv")

In [9]:
# import argparse
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset

def main():
    # Load test data
    validation_data = load_dataset_validation("tau/commonsense_qa", "validation")
    validation_data = validation_data[:100]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    max_seq_length = 2048
    dtype = None
    load_in_4bit = True
##################
    # print("Loading original model...")
    # original_model_name = "unsloth/Phi-4"
    # original_model, original_tokenizer = load_model(
    #     model_name=original_model_name,
    #     max_seq_length=max_seq_length,
    #     load_in_4bit=load_in_4bit,
    #     device=device,
    # )
    # FastLanguageModel.for_inference(original_model)
    # original_model.to(device)

    # print("\nEvaluating original model...")
    # evaluate_model(original_model, original_tokenizer, validation_data, device=device, model_name="Original Model")
#########################
    # ----------------------------------------------------------------------
    # 2) Load the FINE-TUNED model
    # ----------------------------------------------------------------------

    print("\nLoading fine-tuned model (LoRA)...")
    # fine_tuned_model_name = "phi_lora_commonsense_qa"
    fine_tuned_model_name = "phi_lora_social_qa"
    
    finetune_model, finetune_tokenizer = FastLanguageModel.from_pretrained(
        model_name=fine_tuned_model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(finetune_model)
    finetune_model.to(device)

    print("\nEvaluating fine-tuned model...")
    evaluate_model(finetune_model, finetune_tokenizer, validation_data, device=device, model_name="Fine-Tuned Model")
########################
if __name__ == "__main__":
    main()

Loading dataset: tau/commonsense_qa [validation]

Loading fine-tuned model (LoRA)...
==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.352 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Unsloth: Will load phi_lora_social_qa as a legacy tokenizer.



Evaluating fine-tuned model...
[Fine-Tuned Model] Accuracy: 76.00%  (76/100)
[Fine-Tuned Model] Invalid Responses: 0/100 (0.00%)
