In [1]:
from datasets import load_dataset
import torch
import os
import pandas as pd
import torch
import random
import numpy as np

# Seed every random number generator used in this notebook with the same value
# so that repeated runs are comparable.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Show full cell contents instead of truncating long text columns.
pd.set_option("display.max_colwidth", None)


ModuleNotFoundError: No module named 'datasets'

In [2]:
# trained_folder = "./fine-tuned/lora_model_osloth_commonsense_qa"
# Load the `tau/commonsense_qa` dataset; no `split` argument is passed here.
# NOTE(review): `ds` is not referenced by any later cell shown in this notebook
# (the evaluation cells call load_dataset_validation instead) — confirm it is still needed.
ds = load_dataset("tau/commonsense_qa")

NameError: name 'load_dataset' is not defined

In [3]:
import json
import re
import torch

# Assuming you have installed 'unsloth' and 'trl'
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# Prompt template with four positional slots, filled in this order by
# `chat_prompt.format(instruction, question, options_str, response)`.
# The evaluation helpers pass "" for the last slot, so the prompt ends right
# after the "### Response:" header.
chat_prompt = """
### Instruction:
{}

### Question:
{}

### Choices:
{}

### Response:
{}"""

def load_dataset_validation(dataset_name, split):
    """Fetch a single split of a dataset, announcing the request on stdout.

    Despite the function name, `split` is forwarded unchanged to
    `load_dataset`, so any split name can be requested.

    Args:
        dataset_name: Identifier handed to `load_dataset`.
        split: Name of the split to load.

    Returns:
        Whatever `load_dataset(dataset_name, split=split)` returns.
    """
    print(f"Loading dataset: {dataset_name} [{split}]")
    return load_dataset(dataset_name, split=split)

def load_model(model_name, max_seq_length=2048, dtype=None, load_in_4bit=True, device="cuda"):
    """Load a model with Unsloth and prepare it for inference.

    Args:
        model_name: Model identifier or local path passed to
            `FastLanguageModel.from_pretrained`.
        max_seq_length: Forwarded to `from_pretrained`.
        dtype: Forwarded to `from_pretrained` (None leaves the choice to the library).
        load_in_4bit: Forwarded to `from_pretrained`; requests 4-bit loading.
        device: Target device for models that are NOT quantized.

    Returns:
        A `(model, tokenizer)` tuple, with the model switched to inference mode.
    """
    print(f"Loading model: {model_name}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

    # bitsandbytes-quantized models are placed on their device while loading, and
    # Transformers can reject an explicit `.to()` on them (always for 8-bit, and
    # for 4-bit with older bitsandbytes), so only full-precision models are moved.
    is_quantized = (
        load_in_4bit
        or getattr(model, "is_loaded_in_4bit", False)
        or getattr(model, "is_loaded_in_8bit", False)
    )
    if not is_quantized:
        model.to(device)
    return model, tokenizer

def get_letter_from_response(text):
    """Extract the answer letter (A-E) from a model response.

    An upper-case standalone letter is preferred, so that ordinary lower-case
    words such as the article "a" are not mistaken for answer "A". Only when
    no upper-case candidate exists is a lower-case standalone letter accepted.

    Args:
        text: Decoded model response.

    Returns:
        The upper-cased letter, or None when `text` is not a non-empty string
        or contains no standalone letter in A-E.
    """
    if not isinstance(text, str) or not text:
        return None
    match = re.search(r"\b[A-E]\b", text) or re.search(r"\b[a-e]\b", text)
    return match.group(0).upper() if match else None

def evaluate_model(model, tokenizer, dataset, device="cuda", model_name="Model"):
    """Score a model on multiple-choice questions by exact match on the answer letter.

    Args:
        model: Object exposing `generate(**inputs, max_new_tokens=..., use_cache=...)`.
        tokenizer: Callable that encodes a list of prompts and exposes `decode`.
        dataset: Column-oriented mapping with `id`, `question`, `choices`
            (each entry holding `label` and `text` lists) and `answerKey`.
        device: Device the encoded prompt is moved to before generation.
        model_name: Label used in the printed summary.

    Returns:
        Fraction of questions answered correctly; 0.0 for an empty dataset.
    """
    instruction = "Answer the multiple-choice question below based on the provided context. Your response has to be A, B, C, D, or E"

    # Fetch each column once instead of re-indexing the dataset on every iteration.
    questions = dataset['question']
    choice_sets = dataset['choices']
    answer_keys = dataset['answerKey']
    total = len(dataset['id'])
    if total == 0:
        # Nothing to score; the ratios below would divide by zero.
        print(f"[{model_name}] No examples to evaluate.")
        return 0.0

    correct = 0
    invalid_responses = 0
    for i in range(total):
        options_str = "\n".join(
            f"{label}: {text}"
            for label, text in zip(choice_sets[i]['label'], choice_sets[i]['text'])
        )
        prompt = chat_prompt.format(instruction, questions[i], options_str, "")
        inputs = tokenizer([prompt], return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )

        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # When the decoded text contains the response header, keep only what
        # follows its last occurrence; otherwise use the text as-is.
        if "### Response:" in decoded:
            model_response = decoded.split("### Response:")[-1].strip()
        else:
            model_response = decoded

        predicted_letter = get_letter_from_response(model_response)
        if predicted_letter is None:
            invalid_responses += 1
        elif predicted_letter == answer_keys[i]:
            correct += 1

    accuracy = correct / total
    print(f"[{model_name}] Accuracy: {accuracy:.2%}  ({correct}/{total})")
    print(f"[{model_name}] Invalid Responses: {invalid_responses}/{total} ({invalid_responses/total:.2%})")
    return accuracy




def create_logic_df(model, tokenizer, dataset, device="cuda", model_name="Model"):
    """Collect the model's predicted answer letter for every question.

    Args:
        model: Object exposing `generate(**inputs, max_new_tokens=..., use_cache=...)`.
        tokenizer: Callable that encodes a list of prompts and exposes `decode`.
        dataset: Column-oriented mapping with `id`, `question` and `choices`
            (each entry holding `label` and `text` lists).
        device: Device the encoded prompt is moved to before generation.
        model_name: Unused; kept so the signature matches the other helpers.

    Returns:
        DataFrame with columns `id`, `question`, `answer`, one row per
        question. `answer` is None when no letter could be extracted. The
        columns are present even when the dataset is empty.
    """
    instruction = (
        "Read the following multiple-choice question and its context carefully and answer with explanation"
    )

    # Fetch each column once instead of re-indexing the dataset on every iteration.
    ids = dataset['id']
    questions = dataset['question']
    choice_sets = dataset['choices']

    results = []
    for i in range(len(ids)):
        question = questions[i]
        options_str = "\n".join(
            f"{label}: {text}"
            for label, text in zip(choice_sets[i]['label'], choice_sets[i]['text'])
        )
        prompt = chat_prompt.format(instruction, question, options_str, "")

        inputs = tokenizer([prompt], return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )

        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # When the decoded text contains the response header, keep only what
        # follows its last occurrence; otherwise use the text as-is.
        if "### Response:" in decoded:
            model_response = decoded.split("### Response:")[-1].strip()
        else:
            model_response = decoded

        # Only the letter is stored; the free-text explanation is discarded.
        results.append({
            "id": ids[i],
            "question": question,
            "answer": get_letter_from_response(model_response),
        })

    return pd.DataFrame(results, columns=["id", "question", "answer"])





🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
def create_logic_df_with_psych_knowledge(model, tokenizer, dataset, df_psych, device="cuda", model_name="Model"):
    """Collect predicted answer letters while showing the model another expert's answer.

    Args:
        model: Object exposing `generate(**inputs, max_new_tokens=..., use_cache=...)`.
        tokenizer: Callable that encodes a list of prompts and exposes `decode`.
        dataset: Column-oriented mapping with `id`, `question` and `choices`
            (each entry holding `label` and `text` lists).
        df_psych: DataFrame with an `answer` column. Rows are matched to the
            dataset by position, so it must be in the same order.
        device: Device the encoded prompt is moved to before generation.
        model_name: Unused; kept so the signature matches the other helpers.

    Returns:
        DataFrame with columns `id`, `question`, `answer`, one row per
        question. `answer` is None when no letter could be extracted.
    """
    # Fetch each column once instead of re-indexing the dataset on every iteration.
    ids = dataset['id']
    questions = dataset['question']
    choice_sets = dataset['choices']

    results = []
    for i in range(len(ids)):
        question = questions[i]
        options_str = "\n".join(
            f"{label}: {text}"
            for label, text in zip(choice_sets[i]['label'], choice_sets[i]['text'])
        )

        psych_exp_answer = df_psych.iloc[i]["answer"]
        # A missing answer (e.g. an empty CSV cell read back as NaN) would
        # otherwise be rendered literally as "nan" in the prompt.
        if pd.isna(psych_exp_answer):
            psych_exp_answer = "unknown"

        # Adjacent literals are concatenated verbatim, so every sentence carries
        # its own trailing separator.
        prompt = (
            "### Instruction:\n"
            "Answer the multiple-choice question below based on the provided context. "
            "You can learn from how another expert thinks. "
            "Your response has to be A, B, C, D, or E\n\n"
            f"### Question:\n{question}\n\n"
            "### Choices:\n"
            f"{options_str}\n\n"
            "### Thoughts of Psychological expert:\n"
            f"An expert in Psychological Understanding thinks the answer is {psych_exp_answer}.\n\n"
            "### Response:\n"
        )

        inputs = tokenizer([prompt], return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )

        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # When the decoded text contains the response header, keep only what
        # follows its last occurrence; otherwise use the text as-is.
        if "### Response:" in decoded:
            model_response = decoded.split("### Response:")[-1].strip()
        else:
            model_response = decoded

        results.append({
            "id": ids[i],
            "question": question,
            "answer": get_letter_from_response(model_response),
        })

    return pd.DataFrame(results, columns=["id", "question", "answer"])

In [5]:
import torch
import gc

# Collect unreachable Python objects first, so that any tensors they still hold
# are freed before the cache is emptied.
gc.collect()

# Then release the cached CUDA memory that is no longer occupied.
torch.cuda.empty_cache()

0

In [None]:
# import argparse
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset


# Load test data
# Only the first 1000 validation examples are kept; the helpers above index the
# sliced result column-first (e.g. validation_data['question'][i]).
validation_data = load_dataset_validation("tau/commonsense_qa", "validation")
validation_data = validation_data[:1000]

# Fall back to the CPU when no CUDA device is visible.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): 64 is far smaller than the 2048 / 500 used by the other loading
# cells — confirm it is long enough for the prompts built by the helpers.
max_seq_length = 64
dtype = None
load_in_4bit = True

print("\nLoading fine-tuned model (LoRA)...")
fine_tuned_model_name = "phi_lora_commonsense_qa"
# fine_tuned_model_name = "phi_lora_social_qa"

# NOTE(review): the recorded run of this cell failed with "Some modules are
# dispatched on the CPU or the disk". `device_map="auto"` is the only argument
# the other loading cells do not pass — confirm whether it is needed and whether
# enough GPU memory was free at the time.
finetune_model, finetune_tokenizer = FastLanguageModel.from_pretrained(
    model_name=fine_tuned_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="auto"
)
FastLanguageModel.for_inference(finetune_model)
# NOTE(review): an explicit move after loading with load_in_4bit=True and
# device_map="auto" may be rejected by the library — confirm.
finetune_model.to(device)



Loading dataset: tau/commonsense_qa [validation]

Loading fine-tuned model (LoRA)...
==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.352 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
print("\nCreating Df with a fine-tuned model...")
# Use the `device` chosen when the model was loaded rather than a hard-coded
# "cuda", so prompt tensors and model always end up on the same device.
logic_df = create_logic_df(finetune_model, finetune_tokenizer, validation_data, device=device, model_name="Fine-Tuned Model")
logic_df


In [None]:
# Create the output folder on first use; to_csv does not create missing directories.
os.makedirs("csvs", exist_ok=True)
logic_df.to_csv("csvs/phi_logical_tuned_logic_q.csv")

In [None]:
print("\nCreating Df with a fine-tuned model...")
# NOTE(review): this file is not written by any cell shown in this notebook —
# presumably it comes from a run with the "phi_lora_social_qa" adapter; confirm.
df_psych = pd.read_csv("csvs/phi_social_tuned_logic_q.csv")
# Use the `device` chosen when the model was loaded rather than a hard-coded "cuda".
logic_df_with_psych_knowledge = create_logic_df_with_psych_knowledge(finetune_model, finetune_tokenizer, validation_data, df_psych, device=device, model_name="Fine-Tuned Model")
os.makedirs("csvs", exist_ok=True)
logic_df_with_psych_knowledge.to_csv("csvs/phi_logical_tuned_logic_q_with_psych.csv")

In [None]:
# import argparse
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset

def main():
    """Evaluate the "phi_lora_social_qa" adapter on the first 100 CommonsenseQA validation questions.

    The baseline ("unsloth/Phi-4") evaluation is not run here; it lives in its
    own cell further down the notebook.
    """
    # Load test data
    validation_data = load_dataset_validation("tau/commonsense_qa", "validation")
    validation_data = validation_data[:100]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    max_seq_length = 2048
    dtype = None
    load_in_4bit = True

    print("\nLoading fine-tuned model (LoRA)...")
    # fine_tuned_model_name = "phi_lora_commonsense_qa"
    fine_tuned_model_name = "phi_lora_social_qa"

    # load_model wraps from_pretrained, for_inference and device placement, so
    # the fine-tuned adapter goes through the same loading path as the baseline.
    finetune_model, finetune_tokenizer = load_model(
        model_name=fine_tuned_model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        device=device,
    )

    print("\nEvaluating fine-tuned model...")
    evaluate_model(finetune_model, finetune_tokenizer, validation_data, device=device, model_name="Fine-Tuned Model")


if __name__ == "__main__":
    main()

In [None]:
# import argparse
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset


# Evaluation subset: the first 1000 examples of the validation split.
validation_data = load_dataset_validation("tau/commonsense_qa", "validation")[:1000]

# Shared loading settings for the cells below; use the GPU when one is visible.
device = "cuda" if torch.cuda.is_available() else "cpu"
max_seq_length, dtype, load_in_4bit = 500, None, True

 ### Original Model ###

In [None]:
print("Loading original model...")
original_model_name = "unsloth/Phi-4"
# load_model already switches the model to inference mode and handles device
# placement, so neither step is repeated here.
original_model, original_tokenizer = load_model(
    model_name=original_model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    device=device,
)

print("\nEvaluating original model...")
evaluate_model(original_model, original_tokenizer, validation_data, device=device, model_name="Original Model")

### Fine-tuned Model

In [None]:
print("\nLoading fine-tuned model (LoRA)...")
fine_tuned_model_name = "phi_lora_commonsense_qa"
# fine_tuned_model_name = "phi_lora_social_qa"

# Same loading path as the original model: load_model wraps from_pretrained,
# for_inference and device placement.
finetune_model, finetune_tokenizer = load_model(
    model_name=fine_tuned_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device=device,
)

print("\nEvaluating fine-tuned model...")
evaluate_model(finetune_model, finetune_tokenizer, validation_data, device=device, model_name="Fine-Tuned Model")

### Receive answers from two experts

In [None]:
def evaluate_model_with_specialized_modules(model,  df_psych, df_logic, tokenizer, dataset, device="cuda", model_name="Model"):
    """Score a model that is shown the answers of two expert models in its prompt.

    Args:
        model: Object exposing `generate(**inputs, max_new_tokens=..., use_cache=...)`.
        df_psych: DataFrame with an `answer` column (psychological expert).
        df_logic: DataFrame with an `answer` column (logical expert).
            Both frames are matched to the dataset by position, so they must
            be in the same order.
        tokenizer: Callable that encodes a list of prompts and exposes `decode`.
        dataset: Column-oriented mapping with `id`, `question`, `choices`
            (each entry holding `label` and `text` lists) and `answerKey`.
        device: Device the encoded prompt is moved to before generation.
        model_name: Label used in the printed summary.

    Returns:
        Fraction of questions answered correctly; 0.0 for an empty dataset.
    """
    # Fetch each column once instead of re-indexing the dataset on every iteration.
    questions = dataset['question']
    choice_sets = dataset['choices']
    answer_keys = dataset['answerKey']
    total = len(dataset['id'])
    if total == 0:
        # Nothing to score; the ratios below would divide by zero.
        print(f"[{model_name}] No examples to evaluate.")
        return 0.0

    correct = 0
    invalid_responses = 0
    for i in range(total):
        question = questions[i]
        options_str = "\n".join(
            f"{label}: {text}"
            for label, text in zip(choice_sets[i]['label'], choice_sets[i]['text'])
        )

        psych_exp_answer = df_psych.iloc[i]["answer"]
        logical_exp_answer = df_logic.iloc[i]["answer"]
        # A missing answer (e.g. an empty CSV cell read back as NaN) would
        # otherwise be rendered literally as "nan" in the prompt.
        if pd.isna(psych_exp_answer):
            psych_exp_answer = "unknown"
        if pd.isna(logical_exp_answer):
            logical_exp_answer = "unknown"

        # Adjacent literals are concatenated verbatim, so every sentence carries
        # its own trailing separator.
        prompt = (
            "### Instruction:\n"
            "Answer the multiple-choice question below based on the provided context and the thoughts of two experts. "
            "Your response has to be A, B, C, D, or E\n\n"
            f"### Question:\n{question}\n\n"
            "### Choices:\n"
            f"{options_str}\n\n"
            "### Thoughts of experts:\n"
            f"An expert in Psychological Understanding thinks the answer is {psych_exp_answer}.\n"
            f"An expert in Logical Reasoning thinks the answer is {logical_exp_answer}.\n\n"
            "### Response:\n"
        )

        inputs = tokenizer([prompt], return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )

        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # When the decoded text contains the response header, keep only what
        # follows its last occurrence; otherwise use the text as-is.
        if "### Response:" in decoded:
            model_response = decoded.split("### Response:")[-1].strip()
        else:
            model_response = decoded

        predicted_letter = get_letter_from_response(model_response)
        if predicted_letter is None:
            invalid_responses += 1
        elif predicted_letter == answer_keys[i]:
            correct += 1

    accuracy = correct / total
    print(f"[{model_name}] Accuracy: {accuracy:.2%}  ({correct}/{total})")
    print(f"[{model_name}] Invalid Responses: {invalid_responses}/{total} ({invalid_responses/total:.2%})")
    return accuracy


In [None]:
print("Loading original model...")
original_model_name = "unsloth/Phi-4"
# NOTE(review): an earlier cell loads the same model into `original_model`;
# running both keeps two copies alive until the name is rebound — confirm the
# GPU has room, or skip this cell when the model is already loaded.
# load_model already switches the model to inference mode and handles device
# placement, so neither step is repeated here.
original_model, original_tokenizer = load_model(
    model_name=original_model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    device=device,
)

In [None]:
print("\nEvaluating original model...")
# evaluate_model(original_model, original_tokenizer, validation_data, device=device, model_name="Original Model")
# Expert answers are matched to validation_data by row position.
df_psych = pd.read_csv("csvs/phi_social_tuned_logic_q.csv")
df_logic = pd.read_csv("csvs/phi_logical_tuned_logic_q.csv")
# Use the `device` the model was loaded on rather than a hard-coded "cuda".
evaluate_model_with_specialized_modules(original_model, df_psych, df_logic, original_tokenizer, validation_data, device=device, model_name="Model")

## Architecture 2

In [None]:
def evaluate_model_with_specialized_modules(model,  df_psych, df_logic, tokenizer, dataset, device="cuda", model_name="Model"):
    """Score a model that is shown two expert answers and told to pick the matching expert.

    This is the "architecture 2" variant: the instruction additionally asks the
    model to decide whether the task is logical or psychological and to follow
    the expert of that type. It rebinds the function of the same name defined
    in an earlier cell.

    Args:
        model: Object exposing `generate(**inputs, max_new_tokens=..., use_cache=...)`.
        df_psych: DataFrame with an `answer` column (psychological expert).
        df_logic: DataFrame with an `answer` column (logical expert).
            Both frames are matched to the dataset by position, so they must
            be in the same order.
        tokenizer: Callable that encodes a list of prompts and exposes `decode`.
        dataset: Column-oriented mapping with `id`, `question`, `choices`
            (each entry holding `label` and `text` lists) and `answerKey`.
        device: Device the encoded prompt is moved to before generation.
        model_name: Label used in the printed summary.

    Returns:
        Fraction of questions answered correctly; 0.0 for an empty dataset.
    """
    # Fetch each column once instead of re-indexing the dataset on every iteration.
    questions = dataset['question']
    choice_sets = dataset['choices']
    answer_keys = dataset['answerKey']
    total = len(dataset['id'])
    if total == 0:
        # Nothing to score; the ratios below would divide by zero.
        print(f"[{model_name}] No examples to evaluate.")
        return 0.0

    correct = 0
    invalid_responses = 0
    for i in range(total):
        question = questions[i]
        options_str = "\n".join(
            f"{label}: {text}"
            for label, text in zip(choice_sets[i]['label'], choice_sets[i]['text'])
        )

        psych_exp_answer = df_psych.iloc[i]["answer"]
        logical_exp_answer = df_logic.iloc[i]["answer"]
        # A missing answer (e.g. an empty CSV cell read back as NaN) would
        # otherwise be rendered literally as "nan" in the prompt.
        if pd.isna(psych_exp_answer):
            psych_exp_answer = "unknown"
        if pd.isna(logical_exp_answer):
            logical_exp_answer = "unknown"

        # Adjacent literals are concatenated verbatim, so every sentence carries
        # its own trailing separator.
        prompt = (
            "### Instruction:\n"
            "Answer the multiple-choice question below based on the provided context and the thoughts of two experts. "
            "Think about the type of task. It can be logical or psychological. Then learn from its type's expert model. "
            "Your response has to be A, B, C, D, or E\n\n"
            f"### Question:\n{question}\n\n"
            "### Choices:\n"
            f"{options_str}\n\n"
            "### Thoughts of experts:\n"
            f"An expert in Psychological Understanding thinks the answer is {psych_exp_answer}.\n"
            f"An expert in Logical Reasoning thinks the answer is {logical_exp_answer}.\n\n"
            "### Response:\n"
        )

        inputs = tokenizer([prompt], return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )

        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # When the decoded text contains the response header, keep only what
        # follows its last occurrence; otherwise use the text as-is.
        if "### Response:" in decoded:
            model_response = decoded.split("### Response:")[-1].strip()
        else:
            model_response = decoded

        predicted_letter = get_letter_from_response(model_response)
        if predicted_letter is None:
            invalid_responses += 1
        elif predicted_letter == answer_keys[i]:
            correct += 1

    accuracy = correct / total
    print(f"[{model_name}] Accuracy: {accuracy:.2%}  ({correct}/{total})")
    print(f"[{model_name}] Invalid Responses: {invalid_responses}/{total} ({invalid_responses/total:.2%})")
    return accuracy


In [None]:
print("Loading original model...")
original_model_name = "unsloth/Phi-4"
# NOTE(review): earlier cells load the same model into `original_model`;
# running this one as well keeps two copies alive until the name is rebound —
# confirm the GPU has room, or skip this cell when the model is already loaded.
# load_model already switches the model to inference mode and handles device
# placement, so neither step is repeated here.
original_model, original_tokenizer = load_model(
    model_name=original_model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    device=device,
)

In [None]:
print("\nEvaluating original model...")
# evaluate_model(original_model, original_tokenizer, validation_data, device=device, model_name="Original Model")
# Expert answers are matched to validation_data by row position.
df_psych = pd.read_csv("csvs/phi_social_tuned_logic_q.csv")
df_logic = pd.read_csv("csvs/phi_logical_tuned_logic_q.csv")
# Use the `device` the model was loaded on rather than a hard-coded "cuda".
evaluate_model_with_specialized_modules(original_model, df_psych, df_logic, original_tokenizer, validation_data, device=device, model_name="Model")