In [None]:
from datasets import load_dataset
import torch
import pandas as pd
import torch
import random
import numpy as np
import re
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel

# Seed every random number generator used in this notebook with the same value
seed = 42
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)

# Do not truncate long text columns when DataFrames are displayed
pd.options.display.max_colwidth = None


In [None]:
# Load the SocialIQA dataset (no split is selected here).
# NOTE(review): `socialiqa` is not referenced again in the visible notebook;
# later cells load the validation split through load_dataset_validation instead.
socialiqa = load_dataset(
    path="allenai/social_i_qa",
    trust_remote_code=True,
)

In [None]:

# Prompt template with seven positional slots, in order:
# instruction, context, question, choice A, choice B, choice C, response.
# Built from adjacent literals so every newline of the template is explicit.
chat_prompt = (
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Context:\n"
    "{}\n"
    "\n"
    "### Question:\n"
    "{}\n"
    "\n"
    "### Choices:\n"
    "A: {}\n"
    "B: {}\n"
    "C: {}\n"
    "\n"
    "### Response:\n"
    "{}"
)

def load_dataset_validation(dataset_name, split):
    """Load one split of a dataset through `load_dataset`.

    Despite the function name, the split that is loaded is whatever the
    caller passes in `split`. A progress line is printed before loading.

    Args:
        dataset_name: dataset identifier forwarded to `load_dataset`.
        split: split name forwarded as `split=`.

    Returns:
        Whatever `load_dataset(dataset_name, split=split)` returns.
    """
    print(f"Loading dataset: {dataset_name} [{split}]")
    return load_dataset(dataset_name, split=split)

def load_model(model_name, max_seq_length=2048, dtype=None, load_in_4bit=True, device="cuda"):
    """Load a model/tokenizer pair with unsloth and prepare the model for inference.

    Args:
        model_name: name or path handed to `FastLanguageModel.from_pretrained`.
        max_seq_length: forwarded unchanged to `from_pretrained`.
        dtype: forwarded unchanged to `from_pretrained`.
        load_in_4bit: forwarded unchanged to `from_pretrained`.
        device: target passed to `model.to(...)` after loading.

    Returns:
        The `(model, tokenizer)` pair, after `FastLanguageModel.for_inference`
        and `model.to(device)` have been called on the model.
    """
    print(f"Loading model: {model_name}")
    load_kwargs = dict(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    loaded = FastLanguageModel.from_pretrained(**load_kwargs)
    model, tokenizer = loaded
    FastLanguageModel.for_inference(model)
    model.to(device)
    return model, tokenizer

def get_letter_from_response(text):
    """Extract the chosen option letter (A, B or C) from a model response.

    SocialIQA has three options, so D/E are never accepted. The search runs
    in three passes, from most to least reliable:

    1. The response *opens* with the letter, optionally parenthesised and
       followed by a delimiter or the end of the text ("B", "b)", "(C) ...",
       "A: ..."). Case-insensitive.
    2. The first isolated upper-case A/B/C anywhere in the text. This keeps
       the lower-case article "a" from being read as answer A
       ("I think a good choice is B" -> "B").
    3. The first isolated letter in any case, so every input that used to
       yield a letter still yields one ("the answer is c" -> "C").

    Known limitation: a sentence starting with the capitalised article
    ("A good answer is C") is still read as "A".

    Args:
        text: model response; `None` or an empty string is accepted.

    Returns:
        "A", "B" or "C", or `None` when no option letter is found.
    """
    if not text:
        return None

    # Pass 1: letter at the very start, closed by ")" or followed by
    # punctuation / newline / end of text.
    leading = re.match(r"\s*\(?([ABC])(?:\)|[ \t]*(?:[.:,;\-]|\n|$))", text, re.IGNORECASE)
    if leading:
        return leading.group(1).upper()

    # Pass 2 (upper-case only), then pass 3 (any case) as a last resort.
    match = re.search(r"\b[ABC]\b", text) or re.search(r"\b[ABC]\b", text, re.IGNORECASE)
    return match.group(0).upper() if match else None

def get_true_answer(label):
    """Translate a 1-based answer label into its option letter.

    Args:
        label: anything `int()` accepts (e.g. "1", "2", 3).

    Returns:
        'A', 'B' or 'C' for labels 1, 2, 3; `None` for any other integer.

    Raises:
        ValueError / TypeError: propagated from `int(label)` when the label
        cannot be converted.
    """
    position = int(label)
    if 1 <= position <= 3:
        return "ABC"[position - 1]
    return None

def evaluate_model(model, tokenizer, dataset, device="cuda", model_name="Model"):
    """Measure multiple-choice accuracy of `model` on `dataset`.

    For every row a prompt is built from `chat_prompt`, up to 64 new tokens
    are generated, and the option letter parsed from the *generated* text is
    compared with the gold letter derived from the row's 'label'.

    Args:
        model: object exposing `generate(**inputs, max_new_tokens=..., use_cache=...)`.
        tokenizer: callable returning tensors (with an "input_ids" entry) and
            providing `decode`.
        dataset: indexable rows with keys 'context', 'question', 'answerA',
            'answerB', 'answerC' and 'label'.
        device: device the tokenized inputs are moved to.
        model_name: label used in the printed summary.

    Returns:
        Accuracy as a float (correct / total); 0.0 for an empty dataset.
        Responses without a recognisable letter count as incorrect and are
        reported separately as invalid.
    """
    total = len(dataset)
    if total == 0:
        # Avoid ZeroDivisionError in the summary below.
        print(f"[{model_name}] Empty dataset - nothing to evaluate.")
        return 0.0

    # NOTE(review): "emphathetic" is misspelled; left untouched in case the
    # fine-tuned adapters were trained with this exact instruction - confirm.
    instruction = "Answer the multiple-choice question below based on the provided context. Be emphathetic. Your response must be a single letter: A, B, or C."

    correct = 0
    invalid_responses = 0
    for i in range(total):
        example = dataset[i]  # fetch the row once instead of once per field
        true_answer = get_true_answer(example['label'])

        prompt = chat_prompt.format(
            instruction,
            example['context'],
            example['question'],
            example['answerA'],
            example['answerB'],
            example['answerC'],
            "",
        )
        inputs = tokenizer([prompt], return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )

        # generate() output is assumed to start with the prompt tokens
        # (decoder-only model). Decoding only the continuation keeps the
        # prompt's own "A:" / "B:" / "C:" labels, and any extra
        # "### Response:" block the model may invent, out of the parsing.
        prompt_length = inputs["input_ids"].shape[1]
        model_response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        predicted_letter = get_letter_from_response(model_response)

        if predicted_letter is None:
            invalid_responses += 1
        elif predicted_letter == true_answer:
            correct += 1

    accuracy = correct / total
    print(f"[{model_name}] Accuracy: {accuracy:.2%} ({correct}/{total})")
    print(f"[{model_name}] Invalid Responses: {invalid_responses}/{total} ({invalid_responses/total:.2%})")
    return accuracy


def create_psychological_df(model, tokenizer, dataset, device="cuda", model_name="Model"):
    """Collect the model's answer letter for every row of `dataset`.

    Each row is turned into a prompt via `chat_prompt`, up to 128 new tokens
    are generated, and the option letter parsed from the generated text is
    recorded.

    Args:
        model: object exposing `generate(**inputs, max_new_tokens=..., use_cache=...)`.
        tokenizer: callable returning tensors (with an "input_ids" entry) and
            providing `decode`.
        dataset: indexable rows with keys 'context', 'question', 'answerA',
            'answerB' and 'answerC'.
        device: device the tokenized inputs are moved to.
        model_name: accepted for signature parity with `evaluate_model`;
            not used here.

    Returns:
        pandas.DataFrame with columns 'id' (row position), 'context',
        'question' and 'answer' ('A'/'B'/'C', or None when no letter was
        found). The columns are present even when `dataset` is empty.
    """
    # NOTE(review): "emphathetic" is misspelled; left untouched in case the
    # fine-tuned adapters were trained with this exact instruction - confirm.
    instruction = "Answer the multiple-choice question below based on the provided context. Be emphathetic. Your response must be a single letter: A, B, or C."

    results = []
    for i in range(len(dataset)):
        example = dataset[i]  # fetch the row once instead of once per field
        context = example['context']
        question = example['question']

        prompt = chat_prompt.format(
            instruction,
            context,
            question,
            example['answerA'],
            example['answerB'],
            example['answerC'],
            "",
        )
        inputs = tokenizer([prompt], return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                use_cache=True
            )

        # generate() output is assumed to start with the prompt tokens
        # (decoder-only model). With 128 new tokens the model has room to
        # invent a follow-up question and a second "### Response:" block, so
        # only the continuation is decoded and its first answer is used.
        prompt_length = inputs["input_ids"].shape[1]
        model_response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        results.append({
            "id": i,
            "context": context,
            "question": question,
            "answer": get_letter_from_response(model_response),
        })

    # Explicit columns keep the schema stable for an empty dataset.
    return pd.DataFrame(results, columns=["id", "context", "question", "answer"])




In [None]:

# Prompt template with seven positional slots, in order:
# instruction, context, question, choice A, choice B, choice C, response.
# NOTE(review): `chat_prompt` is also assigned in an earlier cell; this later
# assignment is the one in effect after Run All.
chat_prompt = "\n".join([
    "",
    "### Instruction:",
    "{}",
    "",
    "### Context:",
    "{}",
    "",
    "### Question:",
    "{}",
    "",
    "### Choices:",
    "A: {}",
    "B: {}",
    "C: {}",
    "",
    "### Response:",
    "{}",
])

# NOTE(review): this function is also defined in an earlier cell; the later
# definition is the one in effect after Run All.
def load_dataset_validation(dataset_name, split):
    """Print a progress line, then load the requested split via `load_dataset`.

    The split is taken from the `split` argument; nothing restricts it to
    "validation".
    """
    print(f"Loading dataset: {dataset_name} [{split}]")
    loaded_split = load_dataset(dataset_name, split=split)
    return loaded_split

# NOTE(review): this function is also defined in an earlier cell; the later
# definition is the one in effect after Run All.
def load_model(model_name, max_seq_length=2048, dtype=None, load_in_4bit=True, device="cuda"):
    """Load a model/tokenizer pair with unsloth and prepare the model for inference.

    `model_name`, `max_seq_length`, `dtype` and `load_in_4bit` are forwarded
    to `FastLanguageModel.from_pretrained`. The model is then passed to
    `FastLanguageModel.for_inference` and moved with `model.to(device)`.

    Returns:
        The `(model, tokenizer)` pair.
    """
    print(f"Loading model: {model_name}")
    pair = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    model, tokenizer = pair
    FastLanguageModel.for_inference(model)
    model.to(device)
    return model, tokenizer

# NOTE(review): this function is also defined in an earlier cell; the later
# definition is the one in effect after Run All.
def get_letter_from_response(text):
    """Extract the chosen option letter (A, B or C) from a model response.

    SocialIQA has three options, so D/E are never accepted. The search runs
    in three passes, from most to least reliable:

    1. The response *opens* with the letter, optionally parenthesised and
       followed by a delimiter or the end of the text ("B", "b)", "(C) ...",
       "A: ..."). Case-insensitive.
    2. The first isolated upper-case A/B/C anywhere in the text. This keeps
       the lower-case article "a" from being read as answer A
       ("I think a good choice is B" -> "B").
    3. The first isolated letter in any case, so every input that used to
       yield a letter still yields one ("the answer is c" -> "C").

    Known limitation: a sentence starting with the capitalised article
    ("A good answer is C") is still read as "A".

    Args:
        text: model response; `None` or an empty string is accepted.

    Returns:
        "A", "B" or "C", or `None` when no option letter is found.
    """
    if not text:
        return None

    # Pass 1: letter at the very start, closed by ")" or followed by
    # punctuation / newline / end of text.
    leading = re.match(r"\s*\(?([ABC])(?:\)|[ \t]*(?:[.:,;\-]|\n|$))", text, re.IGNORECASE)
    if leading:
        return leading.group(1).upper()

    # Pass 2 (upper-case only), then pass 3 (any case) as a last resort.
    match = re.search(r"\b[ABC]\b", text) or re.search(r"\b[ABC]\b", text, re.IGNORECASE)
    return match.group(0).upper() if match else None

# NOTE(review): this function is also defined in an earlier cell; the later
# definition is the one in effect after Run All.
def get_true_answer(label):
    """Map a gold label (1, 2 or 3, possibly given as a string) to 'A', 'B' or 'C'.

    The label is converted with `int()` first, so conversion errors
    propagate. Any integer outside 1..3 yields `None`.
    """
    letter_by_index = dict(enumerate("ABC", start=1))
    return letter_by_index.get(int(label))

# NOTE(review): this function is also defined in an earlier cell; the later
# definition is the one in effect after Run All.
def evaluate_model(model, tokenizer, dataset, device="cuda", model_name="Model"):
    """Measure multiple-choice accuracy of `model` on `dataset`.

    For every row a prompt is built from `chat_prompt`, up to 64 new tokens
    are generated, and the option letter parsed from the *generated* text is
    compared with the gold letter derived from the row's 'label'.

    Args:
        model: object exposing `generate(**inputs, max_new_tokens=..., use_cache=...)`.
        tokenizer: callable returning tensors (with an "input_ids" entry) and
            providing `decode`.
        dataset: indexable rows with keys 'context', 'question', 'answerA',
            'answerB', 'answerC' and 'label'.
        device: device the tokenized inputs are moved to.
        model_name: label used in the printed summary.

    Returns:
        Accuracy as a float (correct / total); 0.0 for an empty dataset.
        Responses without a recognisable letter count as incorrect and are
        reported separately as invalid.
    """
    total = len(dataset)
    if total == 0:
        # Avoid ZeroDivisionError in the summary below.
        print(f"[{model_name}] Empty dataset - nothing to evaluate.")
        return 0.0

    # NOTE(review): "emphathetic" is misspelled; left untouched in case the
    # fine-tuned adapters were trained with this exact instruction - confirm.
    instruction = "Answer the multiple-choice question below based on the provided context. Be emphathetic. Your response must be a single letter: A, B, or C."

    correct = 0
    invalid_responses = 0
    for i in range(total):
        example = dataset[i]  # fetch the row once instead of once per field
        true_answer = get_true_answer(example['label'])

        prompt = chat_prompt.format(
            instruction,
            example['context'],
            example['question'],
            example['answerA'],
            example['answerB'],
            example['answerC'],
            "",
        )
        inputs = tokenizer([prompt], return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                use_cache=True
            )

        # generate() output is assumed to start with the prompt tokens
        # (decoder-only model). Decoding only the continuation keeps the
        # prompt's own "A:" / "B:" / "C:" labels, and any extra
        # "### Response:" block the model may invent, out of the parsing.
        prompt_length = inputs["input_ids"].shape[1]
        model_response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        predicted_letter = get_letter_from_response(model_response)

        if predicted_letter is None:
            invalid_responses += 1
        elif predicted_letter == true_answer:
            correct += 1

    accuracy = correct / total
    print(f"[{model_name}] Accuracy: {accuracy:.2%} ({correct}/{total})")
    print(f"[{model_name}] Invalid Responses: {invalid_responses}/{total} ({invalid_responses/total:.2%})")
    return accuracy


In [None]:
def create_psychological_df_with_logical_knowledge(model, tokenizer, dataset, df_logic, device="cuda", model_name="Model"):
    """Collect the model's answer letters when it is shown a logic expert's answer.

    Row `i` of `dataset` is paired with row `i` of `df_logic` (positional
    match); the expert's answer is inserted into the prompt, up to 128 new
    tokens are generated, and the option letter parsed from the generated
    text is recorded.

    Args:
        model: object exposing `generate(**inputs, max_new_tokens=..., use_cache=...)`.
        tokenizer: callable returning tensors (with an "input_ids" entry) and
            providing `decode`.
        dataset: indexable rows with keys 'context', 'question', 'answerA',
            'answerB' and 'answerC'.
        df_logic: DataFrame with an 'answer' column and at least
            `len(dataset)` rows.
        device: device the tokenized inputs are moved to.
        model_name: accepted for signature parity with the other helpers;
            not used here.

    Returns:
        pandas.DataFrame with columns 'id' (row position), 'context',
        'question' and 'answer' ('A'/'B'/'C', or None when no letter was
        found).

    Raises:
        ValueError: if `df_logic` has fewer rows than `dataset`.
    """
    # Fail before any generation rather than with an IndexError mid-run.
    if len(df_logic) < len(dataset):
        raise ValueError(
            f"df_logic has {len(df_logic)} rows but dataset has {len(dataset)}; "
            "rows are matched by position"
        )

    results = []
    for i in range(len(dataset)):
        example = dataset[i]  # fetch the row once instead of once per field
        context = example['context']
        question = example['question']
        answerA = example['answerA']
        answerB = example['answerB']
        answerC = example['answerC']

        logical_exp_answer = df_logic.iloc[i]["answer"]
        # A missing answer (None saved to CSV comes back as NaN) would
        # otherwise be rendered as the literal text "nan" in the prompt.
        if pd.isna(logical_exp_answer):
            logical_exp_answer = "unknown"

        # NOTE(review): the instruction reads "...based on the context Choose
        # only one letter..." (no sentence break). Left byte-identical so
        # results stay comparable with earlier runs - confirm whether to fix.
        prompt = (
            "### Instruction:\n"
            "Answer the following multiple-choice question based on the context "
            "Choose only one letter: A, B, or C. Do not explain your answer.\n\n"
            f"### Context:\n{context}\n\n"
            f"### Question:\n{question}\n\n"
            f"### Choices:\n"
            f"A: {answerA}\n"
            f"B: {answerB}\n"
            f"C: {answerC}\n\n"
            f"### Thoughts of logical expert:\n"
            f"An expert in Logical Reasoning thinks the answer is {logical_exp_answer}.\n\n"
            f"### Response:\n"
        )
        inputs = tokenizer([prompt], return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                use_cache=True
            )

        # generate() output is assumed to start with the prompt tokens
        # (decoder-only model). Only the continuation is decoded, so neither
        # the prompt (which names the expert's letter) nor a model-invented
        # second "### Response:" block can be mistaken for the answer.
        prompt_length = inputs["input_ids"].shape[1]
        model_response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        results.append({
            "id": i,
            "context": context,
            "question": question,
            "answer": get_letter_from_response(model_response),
        })

    # Explicit columns keep the schema stable for an empty dataset.
    return pd.DataFrame(results, columns=["id", "context", "question", "answer"])




In [None]:
# import argparse
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
# Load test data: the first 1000 rows of the SocialIQA validation split
validation_data = load_dataset_validation("allenai/social_i_qa", "validation")
validation_data = validation_data.select(range(1000))

# Loading / inference settings
device = "cuda" if torch.cuda.is_available() else "cpu"
max_seq_length = 2048
dtype = None
load_in_4bit = True


print("\nLoading fine-tuned model (LoRA)...")
fine_tuned_model_name = "phi_lora_social_qa"
# fine_tuned_model_name = "phi_lora_commonsense_qa"

# load_model wraps exactly the steps that were previously spelled out here
# (from_pretrained -> for_inference -> .to(device)), so reuse it instead of
# keeping a second copy of that sequence in this cell.
finetune_model, finetune_tokenizer = load_model(
    model_name=fine_tuned_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device=device,
)



In [None]:
# "\C" is not a valid escape sequence; a leading newline was intended, as in
# the notebook's other progress messages.
print("\nCreating psychological df ...")
# Pass the device computed when the model was loaded instead of a hard-coded
# "cuda", so the CPU fallback keeps working.
psych_df = create_psychological_df(
    finetune_model,
    finetune_tokenizer,
    validation_data,
    device=device,
    model_name="Fine-Tuned Model",
)
########################


In [None]:
# Persist the answers so later cells can read them back with pd.read_csv.
# The relative "csvs/" directory must already exist; the DataFrame index is
# written as an extra unnamed first column.
psych_df.to_csv(
    path_or_buf="csvs/phi_social_tuned_with_psych_q.csv",
)

In [None]:
# "\C" is not a valid escape sequence; a leading newline was intended.
print("\nCreating psychological df ...")

# create_psychological_df_with_logical_knowledge requires the logic expert's
# answers as its 4th argument; the original call omitted it and raised
# TypeError.
# NOTE(review): the path below is the file the final evaluation cell loads as
# `df_logic`; it is not written anywhere in this notebook - confirm it is the
# intended source.
df_logic = pd.read_csv("csvs/phi_common_tuned_with_psych_q.csv")

psych_df_with_logical_knowledge = create_psychological_df_with_logical_knowledge(
    finetune_model,
    finetune_tokenizer,
    validation_data,
    df_logic,
    device=device,  # computed at model-loading time; avoids hard-coding "cuda"
    model_name="Fine-Tuned Model",
)

########################


In [None]:
# Persist the logic-informed answers. The relative "csvs/" directory must
# already exist; the DataFrame index is written as an extra unnamed column.
psych_df_with_logical_knowledge.to_csv(
    path_or_buf="csvs/phi_social_tuned_psych_with_logic.csv",
)

In [None]:
# import argparse
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset


# Load test data: keep only the first 1000 rows of the SocialIQA validation split
validation_data = load_dataset_validation("allenai/social_i_qa", "validation").select(range(1000))

# Settings read by the model-loading cells that follow
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): 64 is far below the 2048 used when the fine-tuned model was
# loaded earlier, and the prompts built in this notebook are likely longer
# than 64 tokens - confirm this value is intentional.
max_seq_length = 64
dtype, load_in_4bit = None, True


### Original Model

In [None]:
print("Loading original model...")
original_model_name = "unsloth/Phi-4"
# load_model already calls FastLanguageModel.for_inference(model) and
# model.to(device), so those two steps are not repeated here.
original_model, original_tokenizer = load_model(
    model_name=original_model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    device=device,
)

print("\nEvaluating original model...")
evaluate_model(original_model, original_tokenizer, validation_data, device=device, model_name="Original Model")

### Fine-tuned model

In [None]:
print("\nLoading fine-tuned model (LoRA)...")
# fine_tuned_model_name = "phi_lora_social_qa"
fine_tuned_model_name = "phi_lora_commonsense_qa"

# load_model wraps exactly the steps that were previously spelled out here
# (from_pretrained -> for_inference -> .to(device)), so reuse it instead of
# keeping another copy of that sequence in this cell.
finetune_model, finetune_tokenizer = load_model(
    model_name=fine_tuned_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device=device,
)

print("\nEvaluating fine-tuned model...")
evaluate_model(finetune_model, finetune_tokenizer, validation_data, device=device, model_name="Fine-Tuned Model")

### Receive answers from two experts

In [None]:
def evaluate_model_with_specialized_modules(model, df_psych, df_logic, tokenizer, dataset, device="cuda", model_name="Model"):
    """Measure accuracy when the model is shown two experts' answers.

    Row `i` of `dataset` is paired with row `i` of `df_psych` and `df_logic`
    (positional match). Both expert answers are inserted into the prompt, up
    to 10 new tokens are generated, and the option letter parsed from the
    generated text is compared with the gold letter from the row's 'label'.

    Args:
        model: object exposing `generate(**inputs, max_new_tokens=..., use_cache=...)`.
        df_psych: DataFrame with an 'answer' column (psychology expert).
        df_logic: DataFrame with an 'answer' column (logic expert).
        tokenizer: callable returning tensors (with an "input_ids" entry) and
            providing `decode`.
        dataset: indexable rows with keys 'context', 'question', 'answerA',
            'answerB', 'answerC' and 'label'.
        device: device the tokenized inputs are moved to.
        model_name: label used in the printed summary.

    Returns:
        Accuracy as a float (correct / total); 0.0 for an empty dataset.

    Raises:
        ValueError: if either expert DataFrame has fewer rows than `dataset`.
    """
    total = len(dataset)
    if total == 0:
        # Avoid ZeroDivisionError in the summary below.
        print(f"[{model_name}] Empty dataset - nothing to evaluate.")
        return 0.0

    # Fail before any generation rather than with an IndexError mid-run.
    if len(df_psych) < total or len(df_logic) < total:
        raise ValueError(
            f"expert tables have {len(df_psych)} / {len(df_logic)} rows but "
            f"dataset has {total}; rows are matched by position"
        )

    correct = 0
    invalid_responses = 0
    for i in range(total):
        example = dataset[i]  # fetch the row once instead of once per field
        context = example['context']
        question = example['question']
        answerA = example['answerA']
        answerB = example['answerB']
        answerC = example['answerC']
        true_answer = get_true_answer(example['label'])

        psych_exp_answer = df_psych.iloc[i]["answer"]
        logical_exp_answer = df_logic.iloc[i]["answer"]
        # A missing answer (None saved to CSV comes back as NaN) would
        # otherwise be rendered as the literal text "nan" in the prompt.
        if pd.isna(psych_exp_answer):
            psych_exp_answer = "unknown"
        if pd.isna(logical_exp_answer):
            logical_exp_answer = "unknown"

        prompt = (
            "### Instruction:\n"
            "Answer the following multiple-choice question based on the context and the thoughts of two experts. "
            "Choose only one letter: A, B, or C. Do not explain your answer.\n\n"
            f"### Context:\n{context}\n\n"
            f"### Question:\n{question}\n\n"
            f"### Choices:\n"
            f"A: {answerA}\n"
            f"B: {answerB}\n"
            f"C: {answerC}\n\n"
            f"### Thoughts of experts:\n"
            f"An expert in Psychological Understanding thinks the answer is {psych_exp_answer}.\n"
            f"An expert in Logical Reasoning thinks the answer is {logical_exp_answer}.\n\n"
            f"### Response:\n"
        )
        inputs = tokenizer([prompt], return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=10,
                use_cache=True
            )

        # generate() output is assumed to start with the prompt tokens
        # (decoder-only model). Only the continuation is decoded, with
        # special tokens stripped as in the other evaluation loops, so the
        # experts' letters quoted in the prompt can never be parsed as the
        # model's answer.
        prompt_length = inputs["input_ids"].shape[1]
        model_response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        predicted_letter = get_letter_from_response(model_response)

        if predicted_letter is None:
            invalid_responses += 1
        elif predicted_letter == true_answer:
            correct += 1

    accuracy = correct / total
    print(f"[{model_name}] Accuracy: {accuracy:.2%} ({correct}/{total})")
    print(f"[{model_name}] Invalid Responses: {invalid_responses}/{total} ({invalid_responses/total:.2%})")
    return accuracy

In [None]:
print("Loading original model...")
original_model_name = "unsloth/Phi-4"
# load_model already calls FastLanguageModel.for_inference(model) and
# model.to(device), so those two steps are not repeated here.
# NOTE(review): an earlier cell loads this same model under the same names;
# running both loads the weights twice.
original_model, original_tokenizer = load_model(
    model_name=original_model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    device=device,
)

In [None]:
print("\nEvaluating original model...")
# evaluate_model(original_model, original_tokenizer, validation_data, device=device, model_name="Original Model")

# Expert answers produced by earlier runs; rows are matched to
# validation_data by position.
# NOTE(review): "phi_common_tuned_with_psych_q.csv" is not written anywhere
# in this notebook - confirm it exists and was built from the same 1000 rows.
df_psych = pd.read_csv("csvs/phi_social_tuned_with_psych_q.csv")
df_logic = pd.read_csv("csvs/phi_common_tuned_with_psych_q.csv")

# Use the device the model was actually loaded on rather than a hard-coded
# "cuda", so the CPU fallback computed earlier keeps working.
evaluate_model_with_specialized_modules(
    original_model,
    df_psych,
    df_logic,
    original_tokenizer,
    validation_data,
    device=device,
    model_name="Model",
)