In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TorchAoConfig

# Hugging Face Hub identifier of the checkpoint loaded further down.
MODEL_NAME = "microsoft/phi-4"

# Pick the compute device with the precedence MPS > CUDA > CPU.
# torch.backends.mps.is_available() is the long-standing documented check;
# torch.mps.is_available() is only present in recent PyTorch releases.
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

print(f"Using device: {DEVICE}")

# Tokenizer that belongs to the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Disabled alternative: load the weights int4-quantized through torchao.
# quantization_config = TorchAoConfig("int4_weight_only", group_size=32)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.bfloat16,
#     quantization_config=quantization_config,
#     device_map=DEVICE,
# )

# Load the checkpoint with the library defaults, then move it to DEVICE.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)

Using device: mps


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [12]:
# Model memory footprint as reported by the model, divided by 10**9.
footprint_gb = model.get_memory_footprint() / 10**9
print(footprint_gb)

# Device holding the first parameter returned by model.parameters().
first_param = next(model.parameters())
print(first_param.device)

29.319014656
mps:0


In [23]:
import os, sys

# dir2 is the absolute current working directory; dir1 is its parent.
dir2 = os.path.abspath("")
dir1 = os.path.dirname(dir2)

# Append the parent directory to sys.path (once) so the packages imported
# below (`utils`, `complexity_estimation`) can be resolved from there.
if dir1 not in sys.path:
    sys.path.append(dir1)

import pandas as pd
import ast
import csv

import utils.prompt as prompt
import complexity_estimation.tokenwise_entropy as tokenwise_entropy

import importlib

# Re-import the project modules so that edits made to them since the kernel
# first imported them take effect (purges the cached module versions).
for _stale_module in (prompt, tokenwise_entropy):
    importlib.reload(_stale_module)


def estimate_dataset(
    df,
    model,
    tokenizer,
    get_subject_from_row,
    get_question_from_row,
    get_options_from_row,
    verify_answer,
):
    """Generate a one-token answer for every row of ``df`` and record the results.

    For each row a system/user chat prompt is built through the ``prompt``
    module, ``model.generate`` is asked for a single new token, and three
    columns are filled in on ``df``:

    * ``entropy_value``       -- value returned by ``TokenwiseEntropy.calculate``
      for the generated sequence,
    * ``entropy_ans_output``  -- decoded text of the newly generated token,
    * ``entropy_ans_correct`` -- result of ``verify_answer(row, answer)``.

    Relies on the module-level names ``DEVICE``, ``prompt`` and
    ``tokenwise_entropy``.

    Args:
        df: DataFrame to process. It is modified in place (the three result
            columns are reset at the start of every call).
        model: Model exposing ``generate``; also handed to ``TokenwiseEntropy``.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``decode`` and
            ``eos_token_id``.
        get_subject_from_row: Callable ``row -> subject`` for the system prompt.
        get_question_from_row: Callable ``row -> question`` for the user prompt.
        get_options_from_row: Callable ``row -> options`` for the user prompt.
        verify_answer: Callable ``(row, answer_text) -> value`` stored in
            ``entropy_ans_correct``.

    Returns:
        The same ``df`` object, with the result columns populated.
    """
    # Reset the result columns so every row starts from a known default.
    df["entropy_ans_correct"] = False
    df["entropy_ans_output"] = ""
    df["entropy_value"] = 0.0
    estimator = tokenwise_entropy.TokenwiseEntropy(llm_model=model, device=DEVICE)

    for row_label, record in df.iterrows():
        # Build the two chat turns from the row's subject, question and options.
        subject = get_subject_from_row(record)
        system_text = prompt.get_sys_prompt(subject)
        question = get_question_from_row(record)
        choices = get_options_from_row(record)
        user_text = prompt.get_user_prompt(question, choices)
        chat = [
            {"role": "system", "content": system_text},
            {"role": "user", "content": user_text},
        ]

        # Render the chat as plain text first, then tokenize it onto DEVICE.
        chat_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        encoded = tokenizer(chat_text, return_tensors="pt").to(DEVICE)

        # Only one new token is requested: the answer is read from that token.
        generated = model.generate(**encoded, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id)

        df.at[row_label, "entropy_value"] = estimator.calculate(generated)

        # Everything past the prompt length is the newly generated part.
        prompt_len = encoded.input_ids.shape[1]
        answer_text = tokenizer.decode(generated[0, prompt_len:], skip_special_tokens=True)
        df.at[row_label, "entropy_ans_output"] = answer_text

        df.at[row_label, "entropy_ans_correct"] = verify_answer(record, answer_text)

    return df


# Read the tab-separated input; the first line of the file is the header row.
df = pd.read_csv(
    "../data/mmlu_pro_stem.tsv",
    header=0,
    sep="\t",
)
# Uncomment to try the pipeline on the first ten rows only.
# df = df.head(10)


def verify_model_answer(row, model_answer):
    """Return True when the model's answer matches the row's expected option.

    ``row["answer_index"]`` is incremented by one before being compared with
    the integer value of ``model_answer``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an
            ``"answer_index"`` entry convertible to ``int``.
        model_answer: Raw answer produced by the model; it counts as wrong when
            it cannot be converted to ``int``.

    Returns:
        bool: True on a match; False on a mismatch or when either value is
        missing or not convertible to an integer.
    """
    try:
        return int(row["answer_index"]) + 1 == int(model_answer)
    # Only "not a usable answer" errors mean False. A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and keep a long run from stopping.
    except (KeyError, TypeError, ValueError, OverflowError):
        return False


# Column accessors for this dataset. The "options" column is parsed with
# ast.literal_eval, i.e. it is expected to hold a Python literal as text.
row_accessors = {
    "get_subject_from_row": lambda row: row["base_cluster"],
    "get_question_from_row": lambda row: row["question"],
    "get_options_from_row": lambda row: ast.literal_eval(row["options"]),
}

processed_df = estimate_dataset(
    df=df,
    model=model,
    tokenizer=tokenizer,
    verify_answer=verify_model_answer,
    **row_accessors,
)

# NOTE(review): with csv.QUOTE_NONE and no escapechar the csv writer raises if
# a field contains a character that needs escaping -- confirm the data is safe.
processed_df.to_csv(
    "../data/mmlu_pro_stem_w_entropy.tsv",
    sep="\t",
    quoting=csv.QUOTE_NONE,
)

Answer: 3
Entropy: 0.0791015625
is_correct: True


Answer: 2
Entropy: 1.03125
is_correct: False


Answer: 1
Entropy: 0.0084228515625
is_correct: False


Answer: 7
Entropy: 0.10546875
is_correct: False




KeyboardInterrupt: 