In [1]:
# Shell escape: print the GPU model, driver/CUDA versions and current memory usage.
!nvidia-smi

Tue Feb 25 06:30:03 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:9E:00.0 Off |                    0 |
| N/A   34C    P0              73W / 400W |  30006MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

This notebook introduces QA complexity estimation as a novel metric for optimizing LLM fine-tuning.

In [2]:
import os
import ast
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from sklearn.utils import shuffle

import transformers

In [3]:
# Directory (relative to the working directory) that holds the input dataset file.
DATA_PATH = Path("data")
# Fraction of rows intended for validation; only the commented-out train/valid split refers to it.
VALID_PART = 0.2

In [4]:
# Read the tab-separated dataset and normalise two of its columns in one chained step.
data_name = "mmlu_pro_stem.tsv"
data_path = os.path.join(DATA_PATH, data_name)

# df = shuffle(df)  # shuffling is currently disabled
df = pd.read_csv(data_path, sep="\t").assign(
    # `options` is read as the text of a Python literal; parse it back into an object.
    options=lambda frame: frame["options"].map(ast.literal_eval),
    # Shift the answer index by one and store it as a string
    # (presumably to line up with the 1-based option numbering; verify against the data).
    answer_index=lambda frame: frame["answer_index"].map(lambda value: str(value + 1)),
)

def enumerate_question_and_options(line):
    """Render a question followed by its numbered answer options.

    Args:
        line: Mapping-like row (a dict or a DataFrame row) exposing a
            ``"question"`` value and an ``"options"`` iterable.

    Returns:
        A string made of the question, a blank line, and then one
        ``"<n>) <option>"`` line per option, numbered from 1.
    """
    numbered_options = []
    for position, choice in enumerate(line["options"], start=1):
        numbered_options.append("{}) {}".format(position, choice))
    options_text = "\n".join(numbered_options)
    return "{}\n\n{}".format(line["question"], options_text)

# Apply the formatter row-wise (axis=1) to store the question plus numbered options for every row.
df["question_with_variants"] = df.apply(enumerate_question_and_options, axis=1)

# train_length = int((1 - VALID_PART) * df.shape[0])
# df_train = df.iloc[:train_length].reset_index(drop=True)
# df_valid = df.iloc[train_length:].reset_index(drop=True)

# print(df_train.shape[0], df_valid.shape[0])
# df_train

In [5]:
# Preview the formatted question text of the row at position `idx`.
idx = 0
print(df["question_with_variants"].iloc[idx])

Which of the following criticisms of Llewellyn's distinction between the grand and formal styles of legal reasoning is the most compelling?

1) There is no distinction between the two forms of legal reasoning.
2) Judges are appointed to interpret the law, not to make it.
3) It is misleading to pigeon-hole judges in this way.
4) Judicial reasoning is always formal.


### Answer generation

In [6]:
# pipeline = transformers.pipeline(
#     "text-generation",
#     model="microsoft/phi-4",
#     model_kwargs={"torch_dtype": "auto"},
#     device_map="auto",
# )

In [7]:
# llm_answers = list()
# for idx in tqdm(range(df.shape[0])):
#     row = df.iloc[idx]

#     system_prompt = f"You are an expert in the field of {row['category']}. Answer the questions."
#     prompt = "Choose one of the answers. Write down ONLY the NUMBER of the correct answer and nothing else."
    
#     request = (
#         prompt
#         + "\n\n"
#         + row["question_with_variants"]
#     )
#     messages = [
#         {"role": "system", "content": system_prompt},
#         {"role": "user", "content": request},
#     ]
    
#     outputs = pipeline(messages, max_new_tokens=1)
#     llm_answers.append(outputs[0]["generated_text"][-1]["content"])

In [8]:
# df["llm_answers"] = llm_answers
# df

In [9]:
# sum(df["answer_index"] == df["llm_answers"]) / df["answer_index"].shape[0]

In [10]:
# df.to_csv("saved_mmlu_pro_predictions.tsv", sep="\t", index=False)

In [11]:
# raise ValueError

### Embeddings

In [12]:
# Load the phi-4 checkpoint behind a feature-extraction pipeline.
# dtype and device placement are both left to the library ("auto").
pipeline = transformers.pipeline(
    task="feature-extraction",
    model="microsoft/phi-4",
    device_map="auto",
    model_kwargs=dict(torch_dtype="auto"),
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Device set to use cuda:0


In [13]:
# Stand-alone tokenizer for the same checkpoint; the embedding loop uses it to render chat-formatted prompts as text.
tokenizer = transformers.AutoTokenizer.from_pretrained("microsoft/phi-4")

In [None]:
# Compute one mean-pooled embedding per question.
#
# For every row, a system + user chat prompt is rendered to text with the
# tokenizer's chat template, passed through the feature-extraction pipeline,
# and the per-token vectors are averaged into a single vector.

# The instruction text does not depend on the row, so build it once outside the loop.
prompt = "Choose one of the answers. Write down ONLY the NUMBER of the correct answer and nothing else."

embedding_list = list()
# Iterate over the two columns that are actually read instead of materialising
# a full row Series with `df.iloc[idx]` on every step. `total` keeps the tqdm
# progress bar, since `zip` has no length.
for category, question_with_variants in tqdm(
    zip(df["category"], df["question_with_variants"]),
    total=df.shape[0],
):
    system_prompt = f"You are an expert in the field of {category}. Answer the questions."
    request = prompt + "\n\n" + question_with_variants

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": request},
    ]

    # Render the chat as plain text (not token ids); the pipeline tokenizes it itself.
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # assumes the pipeline returns a nested list shaped (1, tokens, hidden)
    # for a single string input — TODO confirm
    token_embeddings = pipeline(formatted_prompt)

    # Average over the first axis of the selected entry (the token axis under the assumption above).
    sentence_embedding = np.mean(token_embeddings[0], axis=0)

    embedding_list.append(sentence_embedding)

  0%|          | 9/12032 [00:01<23:11,  8.64it/s]  You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
  2%|▏         | 255/12032 [00:30<17:22, 11.30it/s]

In [None]:
# Combine the per-question vectors collected in `embedding_list` into one NumPy array.
embedding_array = np.asarray(embedding_list)

In [None]:
# Persist the embeddings in NumPy's .npy format in the current working directory.
np.save(file="embedding_array.npy", arr=embedding_array)

In [None]:
# Sanity check: display the array's dimensions
# (expected to be rows x hidden size if the loop ran over every row — verify).
embedding_array.shape

In [None]:
# NOTE(review): looks like a leftover scratch cell; it only evaluates a literal — confirm it can be removed.
1