# Setup

## Imports

In [1]:
import os
import re

import pandas as pd
import pyarrow.parquet as pa
from openai import OpenAI
from tqdm import tqdm

## Read file

In [2]:
# Load the test split from a local parquet file (`pa` is the pyarrow.parquet module here).
table = pa.read_table('test-00000-of-00001.parquet')  # We didn't download the dataset using the datasets library because it caused an exception

In [3]:
# Convert the Arrow table to pandas and key every row by its question id.
df = table.to_pandas().set_index('question_id')
df.head()

Unnamed: 0_level_0,question,options,answer,answer_index,cot_content,category,src
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
70,"Typical advertising regulatory bodies suggest,...","[Safe practices, Fear, Jealousy, Trivial, Unsa...",I,8,,business,ori_mmlu-business_ethics
71,Managers are entrusted to run the company in t...,"[Shareholders, Diligence, Self-interest, Share...",F,5,,business,ori_mmlu-business_ethics
72,There are two main issues associated with ____...,"[Down, Autonomy, Remuneration, Benefit, Down, ...",J,9,,business,ori_mmlu-business_ethics
73,_______ locate morality beyond the sphere of r...,"[Ethical egoism, Ethics of duty, Postmodern et...",C,2,,business,ori_mmlu-business_ethics
74,Some of key differences between Islamic finan...,"[Interest, Certain, Assured, Both tangible and...",G,6,,business,ori_mmlu-business_ethics


In [4]:
NUM_SAMPLES = 600

# A fixed random_state makes the draw reproducible.
# Note: `df` is reassigned, so re-running this cell samples from the already-sampled frame.
df = df.sample(n=NUM_SAMPLES, random_state=0)


In [5]:
# Inspect the sampled subset of questions.
df

Unnamed: 0_level_0,question,options,answer,answer_index,cot_content,category,src
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8603,Compute the area of the triangle whose altitud...,"[56\sqrt{15}, 60\sqrt{7}, 240\sqrt{7}/7, 48\sq...",C,2,,math,ori_mmlu-high_school_mathematics
1663,What is the relationship between the formal so...,"[Treaties supersede custom, Treaties and Gener...",J,9,,law,ori_mmlu-international_law
7113,The elasticity of supply is typically greater ...,"[producers have less capital to invest., produ...",G,6,,economics,ori_mmlu-high_school_microeconomics
6220,Which of the following conditions is a peroxis...,"[Zellweger syndrome, Maple syrup urine disease...",A,0,,health,ori_mmlu-medical_genetics
10472,"Define the following terms, making clear the d...","[Byte is a collection of 4 bits, Bit is the sm...",J,9,,computer science,stemez-ComputerScience
...,...,...,...,...,...,...,...
2635,Piaget’s theory describes stages of cognitive ...,"[During adolescence only, From birth to late c...",J,9,,psychology,ori_mmlu-professional_psychology
3404,Assume a population of garden peas in genetic ...,"[0.54FF + 0.12Ff + 0.34ff, 0.42FF + 0.36Ff + 0...",F,5,,biology,stemez-Genetics
5034,"Which culture, previously-known as the temple-...","[Hohokam, Folsom, Adena, Ancestral Puebloan, M...",E,4,,history,ori_mmlu-prehistory
10380,Which of the following is NOT a reasonable jus...,[There is no other work for the processor to d...,C,2,,computer science,ori_mmlu-college_computer_science


## Prompts & Config

In [None]:
# Prefer the MARTIAN_API_KEY environment variable so the real key never has to be
# written into (and committed with) the notebook; the placeholder is kept as a fallback.
MARTIAN_API_KEY = os.environ.get("MARTIAN_API_KEY", "YOUR_API_KEY_HERE")

In [8]:
# Spelled-out count that is interpolated into the prompts below.
answers_selected = "two" # Number of options the model must choose in its first response


# First-turn prompt template: asks the model for its top candidate options with justifications.
# Placeholders ({answers_selected}, {question}, {options_formatted}) are filled with str.format at call time.
# NOTE(review): the example justification for "Option: 3" refers to "Option 1", while the
# example options shown are 0 and 3 — confirm whether this is intended before changing the prompt.
SELECT_ANSWERS_PROMPT = """
You will be given a multiple-choice question. Your task is to select the {answers_selected} most likely correct answers from the list of options, based on reasoning and evidence.

For each selected option:
- Provide the number of the answer.
- Write a concise justification explaining why it is a strong candidate.

⚠️ IMPORTANT:
- Do NOT select more than {answers_selected} options.
- Do NOT include any information or analysis about options you do NOT select.
- Do NOT include extra commentary or headers.
- Follow the format *exactly* as shown below.

QUESTION:
{question}
OPTIONS:
{options_formatted}

====================
✅ EXPECTED ANSWER FORMAT:
Option: 0
Justification: This option directly addresses the main point of the question and aligns with known facts or logical reasoning.

Option: 3
Justification: This option is also supported by relevant evidence and complements the reasoning in Option 1.
====================
"""


# Second-turn prompt: asks the model to pick a single final answer with a reason and a confidence.
# Unlike the other two prompts this is an f-string, so `answers_selected` is interpolated
# once, when this cell runs; later changes to `answers_selected` require re-running this cell.
SELECT_BEST_OPTION_PROMPT = f"""
Now, based on your justifications and the {answers_selected} best options previously selected, choose the SINGLE most likely correct answer.

Provide:
- The number of the best option
- A short reason why this option is stronger than the other(s)
- An estimated probability (between 0 and 1) that this answer is correct

⚠️ IMPORTANT:
- Do NOT include general reflections or unrelated commentary.
- Do NOT reference options that were not selected in the prior step.
- Use only the format below—no extra content or markdown.

====================
✅ EXPECTED ANSWER FORMAT:
Final Answer: 2
Reason: Option 2 covers the key concepts more completely and directly than Option 1.
Confidence: 0.85
====================
"""



# Judge prompt template: shows the question, the options and every model's responses, and asks
# for an evaluation summary, the correct answer, a justification and a judge confidence.
# Placeholders ({answers_selected}, {question}, {options_formatted}, {model_responses})
# are filled with str.format at call time.
JUDGE_COMPARISON_PROMPT = """
You are an expert evaluator reviewing responses from multiple AI models that have answered the same multiple-choice question.

Each model provided:
1. {answers_selected} top candidate answers with justifications.
2. A final selected answer and a confidence score.
3. Supporting reasoning for that final answer.

Your tasks:
- Evaluate the correctness and depth of each model's justifications.
- Assess the logic behind the final answer and the confidence score.
- Review all model responses to determine the correct answer to the question.
- Identify the correct answer to the question and provide a justification.
- Estimate the probability that your selected answer is correct.

CRITICAL INSTRUCTIONS:
⚠️ STRICTLY follow the format below.
⚠️ Do NOT add extra commentary, explanations, introductions, or conclusions.
⚠️ Do NOT summarize the question or repeat content from the prompt.
⚠️ Only write what is required under each section.

Criteria to consider:
- Accuracy of the final answer
- Quality and depth of justifications
- Logical coherence of reasoning
- Clarity and precision
- Appropriateness of the confidence estimate
Please review the question and responses below:
============================
QUESTION:
{question}
OPTIONS:
{options_formatted}

============================
MODEL RESPONSES:
{model_responses}

====================
✅ EXPECTED ANSWER FORMAT:
Evaluation Summary:
Model 0: Strong justifications and coherent logic, though the confidence score was slightly too low given the quality of reasoning.
Model 1: Final answer was incorrect due to errors in the reasoning.

Correct Answer: 1
Justification: Option 1 aligns most closely with the core reasoning required by the question, and is well supported by evidence in the selected justifications.
Judge Confidence: 0.88
====================
"""

In [9]:
# Parse the responses to the prompts
def parse_select_best_option(response):
    """Extract the final answer, reason and confidence from a second-turn response.

    Expected layout (whitespace and newlines between fields are flexible):

        Final Answer: <digits>
        Reason: <free text>
        Confidence: <number between 0 and 1>

    Args:
        response: Raw text returned by the model.

    Returns:
        Tuple ``(option, reason, confidence)``; all three are strings exactly as
        they appear in the response (the reason has surrounding whitespace removed).

    Raises:
        ValueError: If ``response`` is not a string or does not match the layout,
            including when the confidence is outside the 0-1 range.
    """
    # A missing completion (e.g. None) should surface as the same parse error
    # instead of a TypeError coming out of the regex engine.
    if not isinstance(response, str):
        raise ValueError("Response does not match the expected format.")

    pattern = (
        r"Final Answer:\s*(\d+)\s*"
        r"Reason:\s*(.*?)\s*"
        # The negative lookahead rejects values such as "1.5" or "10", which would
        # otherwise be silently truncated to "1"; a trailing sentence period is still allowed.
        r"Confidence:\s*(0(?:\.\d+)?|1(?:\.0+)?)(?!\.?\d)"
    )
    match = re.search(pattern, response, re.DOTALL)
    if match is None:
        raise ValueError("Response does not match the expected format.")

    option, reason, confidence = match.groups()
    return option, reason, confidence


def parse_judge_evaluation(response):
    """Extract the summary, chosen answer, justification and confidence from a judge response.

    Expected layout (whitespace and newlines between fields are flexible):

        Evaluation Summary: <free text>
        Correct Answer: <digits>
        Justification: <free text>
        Judge Confidence: <number between 0 and 1>

    Args:
        response: Raw text returned by the judge model.

    Returns:
        Tuple ``(summary, correct_answer, justification, confidence)`` of strings.
        The free-text fields have surrounding whitespace removed.

    Raises:
        ValueError: If ``response`` is not a string or does not match the layout,
            including when the confidence is outside the 0-1 range.
    """
    # A missing completion (e.g. None) should surface as the same parse error
    # instead of a TypeError coming out of the regex engine.
    if not isinstance(response, str):
        raise ValueError("Response does not match the expected judge format.")

    pattern = (
        # The trailing \s* after each lazy group keeps the blank lines that separate
        # sections out of the captured text (same treatment as the "Reason" field
        # in parse_select_best_option).
        r"Evaluation Summary:\s*(.*?)\s*"
        r"Correct Answer:\s*(\d+)\s*"
        r"Justification:\s*(.*?)\s*"
        # The negative lookahead rejects values such as "1.5" or "10", which would
        # otherwise be silently truncated to "1".
        r"Judge Confidence:\s*(0(?:\.\d+)?|1(?:\.0+)?)(?!\.?\d)"
    )
    match = re.search(pattern, response, re.DOTALL)
    if match is None:
        raise ValueError("Response does not match the expected judge format.")

    summary, correct_answer, justification, confidence = match.groups()
    return summary, correct_answer, justification, confidence


In [10]:
# OpenAI-compatible client pointed at the withmartian.com endpoint; get_response()
# sends every chat completion request through this single client.
client = OpenAI(
    api_key=MARTIAN_API_KEY,
    base_url="https://withmartian.com/api/openai/v1",
)

# Monolithic baselines

In [11]:
# Model identifiers passed to the chat completions endpoint; list order defines the
# "Model <n>" numbering used in the judge prompt.
MODEL_LIST = [
    "openai/openai/gpt-4.1-nano",
    "anthropic/anthropic/claude-3-5-haiku-latest",
    "gemini/gemini/gemini-2.0-flash",
]

In [12]:
def get_response(model, prompt, message_history):
    """Send ``prompt`` as the next user turn and return the updated conversation.

    Args:
        model: Model identifier forwarded to the chat completions endpoint.
        prompt: Text of the new user message.
        message_history: List of earlier chat messages (``role``/``content`` dicts).
            It is not modified; new lists are built instead.

    Returns:
        Tuple ``(message_history, response)`` where ``message_history`` is the earlier
        history plus the new user and assistant messages, and ``response`` is the
        content of the first choice returned by the API.
    """
    user_turn = {"role": "user", "content": prompt}
    conversation = message_history + [user_turn]

    completion = client.chat.completions.create(model=model, messages=conversation)
    response = completion.choices[0].message.content

    assistant_turn = {"role": "assistant", "content": response}
    return conversation + [assistant_turn], response

In [13]:
# Dictionaries that are going to be used to created the DataFrames storing each model's responses
dicts = {model_name: {'question_id': [], 'question': [], 'options': [], 'category': [], 'correct_answer': [], 'first_prompt':[], 'first_response': [], 'second_response': [],
                     'selected_option': [], 'reason': [], 'confidence': []} for model_name in MODEL_LIST}
dicts

{'openai/openai/gpt-4.1-nano': {'question_id': [],
  'question': [],
  'options': [],
  'category': [],
  'correct_answer': [],
  'first_prompt': [],
  'first_response': [],
  'second_response': [],
  'selected_option': [],
  'reason': [],
  'confidence': []},
 'anthropic/anthropic/claude-3-5-haiku-latest': {'question_id': [],
  'question': [],
  'options': [],
  'category': [],
  'correct_answer': [],
  'first_prompt': [],
  'first_response': [],
  'second_response': [],
  'selected_option': [],
  'reason': [],
  'confidence': []},
 'gemini/gemini/gemini-2.0-flash': {'question_id': [],
  'question': [],
  'options': [],
  'category': [],
  'correct_answer': [],
  'first_prompt': [],
  'first_response': [],
  'second_response': [],
  'selected_option': [],
  'reason': [],
  'confidence': []}}

In [14]:
def format_options(options):
  """Render the options as one "<index>. <text>" line each, numbered from 0."""
  numbered_lines = []
  for position, option_text in enumerate(options):
    numbered_lines.append(f"{position}. {option_text}")
  return "\n".join(numbered_lines)

In [15]:
# Monolithic baseline: for every sampled question, each model answers on its own in two turns.
# Turn 1 asks for the top candidate options; turn 2 (same conversation) asks for the single best
# option with a reason and a confidence. Parsed results are appended to `dicts[model]`.
for question_id, question in tqdm(df.iterrows(), total=len(df)):
  options_formatted = format_options(question.options)

  for model in MODEL_LIST:
    try:
      # The first prompt depends only on the question, not on the model.
      first_prompt = SELECT_ANSWERS_PROMPT.format(answers_selected=answers_selected, question=question.question, options_formatted=options_formatted)
      message_history, first_response = get_response(model, prompt = first_prompt, message_history=[])


      # Second turn reuses the history so the model sees its own first answer.
      _, second_response = get_response(model, prompt = SELECT_BEST_OPTION_PROMPT, message_history=message_history)
      try:
        option, reason, confidence = parse_select_best_option(second_response)
      except Exception:
        # Nothing is stored for this model/question pair when the format is not respected,
        # so this model's records will lack the question.
        print('Error parsing response')
        continue

      # Save the information
      dicts[model]['question'].append(question.question)
      dicts[model]['question_id'].append(question_id)  
      dicts[model]['category'].append(question.category)
      dicts[model]['correct_answer'].append(question.answer_index)
      dicts[model]['first_prompt'].append(first_prompt)
      dicts[model]['first_response'].append(first_response)
      dicts[model]['second_response'].append(second_response)
      dicts[model]['options'].append(options_formatted)
      dicts[model]['selected_option'].append(option)
      dicts[model]['reason'].append(reason)
      dicts[model]['confidence'].append(confidence)


    # Any other failure (e.g. an API error) is reported and the loop moves on to the next model.
    except Exception as e:
      print(f'Error with model: {model} error: {str(e)}')

 74%|██████████████████████████████████████████████████████████                    | 447/600 [1:24:42<28:16, 11.09s/it]

Error parsing response


100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [1:51:27<00:00, 11.15s/it]


In [61]:
# One DataFrame per model, built from its column lists and indexed by question id
# (the 'question_id' column is kept as well).
monolothic_dfs = {
    model_name: pd.DataFrame(data=columns, index=columns['question_id'])
    for model_name, columns in dicts.items()
}

In [63]:
# Preview the results table of the first model in MODEL_LIST.
monolothic_dfs[MODEL_LIST[0]]

Unnamed: 0,question_id,question,options,category,correct_answer,first_prompt,first_response,second_response,selected_option,reason,confidence
8603,8603,Compute the area of the triangle whose altitud...,0. 56\sqrt{15}\n1. 60\sqrt{7}\n2. 240\sqrt{7}/...,math,2,\nYou will be given a multiple-choice question...,Option: 2 \nJustification: The area can be co...,Final Answer: 2 \nReason: Option 2 aligns pre...,2,Option 2 aligns precisely with the known formu...,0.9
1663,1663,What is the relationship between the formal so...,0. Treaties supersede custom\n1. Treaties and ...,law,9,\nYou will be given a multiple-choice question...,Option: 6\nJustification: This recognizes the ...,Final Answer: 6 \nReason: It acknowledges the...,6,It acknowledges the flexible and case-by-case ...,0.9
7113,7113,The elasticity of supply is typically greater ...,0. producers have less capital to invest.\n1. ...,economics,6,\nYou will be given a multiple-choice question...,Option: 6\nJustification: Producers have more ...,Final Answer: 6 \nReason: Having more time to...,6,Having more time to respond allows producers t...,0.9
6220,6220,Which of the following conditions is a peroxis...,0. Zellweger syndrome\n1. Maple syrup urine di...,health,0,\nYou will be given a multiple-choice question...,Option: 0 \nJustification: Zellweger syndrome...,Final Answer: 0 \nReason: Zellweger syndrome ...,0,Zellweger syndrome is a prototypical peroxisom...,0.95
10472,10472,"Define the following terms, making clear the d...","0. Byte is a collection of 4 bits, Bit is the ...",computer science,9,\nYou will be given a multiple-choice question...,Option: 9\nJustification: This option correctl...,Final Answer: 9 \nReason: It accurately defin...,9,It accurately defines all three terms with sta...,0.90
...,...,...,...,...,...,...,...,...,...,...,...
2635,2635,Piaget’s theory describes stages of cognitive ...,0. During adolescence only\n1. From birth to l...,psychology,9,\nYou will be given a multiple-choice question...,Option: 7\nJustification: Piaget’s theory desc...,Final Answer: 7 \nReason: Option 7 accurately...,7,Option 7 accurately captures the comprehensive...,0.9
3404,3404,Assume a population of garden peas in genetic ...,0. 0.54FF + 0.12Ff + 0.34ff\n1. 0.42FF + 0.36F...,biology,5,\nYou will be given a multiple-choice question...,Option: 1 \nJustification: Starting from init...,Final Answer: 1 \nReason: Option 1 more accur...,1,Option 1 more accurately reflects the expected...,0.85
5034,5034,"Which culture, previously-known as the temple-...",0. Hohokam\n1. Folsom\n2. Adena\n3. Ancestral ...,history,4,\nYou will be given a multiple-choice question...,Option: 3 \nJustification: The Ancestral Pueb...,Final Answer: 4 \nReason: The Mississippian c...,4,The Mississippian culture is more widely recog...,0.9
10380,10380,Which of the following is NOT a reasonable jus...,0. There is no other work for the processor to...,computer science,2,\nYou will be given a multiple-choice question...,Option: 1\nJustification: Busy-waiting is gene...,Final Answer: 3\nReason: Waiting for immediate...,3,Waiting for immediate response upon event comp...,0.90


In [64]:
# Save the CSV files
# Write each model's results to "monolithic_<model name>.csv"; slashes in the model
# name are replaced with underscores so the name is a single file, not a path.
for model_name, model_df in monolothic_dfs.items():
  model_df.to_csv(f"monolithic_{model_name.replace('/', '_')}.csv")

# Judge-based systems

## Participant mode

In [20]:
# Dictionaries that are going to be used to created the DataFrames storing each model's responses
dicts_jugdes_participants = {
    model: {
        column: []
        for column in (
            'question_id', 'question', 'options', 'category', 'correct_answer',
            'model_responses', 'judge_prompt', 'judge_response', 'summary',
            'selected_option', 'reason', 'confidence',
            'judge_original_response', 'judge_original_option_selected',
            'judge_original_confidence',
        )
    }
    for model in MODEL_LIST
}
dicts_jugdes_participants

{'openai/openai/gpt-4.1-nano': {'question_id': [],
  'question': [],
  'options': [],
  'category': [],
  'correct_answer': [],
  'model_responses': [],
  'judge_prompt': [],
  'judge_response': [],
  'summary': [],
  'selected_option': [],
  'reason': [],
  'confidence': [],
  'judge_original_response': [],
  'judge_original_option_selected': [],
  'judge_original_confidence': []},
 'anthropic/anthropic/claude-3-5-haiku-latest': {'question_id': [],
  'question': [],
  'options': [],
  'category': [],
  'correct_answer': [],
  'model_responses': [],
  'judge_prompt': [],
  'judge_response': [],
  'summary': [],
  'selected_option': [],
  'reason': [],
  'confidence': [],
  'judge_original_response': [],
  'judge_original_option_selected': [],
  'judge_original_confidence': []},
 'gemini/gemini/gemini-2.0-flash': {'question_id': [],
  'question': [],
  'options': [],
  'category': [],
  'correct_answer': [],
  'model_responses': [],
  'judge_prompt': [],
  'judge_response': [],
  'summa

In [65]:
# Participant mode: each model in MODEL_LIST acts in turn as judge over the monolithic
# answers of all models (its own included) and its verdict is stored per question.
for judge_model in MODEL_LIST:
  print('Judge: ' + judge_model)
  judge_records = dicts_jugdes_participants[judge_model]

  for question_idx in tqdm(df.index):
    # Skip questions already processed for this judge (this happens when restoring a backup)
    if question_idx in judge_records['question_id']:
      continue

    question = df.loc[question_idx, 'question']
    options_formatted = format_options(df.loc[question_idx, 'options'])
    correct_answer = df.loc[question_idx, 'answer_index']
    category = df.loc[question_idx, 'category']

    # Concatenate every responder's two answers into one text block for the judge prompt
    model_responses = ""
    try:
      for responder_model_idx, responder_model in enumerate(MODEL_LIST):
        monolothic_df = monolothic_dfs[responder_model]
        first_response = monolothic_df.loc[question_idx, 'first_response']
        second_response = monolothic_df.loc[question_idx, 'second_response']

        # If these are the original responses of the model being used as judge, we save them to calculate self-promotion later
        if responder_model == judge_model:
          judge_original_response = second_response
          try:
            judge_original_option_selected, _, judge_original_confidence = parse_select_best_option(second_response)
          except Exception:
            print('Error original response')
            judge_original_option_selected, judge_original_confidence = '', '' # Not necessary to fail in this case

        model_responses += f"-------------Model {responder_model_idx} responses-------------\n{first_response}\n\n\n{second_response}\n\n"
    except KeyError:
      # A responder has no stored answer for this question (the monolithic stage drops
      # question/model pairs it could not parse). Skip instead of aborting the whole run.
      print(f'Missing monolithic response for question {question_idx}')
      continue

    judge_prompt = JUDGE_COMPARISON_PROMPT.format(answers_selected=answers_selected.capitalize(),
                                                  question=question, options_formatted=options_formatted, model_responses=model_responses)

    # An API failure on one question must not abort the run; the question stays
    # unrecorded, so the resume check above retries it the next time this cell is run.
    try:
      _, judge_response = get_response(judge_model, judge_prompt, message_history=[])
    except Exception as e:
      print(f'Error with judge: {judge_model} error: {str(e)}')
      continue

    try:
      summary, selected_option, reason, confidence = parse_judge_evaluation(judge_response)
    except Exception:
      print('Error parsing response')
      continue

    judge_records['question_id'].append(question_idx)
    judge_records['question'].append(question)
    judge_records['options'].append(options_formatted)
    judge_records['correct_answer'].append(correct_answer)
    judge_records['category'].append(category)
    judge_records['judge_prompt'].append(judge_prompt)
    judge_records['model_responses'].append(model_responses)
    judge_records['judge_response'].append(judge_response)
    judge_records['reason'].append(reason)
    judge_records['summary'].append(summary)
    judge_records['selected_option'].append(selected_option)
    judge_records['confidence'].append(confidence)
    judge_records['judge_original_response'].append(judge_original_response)
    judge_records['judge_original_option_selected'].append(judge_original_option_selected)
    judge_records['judge_original_confidence'].append(judge_original_confidence)

    # The output of tqdm is not complete since some judges were run in parallel using different notebooks
    # and the samples with parsing errors were retried afterwards

Judge: openai/openai/gpt-4.1-nano


 25%|███████████████████▊                                                            | 149/600 [01:23<04:36,  1.63it/s]

Error parsing response


 51%|████████████████████████████████████████▉                                       | 307/600 [01:49<00:52,  5.54it/s]

Error parsing response


 65%|███████████████████████████████████████████████████▋                            | 388/600 [02:24<01:21,  2.59it/s]

Error parsing response


 69%|███████████████████████████████████████████████████████                         | 413/600 [02:31<01:06,  2.82it/s]

Error parsing response


 75%|███████████████████████████████████████████████████████████▋                    | 448/600 [02:56<01:19,  1.92it/s]

Error parsing response


 76%|████████████████████████████████████████████████████████████▍                   | 453/600 [03:18<04:29,  1.83s/it]

Error parsing response


 84%|███████████████████████████████████████████████████████████████████▎            | 505/600 [07:17<08:55,  5.63s/it]

Error parsing response


 87%|█████████████████████████████████████████████████████████████████████▍          | 521/600 [08:32<06:38,  5.04s/it]

Error parsing response


 90%|████████████████████████████████████████████████████████████████████████▎       | 542/600 [10:20<05:32,  5.74s/it]

Error parsing response


 92%|█████████████████████████████████████████████████████████████████████████▏      | 549/600 [10:58<05:03,  5.95s/it]

Error parsing response


 93%|██████████████████████████████████████████████████████████████████████████▋     | 560/600 [11:58<03:58,  5.95s/it]

Error parsing response


 96%|█████████████████████████████████████████████████████████████████████████████▏  | 579/600 [13:29<01:39,  4.73s/it]

Error parsing response


 98%|██████████████████████████████████████████████████████████████████████████████▋ | 590/600 [14:29<00:50,  5.02s/it]

Error parsing response


 99%|███████████████████████████████████████████████████████████████████████████████▎| 595/600 [14:54<00:27,  5.47s/it]

Error parsing response


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [15:24<00:00,  1.54s/it]


Judge: anthropic/anthropic/claude-3-5-haiku-latest


  1%|▌                                                                                 | 4/600 [00:22<56:22,  5.68s/it]

Error parsing response


  2%|██                                                                               | 15/600 [01:17<50:45,  5.21s/it]

Error parsing response


  4%|███▌                                                                             | 26/600 [02:08<47:27,  4.96s/it]

Error parsing response


  7%|█████▍                                                                           | 40/600 [03:18<46:13,  4.95s/it]

Error parsing response


  9%|███████                                                                          | 52/600 [04:17<48:30,  5.31s/it]

Error parsing response


  9%|███████▌                                                                         | 56/600 [04:36<45:06,  4.97s/it]

Error parsing response


 10%|████████▌                                                                        | 63/600 [05:12<44:11,  4.94s/it]

Error parsing response


 13%|██████████▋                                                                      | 79/600 [06:32<43:07,  4.97s/it]

Error parsing response


 14%|███████████▏                                                                     | 83/600 [06:53<43:19,  5.03s/it]

Error parsing response


 18%|██████████████▏                                                                 | 106/600 [08:42<39:43,  4.83s/it]

Error parsing response


 20%|████████████████▎                                                               | 122/600 [09:53<35:11,  4.42s/it]

Error parsing response


 20%|████████████████▍                                                               | 123/600 [09:58<35:18,  4.44s/it]

Error parsing response


 21%|████████████████▋                                                               | 125/600 [10:07<35:57,  4.54s/it]

Error parsing response


 21%|████████████████▊                                                               | 126/600 [10:13<39:47,  5.04s/it]

Error parsing response


 22%|█████████████████▊                                                              | 134/600 [10:52<39:54,  5.14s/it]

Error parsing response


 28%|██████████████████████▊                                                         | 171/600 [13:48<35:09,  4.92s/it]

Error parsing response


 29%|███████████████████████▏                                                        | 174/600 [14:03<34:09,  4.81s/it]

Error parsing response


 31%|█████████████████████████                                                       | 188/600 [15:12<35:31,  5.17s/it]

Error parsing response


 32%|█████████████████████████▊                                                      | 194/600 [15:39<31:49,  4.70s/it]

Error parsing response


 38%|██████████████████████████████                                                  | 225/600 [18:03<29:14,  4.68s/it]

Error parsing response


 39%|██████████████████████████████▉                                                 | 232/600 [18:37<30:35,  4.99s/it]

Error parsing response


 41%|████████████████████████████████▊                                               | 246/600 [19:48<33:17,  5.64s/it]

Error parsing response


 44%|███████████████████████████████████▌                                            | 267/600 [21:33<27:01,  4.87s/it]

Error parsing response


 49%|███████████████████████████████████████                                         | 293/600 [23:43<24:10,  4.72s/it]

Error parsing response


 58%|██████████████████████████████████████████████▎                                 | 347/600 [28:19<23:33,  5.59s/it]

Error parsing response


 64%|███████████████████████████████████████████████████▎                            | 385/600 [31:33<18:38,  5.20s/it]

Error parsing response


 66%|█████████████████████████████████████████████████████▏                          | 399/600 [32:50<20:25,  6.10s/it]

Error parsing response


 68%|██████████████████████████████████████████████████████▊                         | 411/600 [33:56<16:17,  5.17s/it]

Error parsing response


 69%|███████████████████████████████████████████████████████                         | 413/600 [34:06<16:03,  5.15s/it]

Error parsing response


 72%|█████████████████████████████████████████████████████████▋                      | 433/600 [35:48<13:36,  4.89s/it]

Error parsing response


 74%|███████████████████████████████████████████████████████████▏                    | 444/600 [36:43<13:16,  5.10s/it]

Error parsing response


 74%|███████████████████████████████████████████████████████████▍                    | 446/600 [36:54<13:34,  5.29s/it]

Error parsing response


 74%|███████████████████████████████████████████████████████████▌                    | 447/600 [37:00<14:31,  5.70s/it]

Error original response


 75%|███████████████████████████████████████████████████████████▊                    | 449/600 [37:11<13:52,  5.52s/it]

Error parsing response


 76%|████████████████████████████████████████████████████████████▊                   | 456/600 [37:44<11:43,  4.89s/it]

Error parsing response


 78%|██████████████████████████████████████████████████████████████▌                 | 469/600 [38:51<11:44,  5.38s/it]

Error parsing response


 79%|███████████████████████████████████████████████████████████████                 | 473/600 [39:14<11:53,  5.62s/it]

Error parsing response


 79%|███████████████████████████████████████████████████████████████▍                | 476/600 [39:29<11:06,  5.38s/it]

Error parsing response


 80%|███████████████████████████████████████████████████████████████▌                | 477/600 [39:36<11:56,  5.83s/it]

Error parsing response


 83%|██████████████████████████████████████████████████████████████████▎             | 497/600 [41:17<08:41,  5.06s/it]

Error parsing response


 87%|█████████████████████████████████████████████████████████████████████▎          | 520/600 [43:16<06:26,  4.84s/it]

Error parsing response


 89%|███████████████████████████████████████████████████████████████████████▍        | 536/600 [44:57<06:35,  6.18s/it]

Error parsing response


 90%|████████████████████████████████████████████████████████████████████████▍       | 543/600 [45:38<05:37,  5.92s/it]

Error parsing response


 91%|████████████████████████████████████████████████████████████████████████▋       | 545/600 [45:49<05:13,  5.71s/it]

Error parsing response


 93%|██████████████████████████████████████████████████████████████████████████▍     | 558/600 [47:11<04:29,  6.42s/it]

Error parsing response


 93%|██████████████████████████████████████████████████████████████████████████▌     | 559/600 [47:18<04:26,  6.50s/it]

Error parsing response


 94%|██████████████████████████████████████████████████████████████████████████▊     | 561/600 [47:29<03:48,  5.86s/it]

Error parsing response


 95%|████████████████████████████████████████████████████████████████████████████    | 570/600 [48:22<02:51,  5.70s/it]

Error parsing response


 97%|█████████████████████████████████████████████████████████████████████████████▎  | 580/600 [49:29<02:12,  6.61s/it]

Error parsing response


 99%|███████████████████████████████████████████████████████████████████████████████ | 593/600 [50:47<00:45,  6.50s/it]

Error parsing response


 99%|███████████████████████████████████████████████████████████████████████████████▏| 594/600 [50:53<00:39,  6.57s/it]

Error parsing response


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [51:31<00:00,  5.15s/it]


Judge: gemini/gemini/gemini-2.0-flash


  6%|████▊                                                                            | 36/600 [00:59<48:29,  5.16s/it]

Error parsing response


 42%|█████████████████████████████████▏                                              | 249/600 [06:17<07:08,  1.22s/it]

Error parsing response


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [14:50<00:00,  1.48s/it]


In [94]:
# Turn each judge's column lists into a DataFrame.
# Note: this rebinds `dicts_jugdes_participants` from a dict of column lists to a dict of
# DataFrames, so the judge loop above (which appends to lists) must not be re-run afterwards.
dicts_jugdes_participants = {
    judge_name: pd.DataFrame(data=judge_columns)
    for judge_name, judge_columns in dicts_jugdes_participants.items()
}

In [97]:
# Preview the judge results of the third model in MODEL_LIST.
dicts_jugdes_participants[MODEL_LIST[2]]

Unnamed: 0,question_id,question,options,category,correct_answer,model_responses,judge_prompt,judge_response,summary,selected_option,reason,confidence,judge_original_response,judge_original_option_selected,judge_original_confidence
0,Compute the area of the triangle whose altitud...,Compute the area of the triangle whose altitud...,0. 56\sqrt{15}\n1. 60\sqrt{7}\n2. 240\sqrt{7}/...,math,2,-------------Model 0 responses-------------\nO...,\nYou are an expert evaluator reviewing respon...,Evaluation Summary:\nModel 0: Justifications a...,Model 0: Justifications are vague and lack spe...,2,Model 2 provides a complete and correct deriva...,1.0,Final Answer: 2\nReason: The calculation in op...,2,0.95
1,What is the relationship between the formal so...,What is the relationship between the formal so...,0. Treaties supersede custom\n1. Treaties and ...,law,9,-------------Model 0 responses-------------\nO...,\nYou are an expert evaluator reviewing respon...,Evaluation Summary:\nModel 0: Strong justifica...,"Model 0: Strong justifications, correct final ...",9,Article 38 of the ICJ Statute does not establi...,0.95,Final Answer: 9\nReason: Option 9 is stronger ...,9,0.6
2,The elasticity of supply is typically greater ...,The elasticity of supply is typically greater ...,0. producers have less capital to invest.\n1. ...,economics,6,-------------Model 0 responses-------------\nO...,\nYou are an expert evaluator reviewing respon...,Evaluation Summary:\nModel 0: Correct answer a...,Model 0: Correct answer and strong justificati...,6,Elasticity of supply refers to the responsiven...,1.0,"Final Answer: 6\nReason: Option 6, regarding t...",6,0.75
3,Which of the following conditions is a peroxis...,Which of the following conditions is a peroxis...,0. Zellweger syndrome\n1. Maple syrup urine di...,health,0,-------------Model 0 responses-------------\nO...,\nYou are an expert evaluator reviewing respon...,Evaluation Summary:\nModel 0: Correct and well...,Model 0: Correct and well-justified.\nModel 1:...,0,Zellweger syndrome is a peroxisomal biogenesis...,1.0,Final Answer: 0\nReason: Zellweger syndrome is...,0,0.95
4,"Define the following terms, making clear the d...","Define the following terms, making clear the d...","0. Byte is a collection of 4 bits, Bit is the ...",computer science,9,-------------Model 0 responses-------------\nO...,\nYou are an expert evaluator reviewing respon...,Evaluation Summary:\nModel 0: Accurate and wel...,Model 0: Accurate and well-reasoned.\nModel 1:...,9,"Option 9 correctly defines bit, byte, and word...",1.0,Final Answer: 9\nReason: Option 9 accurately d...,9,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,Piaget’s theory describes stages of cognitive ...,Piaget’s theory describes stages of cognitive ...,0. During adolescence only\n1. From birth to l...,psychology,9,-------------Model 0 responses-------------\nO...,\nYou are an expert evaluator reviewing respon...,Evaluation Summary:\nModel 0: The justificatio...,Model 0: The justification for option 7 is inc...,4,Piaget's theory outlines stages of cognitive d...,0.95,Final Answer: 4\nReason: Option 4 more accurat...,4,0.75
594,Assume a population of garden peas in genetic ...,Assume a population of garden peas in genetic ...,0. 0.54FF + 0.12Ff + 0.34ff\n1. 0.42FF + 0.36F...,biology,5,-------------Model 0 responses-------------\nO...,\nYou are an expert evaluator reviewing respon...,Evaluation Summary:\nModel 0: Incorrect final ...,Model 0: Incorrect final answer. The justifica...,5,The initial genotype frequencies are FF = 0.36...,1.0,Final Answer: 5\nReason: Option 5 provides the...,5,0.95
595,"Which culture, previously-known as the temple-...","Which culture, previously-known as the temple-...",0. Hohokam\n1. Folsom\n2. Adena\n3. Ancestral ...,history,4,-------------Model 0 responses-------------\nO...,\nYou are an expert evaluator reviewing respon...,Evaluation Summary:\nModel 0: The justificatio...,Model 0: The justifications are accurate and r...,4,The Mississippian culture is widely recognized...,1.0,Final Answer: 4\nReason: While Moundville is r...,4,0.9
596,Which of the following is NOT a reasonable jus...,Which of the following is NOT a reasonable jus...,0. There is no other work for the processor to...,computer science,2,-------------Model 0 responses-------------\nO...,\nYou are an expert evaluator reviewing respon...,Evaluation Summary:\nModel 0: The justificatio...,Model 0: The justifications for both options a...,2,"Busy-waiting wastes CPU cycles, which is detri...",0.95,Final Answer: 2\nReason: Option 2 highlights a...,2,0.75


In [None]:
# Write one CSV per judge model; '/' in the model name is replaced by '_' in the file name.
for model_name, model_df in dicts_jugdes_participants.items():
  model_df.to_csv(f"judge_participant_mode_{model_name.replace('/', '_')}.csv")

## Evaluator-only mode

In [102]:
# Per-judge accumulators: one empty list per output column. They are filled row by row
# and later used to create the DataFrames storing each judge's evaluations.
dicts_jugdes_evaluator = {
    model: {
        column: []
        for column in (
            'question_id', 'question', 'options', 'category', 'correct_answer',
            'model_responses', 'judge_prompt', 'judge_response', 'summary',
            'selected_option', 'reason', 'confidence',
        )
    }
    for model in MODEL_LIST
}
dicts_jugdes_evaluator

{'openai/openai/gpt-4.1-nano': {'question_id': [],
  'question': [],
  'options': [],
  'category': [],
  'correct_answer': [],
  'model_responses': [],
  'judge_prompt': [],
  'judge_response': [],
  'summary': [],
  'selected_option': [],
  'reason': [],
  'confidence': []},
 'anthropic/anthropic/claude-3-5-haiku-latest': {'question_id': [],
  'question': [],
  'options': [],
  'category': [],
  'correct_answer': [],
  'model_responses': [],
  'judge_prompt': [],
  'judge_response': [],
  'summary': [],
  'selected_option': [],
  'reason': [],
  'confidence': []},
 'gemini/gemini/gemini-2.0-flash': {'question_id': [],
  'question': [],
  'options': [],
  'category': [],
  'correct_answer': [],
  'model_responses': [],
  'judge_prompt': [],
  'judge_response': [],
  'summary': [],
  'selected_option': [],
  'reason': [],
  'confidence': []}}

In [103]:
# Evaluator-only mode: every judge reviews the two responses given by each of the
# OTHER models (never its own) and its reply is parsed into a summary, a selected
# option, a reason and a confidence, which are stored column-wise in
# dicts_jugdes_evaluator[judge_model].
#
# NOTE: run this cell while dicts_jugdes_evaluator still maps each judge to a dict of
# plain lists; the list `.append` calls below rely on it.
#
# The tqdm output is not complete, since some judges were run in parallel using
# different notebooks and the samples with parsing errors were retried afterwards.
for judge_model in MODEL_LIST:
  print('Judge: ' + judge_model)
  judge_records = dicts_jugdes_evaluator[judge_model]

  for question_idx in tqdm(df.index):
    # If the question has already been processed (this happens when restoring a backup),
    # skip it before doing any lookups or formatting work.
    if question_idx in judge_records['question_id']:
      continue

    question = df.loc[question_idx, 'question']
    options_formatted = format_options(df.loc[question_idx, 'options'])
    correct_answer = df.loc[question_idx, 'answer_index']
    category = df.loc[question_idx, 'category']

    # Concatenate the first and second responses of every model except the judge itself.
    model_responses = ""
    for responder_model_idx, responder_model in enumerate(MODEL_LIST):

      # Don't show the model its original responses
      if responder_model == judge_model:
        continue

      monolothic_df = monolothic_dfs[responder_model]
      first_response = monolothic_df.loc[question_idx, 'first_response']
      second_response = monolothic_df.loc[question_idx, 'second_response']
      model_responses += f"-------------Model {responder_model_idx} responses-------------\n{first_response}\n\n\n{second_response}\n\n"

    # NOTE(review): `answers_selected` is not assigned anywhere in this cell, so it is
    # whatever an earlier cell left in the kernel -- confirm it holds the value intended
    # for evaluator-only mode.
    judge_prompt = JUDGE_COMPARISON_PROMPT.format(
        answers_selected=answers_selected.capitalize(),
        question=question,
        options_formatted=options_formatted,
        model_responses=model_responses,
    )

    # Fresh conversation for every question: the judge gets no message history.
    _, judge_response = get_response(judge_model, judge_prompt, message_history=[])

    try:
      summary, selected_option, reason, confidence = parse_judge_evaluation(judge_response)
    except Exception as parse_error:
      # Nothing is recorded for this question, so it can be retried later; report which
      # judge/question failed and why, so the failure can be located.
      print(f'Error parsing response (judge={judge_model}, question_id={question_idx}): {parse_error!r}')
      continue

    judge_records['question_id'].append(question_idx)
    judge_records['question'].append(question)
    judge_records['options'].append(options_formatted)
    judge_records['correct_answer'].append(correct_answer)
    judge_records['category'].append(category)
    judge_records['judge_prompt'].append(judge_prompt)
    judge_records['model_responses'].append(model_responses)
    judge_records['judge_response'].append(judge_response)
    judge_records['reason'].append(reason)
    judge_records['summary'].append(summary)
    judge_records['selected_option'].append(selected_option)
    judge_records['confidence'].append(confidence)

Judge: anthropic/anthropic/claude-3-5-haiku-latest


  0%|▏                                                                               | 1/600 [00:06<1:00:45,  6.09s/it]

Error parsing response


  2%|█▍                                                                               | 11/600 [01:00<57:35,  5.87s/it]

Error parsing response


  2%|██                                                                               | 15/600 [01:19<50:11,  5.15s/it]

Error parsing response


  4%|██▊                                                                              | 21/600 [01:51<51:36,  5.35s/it]

Error parsing response


  5%|████                                                                             | 30/600 [02:34<46:21,  4.88s/it]

Error parsing response


  6%|████▊                                                                            | 36/600 [03:08<55:13,  5.88s/it]

Error parsing response


  7%|█████▍                                                                           | 40/600 [03:29<52:41,  5.65s/it]

Error parsing response


  9%|███████                                                                          | 52/600 [04:34<53:36,  5.87s/it]

Error parsing response


 10%|████████▌                                                                        | 63/600 [05:33<51:07,  5.71s/it]

Error parsing response


 14%|███████████▏                                                                     | 83/600 [07:24<47:01,  5.46s/it]

Error parsing response


 17%|█████████████▋                                                                  | 103/600 [09:20<47:53,  5.78s/it]

Error parsing response


 18%|██████████████▏                                                                 | 106/600 [09:38<47:21,  5.75s/it]

Error parsing response


 19%|███████████████▍                                                                | 116/600 [10:32<42:43,  5.30s/it]

Error parsing response


 20%|████████████████▎                                                               | 122/600 [11:01<37:53,  4.76s/it]

Error parsing response


 20%|████████████████▍                                                               | 123/600 [11:05<37:10,  4.68s/it]

Error parsing response


 22%|█████████████████▊                                                              | 134/600 [12:01<39:48,  5.13s/it]

Error parsing response


 26%|████████████████████▋                                                           | 155/600 [13:58<40:56,  5.52s/it]

Error parsing response


 29%|███████████████████████▍                                                        | 176/600 [15:51<37:35,  5.32s/it]

Error parsing response


 31%|█████████████████████████                                                       | 188/600 [16:47<32:55,  4.79s/it]

Error parsing response


 32%|█████████████████████████▍                                                      | 191/600 [17:02<32:24,  4.75s/it]

Error parsing response


 35%|████████████████████████████▎                                                   | 212/600 [18:40<30:31,  4.72s/it]

Error parsing response


 38%|██████████████████████████████                                                  | 225/600 [19:43<30:41,  4.91s/it]

Error parsing response


 38%|██████████████████████████████▏                                                 | 226/600 [19:48<30:20,  4.87s/it]

Error parsing response


 39%|██████████████████████████████▉                                                 | 232/600 [20:15<30:30,  4.97s/it]

Error parsing response


 40%|███████████████████████████████▊                                                | 239/600 [20:52<32:03,  5.33s/it]

Error parsing response


 42%|█████████████████████████████████▏                                              | 249/600 [21:43<27:29,  4.70s/it]

Error parsing response


 44%|███████████████████████████████████▍                                            | 266/600 [23:10<24:54,  4.47s/it]

Error parsing response


 46%|████████████████████████████████████▉                                           | 277/600 [24:06<26:34,  4.94s/it]

Error parsing response


 49%|███████████████████████████████████████                                         | 293/600 [25:23<28:19,  5.54s/it]

Error parsing response


 50%|███████████████████████████████████████▌                                        | 297/600 [25:42<25:30,  5.05s/it]

Error parsing response


 53%|██████████████████████████████████████████▌                                     | 319/600 [27:41<23:25,  5.00s/it]

Error parsing response


 56%|████████████████████████████████████████████▍                                   | 333/600 [28:50<23:49,  5.36s/it]

Error parsing response


 56%|████████████████████████████████████████████▌                                   | 334/600 [28:55<23:03,  5.20s/it]

Error parsing response


 56%|█████████████████████████████████████████████▏                                  | 339/600 [29:18<20:48,  4.78s/it]

Error parsing response


 59%|███████████████████████████████████████████████▏                                | 354/600 [30:30<19:01,  4.64s/it]

Error parsing response


 60%|███████████████████████████████████████████████▋                                | 358/600 [30:54<22:58,  5.69s/it]

Error parsing response


 60%|████████████████████████████████████████████████▎                               | 362/600 [31:16<22:21,  5.64s/it]

Error parsing response


 62%|█████████████████████████████████████████████████▎                              | 370/600 [31:51<16:53,  4.41s/it]

Error parsing response


 63%|██████████████████████████████████████████████████▍                             | 378/600 [32:30<17:43,  4.79s/it]

Error parsing response


 66%|████████████████████████████████████████████████████▉                           | 397/600 [34:06<17:33,  5.19s/it]

Error parsing response


 66%|█████████████████████████████████████████████████████▏                          | 399/600 [34:14<15:29,  4.63s/it]

Error parsing response


 70%|███████████████████████████████████████████████████████▊                        | 419/600 [35:49<14:53,  4.94s/it]

Error parsing response


 72%|█████████████████████████████████████████████████████████▋                      | 433/600 [36:57<14:41,  5.28s/it]

Error parsing response


 74%|███████████████████████████████████████████████████████████▎                    | 445/600 [37:53<12:40,  4.91s/it]

Error parsing response


 74%|███████████████████████████████████████████████████████████▍                    | 446/600 [37:58<12:53,  5.02s/it]

Error parsing response


 76%|████████████████████████████████████████████████████████████▊                   | 456/600 [38:46<10:52,  4.53s/it]

Error parsing response


 79%|███████████████████████████████████████████████████████████████▏                | 474/600 [40:10<09:24,  4.48s/it]

Error parsing response


 80%|███████████████████████████████████████████████████████████████▌                | 477/600 [40:24<09:37,  4.69s/it]

Error parsing response


 81%|████████████████████████████████████████████████████████████████▌               | 484/600 [40:57<08:55,  4.62s/it]

Error parsing response


 82%|█████████████████████████████████████████████████████████████████▌              | 492/600 [41:33<07:58,  4.44s/it]

Error parsing response


 82%|█████████████████████████████████████████████████████████████████▋              | 493/600 [41:38<08:20,  4.68s/it]

Error parsing response


 83%|██████████████████████████████████████████████████████████████████▎             | 497/600 [41:58<08:41,  5.07s/it]

Error parsing response


 83%|██████████████████████████████████████████████████████████████████▌             | 499/600 [42:08<08:29,  5.05s/it]

Error parsing response


 87%|█████████████████████████████████████████████████████████████████████▎          | 520/600 [43:57<06:57,  5.21s/it]

Error parsing response


 89%|███████████████████████████████████████████████████████████████████████▍        | 536/600 [45:15<04:49,  4.52s/it]

Error parsing response


 90%|████████████████████████████████████████████████████████████████████████▍       | 543/600 [45:50<04:27,  4.69s/it]

Error parsing response


 93%|██████████████████████████████████████████████████████████████████████████▍     | 558/600 [47:02<03:26,  4.92s/it]

Error parsing response


 93%|██████████████████████████████████████████████████████████████████████████▋     | 560/600 [47:11<03:09,  4.73s/it]

Error parsing response


 96%|████████████████████████████████████████████████████████████████████████████▋   | 575/600 [48:23<02:17,  5.50s/it]

Error parsing response


 97%|█████████████████████████████████████████████████████████████████████████████▎  | 580/600 [48:50<01:46,  5.32s/it]

Error parsing response


 97%|█████████████████████████████████████████████████████████████████████████████▍  | 581/600 [48:54<01:37,  5.14s/it]

Error parsing response


 98%|██████████████████████████████████████████████████████████████████████████████▋ | 590/600 [49:40<01:00,  6.07s/it]

Error parsing response


 99%|██████████████████████████████████████████████████████████████████████████████▉ | 592/600 [49:50<00:43,  5.43s/it]

Error parsing response


 99%|███████████████████████████████████████████████████████████████████████████████ | 593/600 [49:56<00:38,  5.44s/it]

Error parsing response


 99%|███████████████████████████████████████████████████████████████████████████████▎| 595/600 [50:05<00:25,  5.18s/it]

Error parsing response


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [50:30<00:00,  5.05s/it]


Judge: gemini/gemini/gemini-2.0-flash


  6%|████▊                                                                            | 36/600 [00:52<36:26,  3.88s/it]

Error parsing response


 18%|██████████████▏                                                                 | 106/600 [02:50<41:55,  5.09s/it]

Error parsing response


 42%|█████████████████████████████████▏                                              | 249/600 [06:29<07:29,  1.28s/it]

Error parsing response


 93%|██████████████████████████████████████████████████████████████████████████▎     | 557/600 [14:19<01:22,  1.92s/it]

Error parsing response


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [15:17<00:00,  1.53s/it]


In [104]:
# Turn each judge's dict of column lists into a DataFrame (one frame per judge model).
dicts_jugdes_evaluator = {
    judge_name: pd.DataFrame(column_lists)
    for judge_name, column_lists in dicts_jugdes_evaluator.items()
}

In [105]:
# Inspect the evaluator-only results collected for the first model in MODEL_LIST.
dicts_jugdes_evaluator[MODEL_LIST[0]]

Unnamed: 0,question_id,question,options,category,correct_answer,model_responses,judge_prompt,judge_response,summary,selected_option,reason,confidence


In [106]:
# Write one CSV per judge model; '/' in the model name is replaced by '_' in the file name.
for model_name, model_df in dicts_jugdes_evaluator.items():
  model_df.to_csv(f"judge_evaluator_only_mode_{model_name.replace('/', '_')}.csv")