In [1]:
import os

from mistralai import Mistral

# Comma-separated list of Mistral API keys; one client is created per key.
api_keys = os.environ["MISTRAL_API_KEYS"]
model = "mistral-large-2411"

# Tolerate stray whitespace and empty entries (e.g. "key1, key2," or a trailing
# newline), which would otherwise produce clients with unusable keys.
clients = [
    Mistral(api_key=api_key.strip())
    for api_key in api_keys.split(",")
    if api_key.strip()
]
if not clients:
    # Fail here with a clear message instead of a ZeroDivisionError later in
    # `request_id % len(clients)`.
    raise ValueError("MISTRAL_API_KEYS does not contain any API key")

for client in clients:
    # Check API is alive: a tiny completion request per key surfaces invalid
    # keys before the long-running job starts.
    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "What is the best French cheese?",
            },
        ],
        max_tokens=10,
    )
    print(chat_response.model, chat_response.choices[0].message.content)

mistral-large-2411 Determining the "best" French cheese can
mistral-large-2411 Choosing the "best" French cheese can be


In [7]:
from time import sleep

from mistralai import SDKError
from openai import RateLimitError

# Default pause between consecutive requests, in seconds: the more clients
# there are to rotate through, the shorter the pause.
if len(clients) >= 3:
    SLEEP_DURATION = 0.2
elif len(clients) == 2:
    SLEEP_DURATION = 0.5
else:
    SLEEP_DURATION = 1.2

print("Sleep duration:", SLEEP_DURATION)


def wait(duration: float = SLEEP_DURATION) -> None:
    """Pause execution for ``duration`` seconds.

    Args:
        duration: Number of seconds to sleep. Defaults to the value
            ``SLEEP_DURATION`` had when this function was defined.
    """
    sleep(duration)


# Rate-limit hits recorded so far, keyed by the client's index in `clients`.
api_limit_hits_by_client_ids = {}


def init_api_limits() -> None:
    """Reset the rate-limit counters to zero for every configured client."""
    global api_limit_hits_by_client_ids

    api_limit_hits_by_client_ids = dict.fromkeys(range(len(clients)), 0)


# Running count of requests issued so far; `query_model` uses it to pick the
# client for the next request.
request_id = 0


def _record_api_limit_hit() -> None:
    """Count one rate-limit response against the client that just failed."""
    # The decorated function is expected to advance `request_id` right after
    # choosing its client (as `query_model` does), so the client that was just
    # used is the one selected by the *previous* value of the counter.
    client_id = (request_id - 1) % len(clients)
    # `.get` keeps this working even if `init_api_limits()` was never called.
    api_limit_hits_by_client_ids[client_id] = api_limit_hits_by_client_ids.get(client_id, 0) + 1

    total_hits = sum(api_limit_hits_by_client_ids.values())
    if (total_hits % 10) == 0:
        print(f"API limit hit {total_hits} times. Details: {api_limit_hits_by_client_ids}")


def repeat_if_hit_api_limit(f):
    """Decorate ``f`` so that it is retried until it returns.

    Retry policy of the returned wrapper:
      * ``RateLimitError`` or an ``SDKError`` with status code 429: record the
        hit for the client that was used, wait 2 seconds, retry.
      * ``SDKError`` with any other status code: re-raised to the caller.
      * any other ``Exception``: printed, then retried after 60 seconds.
        There is no retry cap, so a persistent failure loops until the kernel
        is interrupted.

    Args:
        f: The callable to wrap.

    Returns:
        A wrapper with the same call signature that returns ``f``'s result.
    """
    from functools import wraps

    @wraps(f)
    def wrapper(*args, **kw):
        while True:
            try:
                return f(*args, **kw)
            except RateLimitError:
                _record_api_limit_hit()
                wait(2)
            except SDKError as e:
                if e.status_code != 429:
                    raise
                _record_api_limit_hit()
                wait(2)
            except Exception as e:
                print("repeat_if_hit_api_limit -> unknown error", e)
                wait(60)

    return wrapper


@repeat_if_hit_api_limit
def query_model(messages):
    """Send ``messages`` to the chat model using the next client in rotation.

    Args:
        messages: Chat messages passed to ``client.chat.complete`` unchanged.

    Returns:
        The response object returned by ``client.chat.complete``.
    """
    global request_id

    slot = request_id % len(clients)
    # Advance the counter before the call so that a retry triggered by the
    # decorator moves on to the following client.
    request_id += 1
    return clients[slot].chat.complete(model=model, messages=messages)

Sleep duration: 0.5


In [8]:
import os
import sys

# Put the parent of the current working directory on `sys.path` so that the
# `utils` package imported below can be found.
dir2 = os.path.abspath("")
dir1 = os.path.dirname(dir2)
if dir1 not in sys.path:
    sys.path.append(dir1)

import importlib

import utils.prompt as prompt

# Re-import the module so that edits made to `utils/prompt.py` since it was
# first imported in this kernel take effect without restarting the kernel.
importlib.reload(prompt)

<module 'utils.prompt' from '/Users/aigoncharov/dev/sktech/phi-4/utils/prompt.py'>

In [9]:
import csv
import os.path
import re

import pandas as pd
from tqdm import tqdm

import utils.prompt as prompt

# Labels of the complexity scale used in the prompt, kept for reference:
# difficulty = ["middle_school", "high_school", "undergraduate", "postgraduate", "phd"]

# Integer scores accepted from the judge: 1 through 10 inclusive.
ratings = [*range(1, 11)]

# Counts of model replies that did not yield a usable value.
invalid_complexities = invalid_ratings = 0


def model_as_judge(df, index, system_prompt, user_prompt, answer):
    """Ask the model to grade ``answer`` and store the rating in ``df``.

    The judge sees the instructions originally given to the assistant
    (``system_prompt``), the question (``user_prompt``) and the assistant's
    ``answer``, and is told to reply with ``[[rating]]`` on a 1-10 scale.

    Args:
        df: DataFrame to update; a valid rating is written to
            ``df.at[index, "masj_num_rating"]``.
        index: Row label of the entry being judged.
        system_prompt: Instructions the assistant was given.
        user_prompt: Question the assistant was asked.
        answer: Assistant reply to evaluate.

    Side effects:
        Increments the global ``invalid_ratings`` when no ``[[<digits>]]``
        marker is found (the raw reply is printed) or when the number is not
        one of ``ratings`` (nothing is printed).
    """
    global invalid_ratings

    chat_response = query_model(
        [
            {
                "role": "system",
                "content": 'Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user request displayed below. Your evaluation should consider factors such as the following all the settings in the system prompt, correspondences to the context of the user, the helpfulness, relevance and accuracy. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:"Rating: [[6]]".',
            },
            {
                "role": "user",
                "content": f"""
                [Instructions for Assistant]
                {system_prompt}
                [End of Instructions for Assistant]

                [Question]
                {user_prompt}
                [End of Question]

                [The Start of Assistant’s Answer]
                {answer}
                [The End of Assistant’s Answer]
                """,
            },
        ]
    )
    response = chat_response.choices[0].message.content

    try:
        rating_int = int(re.search(r"\[\[(\d+?)\]\]", response).group(1))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no "[[<digits>]]" marker in the reply (e.g. "[[0.55]]"
        # or a bare "Rating: 9"); TypeError: the reply is not text;
        # ValueError: digits that int() refuses to convert.
        # Anything else (including KeyboardInterrupt) is left to propagate.
        print(f"Could not extract rating from response:\n{response}\n")
        invalid_ratings += 1
        return

    # Written outside the try block so that a failure to update the frame is
    # not misreported as an extraction problem.
    if rating_int in ratings:
        df.at[index, "masj_num_rating"] = rating_int
    else:
        invalid_ratings += 1


def estimate_complextiy_with_model(df, index, system_prompt, user_prompt):
    """Ask the model for a complexity score in [0, 1] and store it in ``df``.

    Args:
        df: DataFrame to update; a valid score is written to
            ``df.at[index, "masj_num_complexity"]``.
        index: Row label of the entry being scored.
        system_prompt: System message sent to the model.
        user_prompt: Question text, wrapped in ``[Question Start]`` /
            ``[Question End]`` markers before being sent.

    Returns:
        The raw text of the model's reply, whether or not a score was found.

    Side effects:
        Increments the global ``invalid_complexities`` when the reply has no
        ``[[...]]`` marker holding a number (the raw reply is printed) or when
        the number lies outside [0, 1] (nothing is printed).
    """
    global invalid_complexities

    chat_response = query_model(
        [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": f"""
                [Question Start]
                {user_prompt}
                [Question End]
                """,
            },
        ]
    )
    response = chat_response.choices[0].message.content

    try:
        complexity = float(re.search(r"\[\[(.+?)\]\]", response).group(1))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no "[[...]]" marker (e.g. a truncated reply);
        # TypeError: the reply is not text; ValueError: the marker does not
        # hold a number (e.g. the literal "[[complexity]]").
        # Anything else (including KeyboardInterrupt) is left to propagate.
        print(f"Could not extract complexity from response:\n{response}\n")
        invalid_complexities += 1
        return response

    # Written outside the try block so that a failure to update the frame is
    # not misreported as an extraction problem. NaN fails the range check.
    if 0.0 <= complexity <= 1.0:
        df.at[index, "masj_num_complexity"] = complexity
    else:
        invalid_complexities += 1

    return response


# Write intermediate results to disk after this many newly processed rows.
DUMP_EVERY = 50


def estimate_dataset(in_filename, out_filename, get_question_from_row, get_options_from_row):
    """Score every row of a TSV dataset for complexity and judge that score.

    If ``out_filename`` already exists, processing resumes from it and rows
    that already hold a complexity in [0, 1] and a rating in ``ratings`` are
    skipped. Otherwise the data is loaded from ``in_filename``. For each
    remaining row the model is asked for a complexity estimate and then for a
    rating of that estimate. Results are stored in the ``masj_num_complexity``
    and ``masj_num_rating`` columns and written to ``out_filename`` every
    ``DUMP_EVERY`` processed rows and once more at the end.

    Args:
        in_filename: Path of the source TSV file.
        out_filename: Path of the TSV file holding (partial) results.
        get_question_from_row: Callable mapping a row to its question.
        get_options_from_row: Callable mapping a row to its answer options.

    Returns:
        The DataFrame including the two result columns.

    Side effects:
        Resets the global invalid-reply counters and the rate-limit counters,
        calls the model API and writes ``out_filename``.
    """
    global invalid_complexities
    global invalid_ratings

    # Dialect of the files this function writes: tab-separated, never quoted,
    # special characters escaped with a backslash.
    unquoted_tsv = {"sep": "\t", "quoting": csv.QUOTE_NONE, "quotechar": "", "escapechar": "\\"}

    def save_progress():
        df.to_csv(out_filename, index=False, **unquoted_tsv)

    if os.path.isfile(out_filename):
        df = pd.read_csv(out_filename, header=0, **unquoted_tsv)
    else:
        try:
            df = pd.read_csv(in_filename, sep="\t", header=0)
        except Exception:
            # The default (quoted) dialect could not read the file; retry with
            # quoting disabled. KeyboardInterrupt is no longer swallowed here.
            df = pd.read_csv(in_filename, header=0, **unquoted_tsv)

    invalid_complexities = 0
    invalid_ratings = 0
    init_api_limits()

    if "masj_num_complexity" not in df.columns:
        df["masj_num_complexity"] = -1.0
    if "masj_num_rating" not in df.columns:
        df["masj_num_rating"] = 0

    # The same instructions are used for every row, so build them once.
    complexity_system_prompt = 'You are an expert in the topic of the question. Please act as an impartial judge and evaluate the complexity of the multiple-choice question with options below. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must not answer the question. You must rate the question complexity as a number from 0 to 1 following the following scale as a reference: middle_school - 0.0-0.2, high_school - 0.2-0.4, undergraduate - 0.4-0.6, postgraduate - 0.6-0.8, phd - 0.8-1.0. You must return the complexity by strictly following this format: "[[complexity]]", for example: "Complexity: [[0.55]]", which corresponds to the undergraduate level.'

    # Counts only rows that required model calls, not rows skipped on resume.
    meaningful_iteration = 0
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        # Skip rows that already carry both a valid complexity and a valid rating.
        if (
            isinstance(df.at[index, "masj_num_complexity"], float)
            and df.at[index, "masj_num_complexity"] >= 0.0
            and df.at[index, "masj_num_complexity"] <= 1.0
            and df.at[index, "masj_num_rating"] in ratings
        ):
            continue

        meaningful_iteration += 1

        complexity_user_prompt = prompt.get_user_prompt(get_question_from_row(row), get_options_from_row(row))

        response_complexity = estimate_complextiy_with_model(
            df, index, complexity_system_prompt, complexity_user_prompt
        )
        wait()

        model_as_judge(df, index, complexity_system_prompt, complexity_user_prompt, response_complexity)
        wait()

        if meaningful_iteration % DUMP_EVERY == 0:
            save_progress()
            # Total across all clients; the previous code read an undefined
            # name (`api_limit_hits`) here and failed on a fresh kernel.
            total_api_limit_hits = sum(api_limit_hits_by_client_ids.values())
            print(f"Over {meaningful_iteration} iterations we hit {total_api_limit_hits} API limits")

    save_progress()
    print(
        f"Processed dataset {out_filename}. Total entries: {df.shape[0]}. Invalid complexities: {invalid_complexities}. Invalid ratings: {invalid_ratings}"
    )
    return df

In [10]:
# MMLU
import ast

# The `options` column is parsed with `ast.literal_eval` before being passed to
# the prompt builder (presumably it holds the text of a Python list — confirm
# against the input file).
estimate_dataset(
    "../data/mmlu_pro_stem_w_maj_w_entropyphi4.tsv",
    "../data/mmlu_pro_stem_w_numerical_maj_w_entropyphi4.tsv",
    get_question_from_row=lambda record: record["question"],
    get_options_from_row=lambda record: ast.literal_eval(record["options"]),
)

  0%|          | 52/12018 [00:05<20:55,  9.53it/s]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the steps required to solve the problem. The question involves basic arithmetic operations (subtraction and division), which are typically covered in middle school. The complexity is slightly increased by the number of options, but the core calculations remain straightforward.

Rating: [[0.12]]



 17%|█▋        | 2019/12018 [05:17<34:27,  4.84it/s]  

Over 50 iterations we hit 0 API limits


 25%|██▌       | 3040/12018 [08:15<22:16,  6.72it/s]  

Could not extract rating from response:
The assistant's explanation is clear and concise, accurately describing the principles and knowledge required to solve the problem. The question indeed involves understanding electrolysis, Faraday's laws, and the molar mass of chlorine, which are typically covered in undergraduate-level chemistry courses. The calculation involves applying these principles to determine the amount of chlorine produced, which requires a solid foundation in these concepts.

Complexity: [[0.55]]



 34%|███▍      | 4146/12018 [11:01<21:07,  6.21it/s]

Over 100 iterations we hit 0 API limits
Could not extract rating from response:
The explanation provided by the assistant is clear and concise. The assistant correctly identifies that the question involves basic arithmetic and proportional reasoning, which are typically taught in middle school mathematics classes. The assistant's evaluation is objective and relevant to the context of the question.

Complexity: [[0.1]]



 41%|████      | 4908/12018 [13:28<41:28,  2.86it/s]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the complexity of the question. The question requires an understanding of combinatorics and probability theory, specifically as it applies to poker hands. This level of understanding is typically developed in undergraduate-level mathematics or statistics courses. The assistant correctly identifies that the solution involves calculating the number of favorable outcomes (four cards of the same rank and one of a different rank) and dividing by the total number of possible outcomes (all possible poker hands). This process requires a solid foundation in the relevant mathematical concepts.

Complexity: [[0.55]]



 44%|████▍     | 5311/12018 [14:19<26:05,  4.28it/s]

Could not extract rating from response:
The assistant's response effectively breaks down the requirements to understand and answer the question, highlighting the need for knowledge in Unix file descriptors, inter-process communication, and the OKWS design. The response also correctly identifies that the question delves into advanced topics in Unix programming, which aligns with a postgraduate level of complexity. The explanation is clear, concise, and relevant to the task of evaluating the question's complexity.

Complexity: [[0.7]]



 45%|████▌     | 5413/12018 [14:40<28:11,  3.91it/s]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the requirements to answer the question correctly. The question indeed requires knowledge of various machine learning algorithms and their categorizations, which is typically covered in undergraduate-level machine learning courses. The assistant objectively assessed the complexity of the question without answering it, which aligns with the given instructions.

Complexity: [[0.45]]



 48%|████▊     | 5783/12018 [15:28<11:30,  9.03it/s]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to solve the given problem. The response highlights that the question involves electrical engineering concepts such as induction motors, power factor correction, and capacitor calculations, which are typically covered in undergraduate-level courses. This explanation is relevant and accurate, as it correctly identifies the complexity of the question without revealing the answer.

Complexity: [[0.55]]



 51%|█████▏    | 6165/12018 [16:42<26:39,  3.66it/s]

Over 150 iterations we hit 0 API limits


 62%|██████▏   | 7507/12018 [18:49<06:48, 11.03it/s]

Could not extract rating from response:
The[control_746]



 73%|███████▎  | 8785/12018 [21:51<08:58,  6.00it/s]

Over 200 iterations we hit 0 API limits
Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the complexity of the given multiple-choice question. The response highlights the key legal principles involved (negligence, proximate cause, foreseeability, and intervening/superseding causes) and notes the need to apply these principles to a complex fact pattern. This explanation is helpful, relevant, and accurate, as it directly corresponds to the context of the user's request and the settings in the system prompt.

The assistant's response objectively evaluates the question's complexity, considering the required legal knowledge and the need to analyze multiple events and parties. The response then rates the question's complexity as 0.7 on the given scale, corresponding to the postgraduate level. This rating seems appropriate, given the required understanding of tort law principles.

Rating: 10



 77%|███████▋  | 9227/12018 [22:53<05:46,  8.04it/s]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the complexity of the given question. The response accurately identifies that the question involves quantum tunneling and requires the application of the transmission coefficient formula derived from the Schrödinger equation. This indicates a good understanding of the context and the underlying principles needed to solve the problem.

The response also correctly notes that solving this question requires knowledge of quantum mechanics, including Planck's constant, and the ability to handle exponential functions and complex numbers. This aligns well with the postgraduate level of complexity, as it demands a deeper understanding of advanced physics concepts and mathematical skills.

Overall, the response is helpful, relevant, and accurate in assessing the complexity of the question.

Complexity: [[0.65]]



 83%|████████▎ | 9917/12018 [24:22<07:00,  5.00it/s]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the concepts required to solve the problem, highlighting the need for understanding thermodynamics and the behavior of ideal gases. The response correctly identifies that the question involves calculating the change in entropy during an isothermal reversible expansion, which is a topic typically covered in undergraduate-level chemistry or physics courses. The response also mentions the relevant formula and the need to use the ideal gas law, demonstrating a good grasp of the subject matter.

Complexity: [[0.55]]



 83%|████████▎ | 10026/12018 [27:54<4:41:39,  8.48s/it]

Over 250 iterations we hit 0 API limits


 84%|████████▎ | 10040/12018 [29:37<4:11:31,  7.63s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the principles required to solve the problem. The question involves understanding the photoelectric effect and band gap energy in semiconductors, which are typically covered in undergraduate-level physics or material science courses. The response correctly identifies that the solution requires the use of Planck's equation (E=hc/λ), which is a fundamental concept in quantum mechanics.

The explanation is helpful, relevant, and accurate, providing the necessary context for understanding the complexity of the question. The response does not answer the question directly, adhering to the instructions provided.

Rating the complexity of the question on the given scale:

Complexity: [[0.5]]



 84%|████████▎ | 10041/12018 [29:43<3:55:58,  7.16s/it]

Could not extract complexity from response:
To evaluate the complexity of this question, we need to consider the background knowledge required to understand and solve it. The question involves calculating the uncertainty of the z-component of the angular momentum ($L_z$) for a specific stationary state ($2p_z$) of the hydrogen atom. This topic falls within the realm of quantum mechanics, specifically the study of atomic structure and angular momentum.

To address this question, one needs to be familiar with:
1. **Quantum Mechanics**: Understanding of quantum states, wavefunctions, and operators.
2. **Angular Momentum**: Knowledge of angular momentum operators, particularly $L_z$, and their eigenvalues.
3. **Hydrogen Atom**: Familiarity with the stationary states of the hydrogen atom, including the $2p_z$ state.
4. **Uncertainty Principle**: Understanding how to calculate uncertainties in quantum mechanics.

Given these requirements, the question is not suitable for middle school or hig

 84%|████████▍ | 10076/12018 [34:39<4:06:58,  7.63s/it]

Over 300 iterations we hit 0 API limits


 84%|████████▍ | 10122/12018 [40:22<4:15:20,  8.08s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the concepts required to understand the question, highlighting the relationship between electromotive force (emf), the equilibrium constant (K), and the reaction quotient (Q). The response correctly identifies that this topic is typically covered in undergraduate-level chemistry courses, specifically in physical chemistry. The explanation is relevant, accurate, and helpful in understanding the complexity of the question without directly answering it.

Complexity: [[0.55]]



 84%|████████▍ | 10126/12018 [40:50<3:55:44,  7.48s/it]

Over 350 iterations we hit 0 API limits


 85%|████████▍ | 10176/12018 [46:51<4:08:22,  8.09s/it]

Over 400 iterations we hit 0 API limits


 85%|████████▌ | 10221/12018 [52:03<3:22:34,  6.76s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The problem involves understanding error propagation and the use of differentials, which are concepts typically covered in undergraduate-level physics or mathematics courses. The response correctly identifies that the question requires applying the formula for the propagation of uncertainty for the product of two measured quantities, which is indeed a more advanced topic than what is usually covered in high school.

Complexity: [[0.6]]



 85%|████████▌ | 10226/12018 [52:38<3:34:53,  7.19s/it]

Over 450 iterations we hit 0 API limits


 85%|████████▌ | 10247/12018 [54:59<3:04:53,  6.26s/it]

Could not extract rating from response:
The assistant's response provides a clear and objective evaluation of the complexity of the multiple-choice question. The explanation highlights the key factors that contribute to the question's complexity, including the need to understand principles of tort law, specifically strict liability, and the requirement to apply these principles to a detailed factual scenario. The response also notes the length of the question, the number of options, and the legal analysis required, which are all relevant considerations for determining complexity.

The complexity rating of 0.7 aligns with the postgraduate level, which seems appropriate given the detailed legal knowledge and analytical skills required to answer the question correctly.

Rating: 9



 86%|████████▌ | 10276/12018 [58:19<2:51:36,  5.91s/it]

Over 500 iterations we hit 0 API limits


 86%|████████▌ | 10326/12018 [1:04:18<3:02:16,  6.46s/it]

Over 550 iterations we hit 0 API limits


 86%|████████▌ | 10330/12018 [1:04:41<2:41:37,  5.74s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the context and the knowledge required to answer the question. The question indeed falls under the domain of international maritime law and requires an understanding of jurisdiction principles, which are typically studied at the postgraduate level, especially for those specializing in international law or maritime law. The response correctly identifies that the question involves differentiating between the roles and jurisdictions of various states and international organizations, which is a complex task.

Given the specificity and the advanced nature of the topic, the complexity rating is appropriate.

Complexity: [[0.75]]



 86%|████████▋ | 10368/12018 [1:09:12<3:52:22,  8.45s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the complexity of the question. The response highlights that the question involves understanding and applying Torricelli's principle, which is a concept in fluid dynamics and physics. It also mentions that solving the problem requires setting up and solving a differential equation, indicating the need for knowledge of calculus. This explanation is relevant and accurate, as it correctly identifies the key concepts and skills required to solve the problem. The response is helpful in giving a clear sense of the complexity of the question.

Rating the complexity as [[0.55]] is appropriate given the scale provided. The question requires understanding of physics principles and the ability to apply calculus, which are typically covered at the undergraduate level.

Complexity: [[0.55]]



 86%|████████▋ | 10371/12018 [1:09:34<3:36:43,  7.90s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question involves basic physics concepts, specifically displacement, which is typically introduced in high school physics courses. The student must calculate the total distance traveled by adding the distance driven in the truck and the distance walked. The question involves simple addition and understanding that displacement is the total distance from the starting point to the ending point. The assessment correctly places the complexity at the high school level.

Complexity: [[0.3]]



 86%|████████▋ | 10376/12018 [1:10:12<3:29:03,  7.64s/it]

Over 600 iterations we hit 0 API limits


 87%|████████▋ | 10426/12018 [1:16:20<3:54:30,  8.84s/it]

Over 650 iterations we hit 0 API limits


 87%|████████▋ | 10470/12018 [1:21:16<2:19:17,  5.40s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question involves the application of the Einstein-Stokes equation, which requires knowledge of physical chemistry and biochemistry, typically covered in undergraduate or postgraduate levels. The calculations involve understanding the relationship between the diffusion coefficient, molecular radius, and molecular weight, which are advanced concepts.

Complexity: [[0.65]]



 87%|████████▋ | 10476/12018 [1:21:58<3:05:25,  7.22s/it]

Over 700 iterations we hit 0 API limits


 88%|████████▊ | 10526/12018 [1:27:57<2:35:04,  6.24s/it]

Over 750 iterations we hit 0 API limits


 88%|████████▊ | 10564/12018 [1:32:39<3:33:35,  8.81s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question requires a deep understanding of political philosophy, specifically the theories of Thomas Hobbes and John Locke, which are typically studied at the undergraduate or postgraduate level. The response highlights that answering the question necessitates specific academic knowledge rather than general knowledge.

Complexity: [[0.65]]



 88%|████████▊ | 10574/12018 [1:33:42<2:31:01,  6.28s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the principles involved in the question, highlighting the need for a solid understanding of physics, specifically wave mechanics and acoustics. The response accurately identifies the complexity of the question, which involves understanding fundamental frequencies, overtones, and harmonics, as well as the differences between stopped and open pipes. This level of understanding is typically expected at the undergraduate level.

Complexity: [[0.55]]



 88%|████████▊ | 10576/12018 [1:33:58<2:53:55,  7.24s/it]

Over 800 iterations we hit 0 API limits


 88%|████████▊ | 10597/12018 [1:36:41<2:48:09,  7.10s/it]

Could not extract complexity from response:
This question involves understanding and applying the concepts of reflection and refraction of light, specifically dealing with unpolarized light, the Fresnel equations, and the index of refraction. To solve this question, one needs to:

1. Understand the behavior of light when it hits a surface with a different refractive index.
2. Apply the Fresnel equations to determine the reflectance for both s and p polarizations.
3. Calculate the total reflectance for unpolarized light.
4. Determine the ratio of the components of the E-vector in the reflected beam.

The calculation requires knowledge of optics, specifically the behavior of light at interfaces, and the ability to apply mathematical formulas to determine the required values. This level of understanding and calculation is typically covered in undergraduate-level physics courses, specifically in optics or electromagnetism modules.

Complexity: [[complexity]]

Could not extract rating from 

 88%|████████▊ | 10626/12018 [1:40:12<2:59:09,  7.72s/it]

Over 850 iterations we hit 0 API limits


 89%|████████▊ | 10640/12018 [1:41:48<2:27:15,  6.41s/it]

Could not extract rating from response:
The explanation provided by the assistant is clear and concise, accurately describing the physical concepts and calculations required to solve the problem. The assistant correctly identifies that the question involves rotational kinematics and angular deceleration, and that the student must apply kinematic equations to find the solution. The explanation also notes the proximity of the options, highlighting the need for precise calculations.

However, the complexity rating seems to be slightly underestimated. While the question does involve undergraduate-level physics, the specific topic of rotational kinematics and the application of angular deceleration might be more typically covered in advanced high school physics or early undergraduate courses. Given the need to understand and apply these concepts, the complexity might be better rated at the higher end of the undergraduate scale.

Rating: [[0.55]]



 89%|████████▉ | 10676/12018 [1:46:17<2:32:48,  6.83s/it]

Over 900 iterations we hit 0 API limits


 89%|████████▉ | 10706/12018 [1:49:55<2:20:42,  6.43s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise evaluation of the question's complexity. The explanation highlights the need for specialized knowledge in philosophy, particularly familiarity with Peter Singer's work and the philosophies of the listed options. This level of knowledge and analytical comparison is typically expected at the postgraduate level.

Complexity: [[0.7]]



 89%|████████▉ | 10707/12018 [1:50:00<2:09:57,  5.95s/it]

Could not extract complexity from response:
This question is asking about the formal requirements for treaties, which falls under international law. To answer this question, one would need to have studied international law, specifically the Vienna Convention on the Law of Treaties (VCLT), which is typically covered in undergraduate law degrees or international relations degrees. The options provided require a nuanced understanding of the topic, including the distinction between the designation and form of a treaty, the role of ratification, and the significance of signatures by heads of state. Therefore, this question is not merely about recalling facts but applying and distinguishing legal concepts.

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's complexity. It accurately identifies the topic as international law and specifies the relevant legal framework, the Vienna Convention on the Law of Treaties (VCLT). 

 89%|████████▉ | 10726/12018 [1:52:19<3:26:25,  9.59s/it]

Over 950 iterations we hit 0 API limits


 90%|████████▉ | 10761/12018 [1:56:49<3:03:51,  8.78s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the complexity of the question. The question indeed requires an understanding of ethical philosophies and specific viewpoints of theorists Kamm and Sandel, particularly their arguments about the moral distinctions between treatment and enhancement. This level of understanding typically aligns with postgraduate-level studies in philosophy or bioethics, where students are expected to engage with nuanced debates and philosophical reasoning.

Complexity: [[0.75]]



 90%|████████▉ | 10776/12018 [1:58:32<2:35:14,  7.50s/it]

Over 1000 iterations we hit 0 API limits


 90%|████████▉ | 10801/12018 [2:01:30<2:53:14,  8.54s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's complexity. It accurately identifies that the question requires an understanding of U.S. constitutional law and the application of constitutional provisions to a specific scenario. The response highlights that the question involves analyzing the actions of a city council and a utility company within the framework of constitutional rights, and it assumes knowledge of various constitutional clauses. This analysis is relevant and accurate, as the question indeed requires a detailed understanding of constitutional law to determine the strongest grounds for a challenge.

Complexity: [[0.65]]



 90%|█████████ | 10826/12018 [2:04:29<2:14:55,  6.79s/it]

Over 1050 iterations we hit 0 API limits


 90%|█████████ | 10876/12018 [2:10:27<2:03:50,  6.51s/it]

Over 1100 iterations we hit 0 API limits


 91%|█████████ | 10902/12018 [2:13:46<2:26:06,  7.86s/it]

Could not extract rating from response:
The response provided by the AI assistant is generally well-reasoned and corresponds to the context of the user's request. The assistant correctly identifies that the question involves understanding wave mechanics, material properties, and the interplay between theoretical calculations and physical realizability. The explanation highlights the need to know specific formulas and consider conceptual understanding, which is relevant and accurate.

However, the complexity rating of [[0.75]] seems a bit high for the question at hand. While the question does require a good understanding of wave mechanics and material properties, it does not delve into highly specialized or advanced topics that would typically be associated with postgraduate or PhD-level work. Instead, it is more aligned with undergraduate-level physics, which would place it around the 0.4-0.6 range.

Rating the complexity of the question: [[0.55]]



 91%|█████████ | 10926/12018 [2:16:46<1:59:42,  6.58s/it]

Over 1150 iterations we hit 0 API limits


 91%|█████████ | 10929/12018 [2:17:02<1:45:43,  5.83s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge and skills required to solve the given chemical equilibrium problem. The response highlights the need to understand chemical equilibrium concepts, set up an ICE table, and perform algebraic calculations involving exponents. The mention of the high temperature scenario adds a layer of complexity, suggesting that the problem might be suited for undergraduate-level students who have a solid foundation in these concepts.

Complexity: [[0.50]]



 91%|█████████▏| 10976/12018 [2:22:48<1:48:51,  6.27s/it]

Over 1200 iterations we hit 0 API limits


 91%|█████████▏| 10982/12018 [2:23:32<2:05:03,  7.24s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to answer the question. The question indeed tests the understanding of subatomic particles and their properties, which is typically covered in undergraduate-level physics courses. The assistant objectively assessed the complexity of the question without providing the answer, adhering to the instructions given.

Complexity: [[0.5]]



 92%|█████████▏| 11026/12018 [2:29:03<1:59:51,  7.25s/it]

Over 1250 iterations we hit 0 API limits


 92%|█████████▏| 11076/12018 [2:34:43<1:38:01,  6.24s/it]

Over 1300 iterations we hit 0 API limits


 93%|█████████▎| 11126/12018 [2:40:45<1:47:52,  7.26s/it]

Over 1350 iterations we hit 0 API limits


 93%|█████████▎| 11176/12018 [2:46:38<1:40:57,  7.19s/it]

Over 1400 iterations we hit 0 API limits


 93%|█████████▎| 11186/12018 [2:47:46<1:45:13,  7.59s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to answer the question. The question indeed delves into the nutritional content of various food groups and their amino acid profiles, which are topics typically covered in undergraduate-level nutrition courses. The response accurately reflects the complexity of the question without providing the actual answer, adhering to the instructions given.

Complexity: [[0.55]]



 93%|█████████▎| 11190/12018 [2:48:10<1:30:14,  6.54s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question requires an understanding of special relativity, including the Lorentz factor, relativistic mass, and kinetic energy calculations. These concepts are typically covered in advanced undergraduate or postgraduate physics courses, which aligns with the complexity rating provided.

Complexity: [[0.7]]



 93%|█████████▎| 11192/12018 [2:48:26<1:43:18,  7.50s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the steps required to solve the problem, highlighting the need to understand buoyancy, net force, and Newton's second law of motion. The explanation correctly identifies the key concepts and calculations involved, including unit conversions and algebraic manipulation. This indicates that the question is suited for an undergraduate level of complexity.

Complexity: [[0.45]]



 93%|█████████▎| 11222/12018 [2:51:42<1:40:11,  7.55s/it]

Could not extract complexity from response:
This question requires an understanding of genetic regulation, specifically the lac operon in E. coli, which is typically covered in undergraduate-level biology or molecular biology courses. The question assumes knowledge of the lac operon's components and their roles, including the repressor, lactose, and glucose. The question also requires critical thinking to apply knowledge of the lac operon to a mutation scenario. The options provided are detailed and require a solid understanding of the topic to evaluate.

Complexity:



 93%|█████████▎| 11226/12018 [2:52:13<1:48:51,  8.25s/it]

Over 1450 iterations we hit 0 API limits


 94%|█████████▎| 11251/12018 [2:55:24<1:30:27,  7.08s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to answer the question. The question indeed involves understanding biochemistry, specifically the biosynthesis of eicosanoids and the classification of fatty acids. This topic is typically covered in undergraduate-level biochemistry courses, making the complexity rating of 0.55 appropriate.

Complexity: [[0.55]]



 94%|█████████▍| 11272/12018 [2:57:46<1:35:46,  7.70s/it]

Could not extract complexity from response:
The question asks for an understanding of Ashford's argument regarding dependency on aid agencies among those in extreme poverty. The options are not overly complex. Although the question does require some critical thinking and understanding of philosophical and political arguments, which might be more typical of undergraduate-level courses.

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's complexity. It acknowledges that the question requires an understanding of philosophical and political arguments, which is typically expected at the undergraduate level. The response is helpful, relevant, and accurate in its assessment.

Complexity: [[0.5]]



 94%|█████████▍| 11276/12018 [2:58:14<1:39:22,  8.04s/it]

Over 1500 iterations we hit 0 API limits


 94%|█████████▍| 11305/12018 [3:01:42<1:28:57,  7.49s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise evaluation of the question's complexity. The explanation highlights the necessary understanding of physics concepts related to sound waves and echoes, as well as the ability to perform a simple calculation using the given speed of sound and time delay. This assessment aligns well with the high school level, where students typically learn about waves and basic physics calculations.

Complexity: [[0.35]]



 94%|█████████▍| 11326/12018 [3:04:10<1:41:42,  8.82s/it]

Over 1550 iterations we hit 0 API limits


 95%|█████████▍| 11375/12018 [3:10:03<1:20:06,  7.47s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to answer the question. The question indeed requires a specific understanding of molecular genetics and thalassemia mutations, which is typically covered at the undergraduate level in subjects like genetics or molecular biology. The assistant objectively assessed the complexity of the question without providing the answer, adhering to the instructions given.

Complexity: [[0.55]]



 95%|█████████▍| 11376/12018 [3:10:07<1:10:04,  6.55s/it]

Over 1600 iterations we hit 0 API limits


 95%|█████████▍| 11410/12018 [3:14:04<1:01:15,  6.05s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the principles involved in solving the problem. The question requires an understanding of the uncertainty principle, which is typically covered in undergraduate-level physics courses. The response accurately identifies the key concepts and the mathematical operations needed to solve the problem, which aligns well with the undergraduate level of complexity.

Complexity: [[0.55]]



 95%|█████████▌| 11426/12018 [3:15:45<1:00:20,  6.12s/it]

Over 1650 iterations we hit 0 API limits


 95%|█████████▌| 11457/12018 [3:19:17<56:54,  6.09s/it]  

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to solve the problem, specifically mentioning Snell's Law and the need to apply it using given values. This indicates that the question is likely suited for undergraduate students who have studied optics. The explanation is helpful, relevant, and accurate in the context of the question.

Complexity: [[0.45]]



 95%|█████████▌| 11476/12018 [3:21:41<1:14:42,  8.27s/it]

Over 1700 iterations we hit 0 API limits


 96%|█████████▌| 11493/12018 [3:24:24<59:47,  6.83s/it]  

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's complexity. It accurately identifies that the question requires an understanding of philosophical terminology and the specific work of R.M. Hare, a prominent moral philosopher. The response highlights the need to distinguish between subtle differences in meaning among the options provided, which is a characteristic of more advanced levels of study.

The complexity rating of 0.7 is appropriate, as it corresponds to the postgraduate level, reflecting the specialized knowledge and critical thinking required to answer the question.

Complexity: [[0.7]]



 96%|█████████▌| 11507/12018 [3:26:09<1:17:45,  9.13s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's context and the knowledge required to solve it. The question involves understanding genetic inheritance patterns, specifically X-linked recessive conditions, and applying the Hardy-Weinberg principle. This level of knowledge is typically covered in undergraduate genetics courses, making the complexity rating appropriate.

Complexity: [[0.55]]



 96%|█████████▌| 11521/12018 [3:27:49<57:27,  6.94s/it]  

Could not extract rating from response:
This evaluation focuses on the complexity of the multiple-choice question provided. The question involves the application of the Clausius-Clapeyron equation, which is a fundamental concept in physical chemistry, typically covered in undergraduate-level courses. The problem requires understanding phase equilibria, the effect of pressure on phase transitions, and the ability to apply these concepts using given data on densities and molar entropies. Additionally, it involves unit conversions and substitution into the formula to find the new equilibrium temperature.

Complexity: [[0.55]]



 96%|█████████▌| 11526/12018 [3:28:53<1:20:42,  9.84s/it]

Over 1750 iterations we hit 0 API limits
Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the principles required to solve the problem. The question involves understanding and applying the principle of conservation of mechanical energy, specifically in the context of gravitational potential energy. This level of understanding is typically expected at the postgraduate level, where students have a solid foundation in physics and can apply these principles to complex scenarios.

The assistant correctly identifies the need to convert initial kinetic energy into gravitational potential energy and the use of relevant formulas, which aligns with the complexity of postgraduate-level physics problems.

Complexity: [[0.75]]



 96%|█████████▋| 11576/12018 [3:34:59<1:11:24,  9.69s/it]

Over 1800 iterations we hit 0 API limits


 97%|█████████▋| 11606/12018 [3:38:43<51:58,  7.57s/it]  

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the principles and calculations required to solve the problem. The question involves applying Archimedes' principle to determine the weight of one truck based on the displacement of water, which is a concept typically covered in undergraduate-level physics courses. The response accurately identifies the complexity of the question by considering the necessary calculations and the underlying physical principles.

Complexity: [[0.55]]



 97%|█████████▋| 11615/12018 [3:39:54<51:03,  7.60s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question involves calculating the relative intensities of secondary maxima in a single-slit Fraunhofer diffraction pattern, which requires a solid understanding of physical optics and the mathematical tools used to describe diffraction patterns. This topic is typically covered in undergraduate-level physics courses, making it appropriate for students at that level.

The assistant correctly identifies that solving this problem requires familiarity with mathematical descriptions such as Bessel functions or sinc functions, which are advanced concepts usually introduced in undergraduate physics curricula.

Complexity: [[0.55]]



 97%|█████████▋| 11626/12018 [3:41:22<55:25,  8.48s/it]  

Over 1850 iterations we hit 0 API limits


 97%|█████████▋| 11639/12018 [3:42:48<39:27,  6.25s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the principles behind the question, noting that it involves understanding the relationship between photon energy and wavelength in the electromagnetic spectrum. The response correctly identifies that this concept is typically introduced in high school physics but is also fundamental to undergraduate studies in physics and chemistry. The explanation is relevant, accurate, and helpful in understanding the context and complexity of the question without directly answering it.

Complexity: [[0.45]]



 97%|█████████▋| 11647/12018 [3:43:42<44:53,  7.26s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's context and the underlying principles it assesses. The response highlights that the question requires an understanding of Newton's laws of motion and the law of universal gravitation, which are typically covered in undergraduate-level physics courses. The explanation is relevant, accurate, and helpful in understanding the complexity of the question.

Rating the complexity of the question on a scale from 0 to 1, the assistant's evaluation of [[0.55]] is appropriate, as the question aligns with undergraduate-level physics concepts.

Complexity: [[0.55]]



 97%|█████████▋| 11668/12018 [3:46:13<49:33,  8.50s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the concepts required to solve the problem. The question involves understanding chemical equilibrium, the dissociation of $\mathrm{N}_2 \mathrm{O}_4$ into $\mathrm{NO}_2$, and the application of chemical thermodynamics principles, including the equilibrium constant and the van't Hoff equation. Additionally, the question asks for a prediction of the behavior of the degree of dissociation with an increase in temperature, which involves understanding the effect of temperature on chemical equilibrium. This level of understanding is typically expected at the undergraduate level.

The assistant correctly identifies the complexity of the question as [[0.55]], which corresponds to the undergraduate level. This rating is appropriate given the concepts and principles involved in solving the problem.



 97%|█████████▋| 11669/12018 [3:46:20<46:35,  8.01s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the complexity of the question. The response highlights that the question involves understanding chemical kinetics and applying the rate constant to determine the time for the pressure to reach a certain value. This requires setting up and solving a differential equation based on the rate law, which is typically covered in undergraduate-level chemical kinetics courses. The explanation is relevant, accurate, and helpful in understanding the complexity of the question.

Complexity: [[0.5]]



 97%|█████████▋| 11672/12018 [3:46:39<40:11,  6.97s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The problem involves understanding and applying principles of rocket propulsion, including the rocket equation, which requires knowledge of calculus and physics. This level of understanding is typically expected at the postgraduate level.

Complexity: [[0.7]]



 97%|█████████▋| 11676/12018 [3:47:07<42:45,  7.50s/it]

Over 1900 iterations we hit 0 API limits


 97%|█████████▋| 11679/12018 [3:47:36<50:46,  8.99s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation for their evaluation of the question's complexity. They accurately identify that the question requires an understanding of economics, specifically the concepts of supply and demand curves, loanable funds, and interest rates. The response also correctly notes that this topic is typically covered in undergraduate economics courses, which aligns with the given complexity scale. The assistant objectively assesses the question's complexity without providing an answer.

Complexity: [[0.55]]



 97%|█████████▋| 11690/12018 [3:48:45<33:00,  6.04s/it]

Could not extract rating from response:
The assistant's response effectively evaluates the complexity of the multiple-choice question by considering the economic principles involved and the distinction between normative and positive statements. The explanation is clear and objective, highlighting the required knowledge of economic terminology and the ability to differentiate between factual and value-based statements. The complexity rating of [[0.35]] is appropriate, placing the question at the high school level, which aligns with the level of economic understanding typically expected at that stage.

Complexity: [[0.35]]



 98%|█████████▊| 11723/12018 [3:52:49<36:49,  7.49s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's context, highlighting the need for basic astronomical knowledge to determine the correct answer. The question involves understanding the concepts of eclipses and the alignment of celestial bodies, which is typically covered in high school science curricula. The options include both relevant and irrelevant terms, requiring the respondent to distinguish between them based on their knowledge of astronomy.

Complexity: [[0.3]]



 98%|█████████▊| 11726/12018 [3:53:07<32:17,  6.63s/it]

Over 1950 iterations we hit 0 API limits


 98%|█████████▊| 11776/12018 [3:59:04<28:13,  7.00s/it]

Over 2000 iterations we hit 0 API limits


 98%|█████████▊| 11826/12018 [4:04:34<20:00,  6.25s/it]

Over 2050 iterations we hit 0 API limits


 99%|█████████▊| 11859/12018 [4:08:31<19:02,  7.18s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the complexity of the question. The response correctly identifies that the question deals with political economics and ideology, and that it requires critical thinking and analysis to understand the role of freedom of choice within the capitalist framework. This evaluation is objective and relevant to the context of the question.

However, the assistant's complexity rating of 0.55 is incorrect and does not align with the explanation provided. The question does not require undergraduate-level knowledge but rather a basic understanding of political ideologies and economic systems, which is typically covered in high school social studies or civics courses.

Given the topic and the level of critical thinking required, the complexity rating should be:

Complexity: [[0.35]]



 99%|█████████▉| 11876/12018 [4:10:34<18:18,  7.74s/it]

Over 2100 iterations we hit 0 API limits


 99%|█████████▉| 11925/12018 [4:16:29<11:25,  7.37s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question involves applying the Stefan-Boltzmann law, which relates the energy radiated by a blackbody to its absolute temperature. Understanding and applying this law typically requires knowledge of physics principles covered in undergraduate-level courses. The explanation provided by the assistant is clear and relevant to the context of the question.

Complexity: [[0.55]]



 99%|█████████▉| 11926/12018 [4:16:35<10:28,  6.83s/it]

Over 2150 iterations we hit 0 API limits


 99%|█████████▉| 11930/12018 [4:17:08<11:45,  8.02s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the context and the knowledge required to address the question. The question deals with the motion of a charged particle in a magnetic field, specifically within a solenoid, which is a topic typically covered in undergraduate-level physics courses. The response correctly identifies that understanding the Lorentz force and the geometry of the magnetic field inside a solenoid is crucial for answering the question. This aligns well with the complexity scale provided, placing it in the undergraduate category.

Complexity: [[0.55]]



100%|█████████▉| 11976/12018 [4:22:50<04:57,  7.07s/it]

Over 2200 iterations we hit 0 API limits


100%|█████████▉| 11997/12018 [4:25:26<02:03,  5.89s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the economic principles and mathematical calculations required to solve the problem. The response correctly identifies that the question involves understanding marginal product, total revenue, and total cost to determine profit maximization. This level of understanding is typically expected at the undergraduate level, where students are introduced to more complex economic concepts and applications.

Complexity: [[0.55]]



100%|██████████| 12018/12018 [4:28:11<00:00,  1.34s/it]

Processed dataset ../data/mmlu_pro_stem_w_numerical_maj_w_entropyphi4.tsv. Total entries: 12018. Invalid complexities: 5. Invalid ratings: 57





Unnamed: 0,src,answer,options,category,question,question_id,answer_index,total_tokens,meta_cluster,base_cluster,masj_complexity,masj_rating,entropy_ans_correct_phi3,entropy_value_phi3,masj_num_complexity,masj_num_rating
0,ori_mmlu-jurisprudence,C,['There is no distinction between the two form...,law,Which of the following criticisms of Llewellyn...,1286,2,81,Legal Interpretation,Legal Theory Interpretations,postgraduate,9,True,0.165084,0.70,10
1,ori_mmlu-international_law,E,"['Article 19', 'Article 11', 'Article 12', 'Ar...",law,Which of the following articles are not qualif...,1293,4,38,Legal Interpretation,Constitutional Law,undergraduate,9,False,0.958569,0.70,9
2,ori_mmlu-management,D,"['Work delegation', 'Workload balancing', 'Wor...",business,As what is ensuring that one individual does n...,83,3,49,Economics & Finance MCQs,Business & Marketing Queries,high_school,9,False,0.006345,0.35,9
3,stemez-Business,J,"['$308.25', '$142.75', '$199.99', '$225.85', '...",business,Margaret Denault recently rented a truck to dr...,94,9,118,Economics & Finance MCQs,Business Finance Questions,high_school,9,False,0.456675,0.35,9
4,stemez-Business,I,"['$60,000', '$43,200', '$1,794', '$25,000', '$...",business,The tax rate in the town of Centerville is 11(...,104,8,102,Economics & Finance MCQs,Business Finance Questions,middle_school,9,False,0.506583,0.35,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12013,ori_mmlu-high_school_macroeconomics,F,['Higher interest rates that result from borro...,economics,"The ""crowding-out"" effect refers to which of t...",7681,5,150,Economics & Finance MCQs,Economic Concepts & Policies,undergraduate,9,True,0.353089,0.55,9
12014,ori_mmlu-high_school_macroeconomics,A,['Lower reserve requirements; lower the discou...,economics,Which of the following lists contains only Fed...,7683,0,124,Economics & Finance MCQs,Economic Concepts & Policies,undergraduate,10,True,0.019280,0.55,9
12015,ori_mmlu-high_school_macroeconomics,I,['The productivity of labor in country X is 75...,economics,Output in country X is 30000 units and there a...,7684,8,206,Economics & Finance MCQs,Economic Concepts & Policies,undergraduate,9,False,0.240207,0.45,9
12016,ori_mmlu-high_school_macroeconomics,B,"['an increase in net exports', 'a decrease in ...",economics,A use of easy money (expansionary) policy by t...,7685,1,58,Economics & Finance MCQs,Economic Concepts & Policies,undergraduate,9,True,0.049227,0.55,10


In [11]:
# # ARC CH
# NOTE(review): every line of this cell is commented out, so nothing here runs.
# It is a disabled driver that would run `estimate_dataset` over
# ../data/arc_ch_validation.tsv. Either re-enable it deliberately or delete it;
# as written it only documents how the ARC run was set up.
# import os.path

# import pandas as pd

# ORIGINAL_DATASET = "../data/arc_ch_validation"
# original_filename = f"{ORIGINAL_DATASET}.tsv"
# out_filename = f"{ORIGINAL_DATASET}_w_maj_complexity.tsv"

# NOTE(review): if the output file already exists it is loaded instead of the
# original, presumably so an interrupted run can resume -- confirm against
# `estimate_dataset`. The two reads use different quoting options
# (QUOTE_NONE + backslash escape vs. pandas defaults).
# NOTE(review): `csv` is used below but is not imported in this cell; it must
# already be imported by an earlier cell for this to work on a fresh kernel.
# if os.path.isfile(out_filename):
#     df = pd.read_csv(
#         out_filename,
#         sep="\t",
#         header=0,
#         quoting=csv.QUOTE_NONE,
#         quotechar="",
#         escapechar="\\",
#     )
# else:
#     df = pd.read_csv(
#         original_filename,
#         sep="\t",
#         header=0,
#     )
# # df = df.head(10)


# NOTE(review): get_options_arc(row) turns the string in row["text"] into a
# list of option strings: newlines are removed, the first and last characters
# (the enclosing brackets) are dropped, the remainder is split on the literal
# sequence `' '`, and the leading quote of the first item / trailing quote of
# the last item are stripped. It then asserts that the number of options
# equals int(row["leng"]) and that no option is empty.
# NOTE(review): presumably row["text"] is the printed form of an array of
# single-quoted strings; an option rendered with double quotes (e.g. one that
# contains an apostrophe) would not be split correctly -- TODO confirm on data.
# NOTE(review): only AssertionError is caught (logged with row['id'], then
# re-raised). A ValueError from int(row["leng"]) or an error from a
# non-string row["text"] propagates without the diagnostic print.
# def get_options_arc(row):
#     try:
#         options_len = int(row["leng"])
#         options_str = row["text"]
#         options_str_without_newline = options_str.replace("\n", "")
#         options_str_without_brackets = options_str_without_newline[1:-1]
#         options_split = options_str_without_brackets.split("' '")
#         # Remove leading and trailing quotes from first and last options
#         options_split[0] = options_split[0][1:]
#         options_split[-1] = options_split[-1][:-1]
#         # print(options_split, options_len)
#         assert len(options_split) == options_len
#         for option in options_split:
#             assert len(option) > 0
#         return options_split
#     except AssertionError as e:
#         print(f"get_options_arc: {row['id']} -> AssertionError: {e}")
#         raise e


# NOTE(review): `estimate_dataset` is defined outside this cell; here it is
# given the loaded frame, the two row accessors and the output path.
# estimate_dataset(
#     df=df,
#     get_question_from_row=lambda row: row["question"],
#     get_options_from_row=get_options_arc,
#     out_filename=out_filename,
# )

In [None]:
# # SCIQ
# NOTE(review): every line of this cell is commented out and the cell has no
# execution count, so nothing here has been run in the saved notebook. It is a
# disabled driver for the three SciQ splits (train / test / validation).
# from random import shuffle


# NOTE(review): get_options_sciq(row) builds the option list from the
# "correct" column plus "incorrect1".."incorrect3" and shuffles it in place.
# NOTE(review): `shuffle` is unseeded in this cell, so the option order (and
# therefore the prompt sent for each row) differs between runs unless a seed
# is set elsewhere -- TODO confirm whether reproducibility matters here.
# NOTE(review): the list is always built from exactly 1 + 3 items, so the
# `len(all_options) == options_len` assertion cannot fail; only the
# non-empty check is effective. A non-string option (e.g. a missing value)
# would raise TypeError from len(), which the `except AssertionError` below
# does not catch.
# NOTE(review): the error message reads row['id']; verify the SciQ TSVs
# actually have an `id` column, otherwise the handler itself raises KeyError.
# def get_options_sciq(row):
#     try:
#         options_len = 4
#         correct_option = row["correct"]
#         other_options = [row["incorrect1"], row["incorrect2"], row["incorrect3"]]
#         all_options = [correct_option] + other_options
#         shuffle(all_options)

#         assert len(all_options) == options_len
#         for option in all_options:
#             assert len(option) > 0
#         return all_options
#     except AssertionError as e:
#         print(f"get_options_sciq: {row['id']} -> AssertionError: {e}")
#         raise e


# NOTE(review): these calls pass `in_filename=...`, whereas the ARC CH cell
# earlier in this notebook passes an already-loaded `df=...`. `estimate_dataset`
# is defined outside this cell; check that its current signature still accepts
# `in_filename` before re-enabling any of these calls.
# estimate_dataset(
#     in_filename="../data/sciq_train.tsv",
#     out_filename="../data/sciq_train_w_maj_complexity.tsv",
#     get_question_from_row=lambda row: row["question"],
#     get_options_from_row=get_options_sciq,
# )
# estimate_dataset(
#     in_filename="../data/sciq_test.tsv",
#     out_filename="../data/sciq_test_w_maj_complexity.tsv",
#     get_question_from_row=lambda row: row["question"],
#     get_options_from_row=get_options_sciq,
# )
# estimate_dataset(
#     in_filename="../data/sciq_validation.tsv",
#     out_filename="../data/sciq_validation_w_maj_complexity.tsv",
#     get_question_from_row=lambda row: row["question"],
#     get_options_from_row=get_options_sciq,
# )
