In [1]:
import os

from mistralai import Mistral

# Comma-separated list of Mistral API keys; one client is created per key.
api_keys = os.environ["MISTRAL_API_KEYS"]
model = "mistral-large-2411"

# Strip whitespace and drop empty entries (e.g. "key1, key2,") so that a sloppy
# list does not produce a client configured with a padded or blank key.
clients = [
    Mistral(
        api_key=api_key.strip(),
    )
    for api_key in api_keys.split(",")
    if api_key.strip()
]
if not clients:
    # Fail here with a clear message instead of later, when the client list is indexed.
    raise ValueError("MISTRAL_API_KEYS must contain at least one non-empty, comma-separated API key")

for client in clients:
    # Check API is alive: one tiny completion per key, so a bad key fails immediately.
    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "What is the best French cheese?",
            },
        ],
        max_tokens=10,
    )
    print(chat_response.model, chat_response.choices[0].message.content)

mistral-large-2411 Choosing the "best" French cheese can be
mistral-large-2411 Determining the "best" French cheese can
mistral-large-2411 Determining the "best" French cheese can


In [2]:
from time import sleep

from mistralai import SDKError
from openai import RateLimitError

# Pause (in seconds) inserted between consecutive requests. It shrinks as the
# number of clients grows: 1.2 for a single client, 0.5 for two, none for three or more.
if len(clients) >= 3:
    SLEEP_DURATION = 0
elif len(clients) == 2:
    SLEEP_DURATION = 0.5
else:
    SLEEP_DURATION = 1.2

print("Sleep duration:", SLEEP_DURATION)


def wait(duration=SLEEP_DURATION):
    """Sleep for ``duration`` seconds.

    The default is the value ``SLEEP_DURATION`` had when this function was
    defined; callers may pass an explicit duration instead.
    """
    sleep(duration)


# Running count of rate-limit responses seen by every decorated function.
api_limit_hits = 0


def repeat_if_hit_api_limit(f):
    """Decorator that keeps calling ``f`` until it returns.

    Retry policy:
      * ``RateLimitError`` or an ``SDKError`` with status code 429: count the
        hit in ``api_limit_hits`` (reporting every 10th), wait 1 second, retry.
      * ``SDKError`` with any other status code: re-raised to the caller.
      * any other ``Exception``: printed, then retried after a 60 second wait.

    There is no retry cap, so a persistent non-SDK error loops until interrupted.

    The wrapper preserves ``f``'s name and docstring via ``functools.wraps``.
    """
    # Local import so this definition does not depend on another cell's imports.
    import functools

    def register_rate_limit_hit():
        # Shared bookkeeping for both flavours of "too many requests".
        global api_limit_hits
        api_limit_hits += 1
        if (api_limit_hits % 10) == 0:
            print(f"API limit hit {api_limit_hits} times")
        wait(1)

    @functools.wraps(f)
    def wrapper(*args, **kw):
        while True:
            try:
                return f(*args, **kw)
            except RateLimitError:
                register_rate_limit_hit()
            except SDKError as e:
                if e.status_code != 429:
                    # Not a rate limit: surface it with the original traceback.
                    raise
                register_rate_limit_hit()
            except Exception as e:
                print("repeat_if_hit_api_limit -> unknown error", e)
                wait(60)

    return wrapper

Sleep duration: 0


In [3]:
# Number of requests issued so far; also drives the round-robin client choice.
request_id = 0


@repeat_if_hit_api_limit
def query_model(messages):
    """Send ``messages`` to the chat model through the next client in rotation.

    Clients are picked round-robin using the module-level ``request_id``
    counter, which is advanced before the request is sent, so a retried call
    moves on to the following client.

    Returns the response object produced by ``client.chat.complete``.
    """
    global request_id
    slot = request_id % len(clients)
    chosen_client = clients[slot]
    request_id = request_id + 1
    return chosen_client.chat.complete(model=model, messages=messages)

In [4]:
import os
import sys

# Absolute path of the current working directory (abspath("") resolves against the CWD).
dir2 = os.path.abspath("")
# Its parent directory; presumably the project root that holds the `utils` package -- TODO confirm.
dir1 = os.path.dirname(dir2)
# Make the parent directory importable, without adding a duplicate entry when the cell is re-run.
if dir1 not in sys.path:
    sys.path.append(dir1)

import importlib

import utils.prompt as prompt

# Required to purge the module cache and use the latest version after an update
importlib.reload(prompt)

<module 'utils.prompt' from '/Users/aigoncharov/dev/sktech/phi-4/utils/prompt.py'>

In [5]:
import ast
import csv
import os.path
import re

import pandas as pd
from tqdm import tqdm

import utils.prompt as prompt

# Complexity labels the model may answer with; anything else is counted as invalid.
difficulty = [
    "middle_school",
    "high_school",
    "undergraduate",
    "postgraduate",
    "phd",
]
# Accepted judge ratings: the integers 1 through 10.
ratings = [*range(1, 11)]

# Counters for responses from which no valid label / rating could be taken.
invalid_complexities = 0
invalid_ratings = 0


def model_as_judge(index, system_prompt, user_prompt, answer):
    """Ask the model to rate ``answer`` and store the rating for row ``index``.

    The judge is shown the instructions (``system_prompt``), the question
    (``user_prompt``) and the assistant's ``answer``, and is asked to reply
    with a rating formatted as ``[[n]]``.

    Side effects:
      * a rating that is a member of ``ratings`` is written to the
        module-level ``df`` at ``[index, "masj_rating"]``;
      * otherwise the module-level ``invalid_ratings`` counter is incremented
        (and the raw response is printed when no ``[[digits]]`` token is found).

    Returns ``None``.
    """
    global invalid_ratings

    chat_response = query_model(
        [
            {
                "role": "system",
                "content": 'Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user request displayed below. Your evaluation should consider factors such as the following all the settings in the system prompt, correspondences to the context of the user, the helpfulness, relevance and accuracy. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:"Rating: [[6]]".',
            },
            {
                "role": "user",
                "content": f"""
                [Instructions for Assistant]
                {system_prompt}
                [End of Instructions for Assistant]

                [Question]
                {user_prompt}
                [End of Question]

                [The Start of Assistant’s Answer]
                {answer}
                [The End of Assistant’s Answer]
                """,
            },
        ]
    )
    response = chat_response.choices[0].message.content

    # Test for "no match" explicitly instead of relying on a bare `except:`,
    # which would also report unrelated failures (e.g. a missing `df`) as
    # "could not extract rating".
    found = re.search(r"\[\[(\d+?)\]\]", response) if isinstance(response, str) else None
    if found is None:
        print(f"Could not extract rating from response:\n{response}\n")
        invalid_ratings += 1
        return

    rating_int = int(found.group(1))
    if rating_int in ratings:
        df.at[index, "masj_rating"] = rating_int
    else:
        # Well-formed token, but outside the accepted scale.
        invalid_ratings += 1


@repeat_if_hit_api_limit
def estimate_complextiy_with_model(index, system_prompt, user_prompt):
    """Ask the model for the complexity of a question and store it for row ``index``.

    The model receives ``system_prompt`` as the system message and
    ``user_prompt`` wrapped in ``[Question Start]`` / ``[Question End]``
    markers, and is expected to answer with a label formatted as ``[[label]]``.

    Side effects:
      * a label that is a member of ``difficulty`` is written to the
        module-level ``df`` at ``[index, "masj_complexity"]``;
      * otherwise the module-level ``invalid_complexities`` counter is
        incremented (and the raw response is printed when no ``[[...]]``
        token is found).

    Returns the raw text of the model's response.
    """
    global invalid_complexities

    chat_response = query_model(
        [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": f"""
                [Question Start]
                {user_prompt}
                [Question End]
                """,
            },
        ]
    )
    response = chat_response.choices[0].message.content

    # Test for "no match" explicitly instead of relying on a bare `except:`,
    # which would also report unrelated failures as "could not extract complexity".
    found = re.search(r"\[\[(.+?)\]\]", response) if isinstance(response, str) else None
    if found is None:
        print(f"Could not extract complexity from response:\n{response}\n")
        invalid_complexities += 1
    else:
        complexity = found.group(1)
        if complexity in difficulty:
            df.at[index, "masj_complexity"] = complexity
        else:
            # Bracketed token present, but not one of the accepted labels.
            invalid_complexities += 1

    return response


# A checkpoint is written whenever the row index is a multiple of this value.
DUMP_EVERY = 100


def estimate_dataset(df, get_question_from_row, get_options_from_row, out_filename):
    """Annotate every row of ``df`` with a model-estimated complexity and a judge rating.

    Args:
        df: frame to annotate; the ``masj_complexity`` and ``masj_rating``
            columns are created when missing.
        get_question_from_row: callable returning the question for a row.
        get_options_from_row: callable returning the answer options for a row.
        out_filename: path of the tab-separated file used for checkpoints and
            for the final result.

    Rows that already hold a valid complexity and a valid rating are skipped,
    so a previous partial output can be resumed. The frame is written to
    ``out_filename`` at every ``DUMP_EVERY``-th index and once more when the
    loop ends, including when it ends because of an exception or an
    interrupt, so completed rows are not lost.

    NOTE: ``estimate_complextiy_with_model`` and ``model_as_judge`` receive
    only the row index and write to the module-level ``df``; pass that same
    frame here.

    Returns:
        ``df`` itself.
    """

    def dump_checkpoint():
        df.to_csv(out_filename, sep="\t", quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\", index=False)

    if "masj_complexity" not in df.columns:
        df["masj_complexity"] = ""
    if "masj_rating" not in df.columns:
        df["masj_rating"] = 0

    # Identical for every row, so it is built once up front.
    complexity_system_prompt = f'You are an expert in the topic of the question. Please act as an impartial judge and evaluate the complexity of the multiple-choice question with options below. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must not answer the question. You must rate the question complexity by strictly following the scale: {", ".join(difficulty)}. You must return the complexity by strictly following this format: "[[complexity]]", for example: "Complexity: [[middle_school]]".'

    try:
        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            if df.at[index, "masj_complexity"] in difficulty and df.at[index, "masj_rating"] in ratings:
                continue

            complexity_user_prompt = prompt.get_user_prompt(get_question_from_row(row), get_options_from_row(row))

            response_complexity = estimate_complextiy_with_model(index, complexity_system_prompt, complexity_user_prompt)
            wait()

            model_as_judge(index, complexity_system_prompt, complexity_user_prompt, response_complexity)
            wait()

            if index % DUMP_EVERY == 0:
                dump_checkpoint()
                print(f"Over {index} iterations we hit {api_limit_hits} API limits")
    finally:
        # Always persist what has been computed, even if the loop was aborted.
        dump_checkpoint()

    print(
        f"Processed dataset {out_filename}. Total entries: {df.shape[0]}. Invalid complexities: {invalid_complexities}. Invalid ratings: {invalid_ratings}"
    )
    return df


# Input dataset (without extension) and the derived input / output file names.
ORIGINAL_DATASET = "../data/mmlu_pro_stem"
original_filename = ORIGINAL_DATASET + ".tsv"
out_filename = ORIGINAL_DATASET + "_w_maj_complexity.tsv"

# Start from the original dataset unless an output file already exists, in
# which case that file is loaded instead so existing annotations are reused.
if not os.path.isfile(out_filename):
    df = pd.read_csv(original_filename, sep="\t", header=0)
else:
    df = pd.read_csv(
        out_filename,
        sep="\t",
        header=0,
        quoting=csv.QUOTE_NONE,
        quotechar="",
        escapechar="\\",
    )
# Uncomment to try the pipeline on a handful of rows first:
# df = df.head(10)


# Last expression of the cell, so the annotated frame is displayed as the cell output.
estimate_dataset(
    df,
    lambda row: row["question"],
    lambda row: ast.literal_eval(row["options"]),
    out_filename,
)

 61%|██████▏   | 7393/12032 [00:00<00:00, 73919.67it/s]

Could not extract complexity from response:
To evaluate the complexity of this question, we need to consider the concepts it involves and the level of understanding required to answer it. The question asks about Hermitian operators, which are a concept from linear algebra and quantum mechanics. Specifically, it asks about the Hermiticity of differential operators, which is a topic covered in advanced undergraduate or postgraduate courses in mathematics, physics, or engineering.

Understanding Hermitian operators requires a solid foundation in linear algebra, calculus, and differential equations. Moreover, the question requires knowledge of the properties of specific differential operators and how to determine if they are Hermitian. This level of specialized knowledge is typically not covered in high school or middle school curricula.

Complexity: [postgraduate]



 84%|████████▍ | 10101/12032 [11:34<3:03:35,  5.70s/it]

Over 10100 iterations we hit 0 API limits


 85%|████████▍ | 10201/12032 [21:24<2:41:19,  5.29s/it]

Over 10200 iterations we hit 0 API limits


 85%|████████▍ | 10208/12032 [22:02<2:28:20,  4.88s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to answer the question. The question involves understanding periodic trends in electronegativity and bond strength, which are typically covered in undergraduate-level chemistry courses. The response accurately reflects the complexity of the question by noting that it requires applying fundamental chemical concepts to determine acid strength.

Complexity: [[undergraduate]]



 85%|████████▌ | 10280/12032 [28:48<2:47:17,  5.73s/it]

Could not extract rating from response:
The assistant's response provides a clear and objective evaluation of the complexity of the multiple-choice question. The response breaks down the steps required to solve the problem, highlighting the need for understanding chemical reactions, stoichiometry, and molar masses. This detailed explanation aligns well with the typical curriculum covered in high school chemistry, making the complexity rating appropriate.

Rating: 10



 85%|████████▌ | 10285/12032 [29:17<2:48:58,  5.80s/it]

Could not extract rating from response:
The assistant's response demonstrates a clear understanding of the accounting principles involved in the question, particularly the concept of commercial substance in asset exchanges. The explanation correctly identifies that the question aligns with topics covered in intermediate accounting courses, typically taken at the undergraduate level. The response also highlights the key considerations—such as book value, fair value, tax implications, and future cash flows—that are relevant to determining commercial substance. This contextual understanding is crucial for evaluating the complexity of the question.

Rating: [[undergraduate]]



 86%|████████▌ | 10301/12032 [30:42<2:50:40,  5.92s/it]

Over 10300 iterations we hit 0 API limits


 86%|████████▌ | 10336/12032 [33:52<2:53:30,  6.14s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question involves understanding specific legal concepts such as burglary, larceny, arson, and criminal negligence, as well as applying these concepts to a hypothetical scenario. This level of legal analysis typically requires knowledge gained at the undergraduate level or above, especially in courses related to law or criminal justice. The scenario also involves understanding the nuances of common law, which adds to the complexity.

Rating: [[undergraduate]]



 86%|████████▌ | 10344/12032 [34:29<2:06:06,  4.48s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise evaluation of the question's complexity. The explanation highlights the need for an understanding of feminist theory and the ability to discern nuances between different feminist viewpoints, which are typically covered in undergraduate-level studies. The response correctly identifies that the question requires familiarity with the concept of "difference feminism" and the ability to critically evaluate and differentiate between various statements.

The assistant correctly rates the complexity of the question as "[[undergraduate]]".



 86%|████████▌ | 10358/12032 [35:45<2:28:17,  5.32s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the complexity of the given multiple-choice question. The response correctly identifies that the question involves understanding chemical equilibrium, specifically the dissolution of a precipitate using a complex ion formation. It also mentions the need to set up an ICE table and use given equilibrium constants, which requires algebraic manipulation and understanding of molarity calculations. This explanation is relevant, accurate, and helpful in determining the complexity of the question.

Rating: 10



 86%|████████▋ | 10401/12032 [39:36<3:03:25,  6.75s/it]

Over 10400 iterations we hit 0 API limits


 87%|████████▋ | 10437/12032 [43:10<2:23:29,  5.40s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the physics concepts involved in the question, highlighting the need to understand rolling friction, power, and force calculations. The response also mentions the necessity of unit conversions, which adds to the complexity of the problem. This level of understanding is typically covered in high school physics courses, making the assessment appropriate.

Complexity: [[high_school]]



 87%|████████▋ | 10457/12032 [45:10<2:11:36,  5.01s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the concepts required to solve the problem. The question involves understanding angular acceleration and converting units from revolutions per minute (rpm) to radians per second, which are typically covered in high school physics courses. The response accurately identifies the complexity of the question by considering the prerequisite knowledge and skills needed to solve it.

Complexity: [[high_school]]



 87%|████████▋ | 10501/12032 [49:24<1:55:39,  4.53s/it]

Over 10500 iterations we hit 0 API limits


 88%|████████▊ | 10545/12032 [53:49<2:25:43,  5.88s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The problem involves calculating the equilibrium constant for a chemical reaction, which is a fundamental concept in chemical equilibrium. This topic is typically introduced in high school chemistry courses, where students learn about the law of mass action and how to use it to determine equilibrium constants. The calculation requires basic algebraic manipulation and an understanding of chemical concentrations, which are within the skill set of a high school student who has studied chemistry.

Complexity: [[high_school]]



 88%|████████▊ | 10601/12032 [59:20<2:12:59,  5.58s/it]

Over 10600 iterations we hit 0 API limits


 89%|████████▊ | 10650/12032 [1:04:23<1:53:00,  4.91s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The problem involves stoichiometry, which is typically introduced in high school chemistry courses. Students need to understand how to balance chemical equations and use molar ratios to determine the volumes of reactants and products. This level of understanding is generally expected at the high school level.

Complexity: [[high_school]]



 89%|████████▊ | 10677/12032 [1:06:46<2:12:05,  5.85s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge and skills required to solve the problem. The question involves understanding the concept of isotopes and calculating the average atomic mass based on the given data. This requires basic mathematical operations such as multiplication and addition, as well as understanding percentages and converting them to decimal form. These are typically skills taught at the high school level.

The assistant accurately identifies the complexity of the question by considering the necessary background knowledge and mathematical skills. The explanation is objective and aligns well with the educational standards expected at the high school level.

Complexity: [[high_school]]



 89%|████████▉ | 10701/12032 [1:09:00<1:55:24,  5.20s/it]

Over 10700 iterations we hit 0 API limits


 90%|████████▉ | 10801/12032 [1:18:30<1:58:17,  5.77s/it]

Over 10800 iterations we hit 0 API limits


 90%|█████████ | 10864/12032 [1:24:55<2:37:46,  8.10s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of why the question requires a certain level of complexity. The response highlights that the question involves understanding specific philosophical concepts and the views of particular philosophers, namely Anscombe and Sidgwick. This indicates that the question is not merely about general knowledge but requires a deeper understanding of ethical theories and their interpretations, which is typically covered in undergraduate-level philosophy courses.

The assistant correctly identifies that the question is complex because it involves nuanced philosophical ideas and the ability to differentiate between the views of different philosophers. This level of understanding is usually developed through more advanced study, making the question suitable for an undergraduate-level complexity rating.

Rating: [[undergraduate]]



 90%|█████████ | 10867/12032 [1:25:10<1:57:22,  6.05s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question involves several steps of chemical calculations, including determining the molarity of the sulfuric acid solution, using the stoichiometry of the neutralization reaction, and applying the principles of titration to find the concentration of the sodium hydroxide solution. These concepts are typically covered in undergraduate-level chemistry courses, making the complexity rating appropriate.

Rating: [[undergraduate]]



 91%|█████████ | 10901/12032 [1:28:40<1:37:23,  5.17s/it]

Over 10900 iterations we hit 0 API limits


 91%|█████████▏| 11001/12032 [1:38:10<1:23:26,  4.86s/it]

Over 11000 iterations we hit 0 API limits


 92%|█████████▏| 11101/12032 [1:48:09<1:21:33,  5.26s/it]

Over 11100 iterations we hit 0 API limits


 92%|█████████▏| 11128/12032 [1:50:46<1:20:38,  5.35s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question involves principles of classical mechanics, specifically the conservation of momentum, which is typically introduced in high school physics courses. The problem requires setting up equations and solving for unknowns, which are skills developed in high school algebra and physics classes. The context and the steps required to solve the problem align well with the high school curriculum.

Complexity: [[high_school]]



 93%|█████████▎| 11138/12032 [1:51:44<1:19:19,  5.32s/it]

Could not extract complexity from response:
This question requires an understanding of basic physics principles, specifically kinematics and the properties of sound. To solve the problem, one must know the equations of motion under gravity and the relationship between speed, distance, and time for sound. The student needs to calculate the time it takes for the balloon to hit the ground and then use the speed of sound to determine the height of the building. This involves setting up and solving algebraic equations, which requires a solid foundation in high school-level physics and mathematics.

Complexity: undergraduate



 93%|█████████▎| 11201/12032 [1:57:38<1:15:50,  5.48s/it]

Over 11200 iterations we hit 0 API limits


 94%|█████████▍| 11295/12032 [2:07:45<1:27:27,  7.12s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The problem involves understanding heat transfer and the principle of conservation of energy, which are typically covered in high school physics. The question requires setting up equations based on given temperatures and solving for an unknown final temperature, which is a standard high school-level problem-solving skill.

Complexity: [[high_school]]



 94%|█████████▍| 11301/12032 [2:08:16<1:05:55,  5.41s/it]

Over 11300 iterations we hit 0 API limits
Could not extract complexity from response:
To evaluate the complexity of this question, we need to consider the specialized knowledge required to understand and answer it correctly. The question deals with mycotoxins, which are secondary metabolites produced by microfungi that are capable of causing disease and death in humans and other animals. Understanding the formation and contamination pathways of mycotoxins requires knowledge of microbiology, food science, and potentially agricultural sciences.

The options provided in the question require the examinee to differentiate between various biological agents (bacteria, fungi, algae, viruses, yeasts) and environmental conditions (moist, high-light, wet, freezing temperatures, high-pressure, dry, airtight). This level of detail and specificity suggests that the question is not suited for middle school or high school students, who typically have a more general understanding of biology and food sc

 95%|█████████▍| 11375/12032 [2:18:12<1:31:37,  8.37s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to answer the question. The question indeed involves understanding human physiology, specifically the process of protein oxidation and its measurement, which is typically covered in undergraduate-level courses in fields such as biology, physiology, or nutrition. The response is helpful, relevant, and accurate in the context of evaluating the question's complexity.

Rating the complexity of the question:
Complexity: [[undergraduate]]



 95%|█████████▍| 11401/12032 [2:20:38<51:17,  4.88s/it]  

Over 11400 iterations we hit 0 API limits


 95%|█████████▌| 11441/12032 [2:24:26<48:39,  4.94s/it]  

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question by considering the medical terminology and statistical knowledge required to understand and answer it. The question involves understanding bone density conditions (osteopenia and osteoporosis), their diagnostic criteria, and prevalence data among specific age groups in North America and Western Europe. This level of knowledge is typically covered in undergraduate studies, particularly in health sciences or related fields.

Complexity: [[undergraduate]]



 96%|█████████▌| 11501/12032 [2:30:32<54:36,  6.17s/it]  

Over 11500 iterations we hit 0 API limits


 96%|█████████▋| 11600/12032 [2:39:29<31:29,  4.37s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question requires an understanding of optical principles, specifically the use of the mirror formula and concepts related to concave mirrors, such as focal length and radius of curvature. Solving the problem involves applying these principles and performing mathematical calculations, which are typically covered in undergraduate-level physics courses. The response is clear, relevant, and accurately reflects the educational level at which this topic is usually taught.

Complexity: [[undergraduate]]



 96%|█████████▋| 11601/12032 [2:39:37<39:00,  5.43s/it]

Over 11600 iterations we hit 0 API limits


 97%|█████████▋| 11701/12032 [2:48:51<24:31,  4.45s/it]

Over 11700 iterations we hit 0 API limits


 98%|█████████▊| 11801/12032 [2:58:31<19:32,  5.08s/it]

Over 11800 iterations we hit 0 API limits


 98%|█████████▊| 11806/12032 [2:58:58<18:44,  4.98s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to solve the problem. The question indeed involves understanding atomic physics, specifically the Balmer series and electron transitions in the hydrogen spectrum. This topic is typically covered in undergraduate-level physics courses, making the complexity rating appropriate.

Complexity: [[undergraduate]]



 98%|█████████▊| 11832/12032 [3:01:36<20:05,  6.03s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question involves understanding the principles of optical physics, specifically the properties of concave mirrors and the mirror formula. This topic is typically covered in undergraduate physics courses, as it requires a solid foundation in optics and the ability to apply theoretical knowledge to practical problems. The question is not merely about recalling facts but about applying concepts to calculate the displacement of an image, which aligns with the complexity level of undergraduate studies.

Rating the complexity: [[undergraduate]]



 99%|█████████▉| 11901/12032 [3:07:55<11:43,  5.37s/it]

Over 11900 iterations we hit 0 API limits


100%|█████████▉| 12001/12032 [3:16:40<02:58,  5.75s/it]

Over 12000 iterations we hit 0 API limits


100%|██████████| 12032/12032 [3:19:34<00:00,  1.00it/s]

Processed dataset ../data/mmlu_pro_stem_w_maj_complexity.tsv. Total entries: 12032. Invalid complexities: 12. Invalid ratings: 20





Unnamed: 0,src,answer,options,category,question,cot_content,question_id,answer_index,total_tokens,meta_cluster,base_cluster,masj_complexity,masj_rating
0,ori_mmlu-jurisprudence,C,['There is no distinction between the two form...,law,Which of the following criticisms of Llewellyn...,,1286,2,81,Legal Interpretation,Legal Theory Interpretations,postgraduate,9
1,ori_mmlu-international_law,E,"['Article 19', 'Article 11', 'Article 12', 'Ar...",law,Which of the following articles are not qualif...,,1293,4,38,Legal Interpretation,Constitutional Law,undergraduate,9
2,ori_mmlu-management,D,"['Work delegation', 'Workload balancing', 'Wor...",business,As what is ensuring that one individual does n...,,83,3,49,Economics & Finance MCQs,Business & Marketing Queries,high_school,9
3,stemez-Business,J,"['$308.25', '$142.75', '$199.99', '$225.85', '...",business,Margaret Denault recently rented a truck to dr...,,94,9,118,Economics & Finance MCQs,Business Finance Questions,high_school,9
4,stemez-Business,I,"['$60,000', '$43,200', '$1,794', '$25,000', '$...",business,The tax rate in the town of Centerville is 11(...,,104,8,102,Economics & Finance MCQs,Business Finance Questions,middle_school,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12027,ori_mmlu-high_school_macroeconomics,F,['Higher interest rates that result from borro...,economics,"The ""crowding-out"" effect refers to which of t...",,7681,5,150,Economics & Finance MCQs,Economic Concepts & Policies,undergraduate,9
12028,ori_mmlu-high_school_macroeconomics,A,['Lower reserve requirements; lower the discou...,economics,Which of the following lists contains only Fed...,,7683,0,124,Economics & Finance MCQs,Economic Concepts & Policies,undergraduate,10
12029,ori_mmlu-high_school_macroeconomics,I,['The productivity of labor in country X is 75...,economics,Output in country X is 30000 units and there a...,,7684,8,206,Economics & Finance MCQs,Economic Concepts & Policies,undergraduate,9
12030,ori_mmlu-high_school_macroeconomics,B,"['an increase in net exports', 'a decrease in ...",economics,A use of easy money (expansionary) policy by t...,,7685,1,58,Economics & Finance MCQs,Economic Concepts & Policies,undergraduate,9
