In [None]:
import os

from mistralai import Mistral

# Comma-separated list of Mistral API keys, read from the environment so that
# no credential is ever stored in the notebook itself.
api_keys = os.environ["MISTRAL_API_KEYS"]
model = "mistral-large-2411"

# One client per key. Whitespace around each key is stripped and empty entries
# are skipped, so values such as "key1, key2" or "key1,key2," do not produce
# clients holding a malformed or empty key.
clients = [
    Mistral(
        api_key=api_key.strip(),
    )
    for api_key in api_keys.split(",")
    if api_key.strip()
]
if not clients:
    # Fail here with a clear message rather than later with an opaque
    # authentication error or a division by zero when picking a client.
    raise ValueError("MISTRAL_API_KEYS must contain at least one non-empty, comma-separated API key")

for client in clients:
    # Check API is alive: send one tiny request per key and print the reply.
    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "What is the best French cheese?",
            },
        ],
        max_tokens=10,
    )
    print(chat_response.model, chat_response.choices[0].message.content)

mistral-large-2411 Determining the "best" French cheese can
mistral-large-2411 Choosing the "best" French cheese can be
mistral-large-2411 Choosing the "best" French cheese can be


In [None]:
from time import sleep

from mistralai import SDKError
from openai import RateLimitError

# Pause inserted between consecutive requests. The more clients are available,
# the shorter the pause (presumably because requests are spread over several
# API keys -- confirm against the provider's rate limits).
if len(clients) >= 3:
    SLEEP_DURATION = 0
elif len(clients) == 2:
    SLEEP_DURATION = 0.5
else:
    SLEEP_DURATION = 1.2

print("Sleep duration:", SLEEP_DURATION)


def wait(duration=SLEEP_DURATION):
    """Suspend execution for `duration` seconds.

    The default is the value `SLEEP_DURATION` had when this function was
    defined; later changes to the constant do not affect it.
    """
    sleep(duration)


# Running total of rate-limit responses seen by every decorated function.
api_limit_hits = 0


def _record_api_limit_hit():
    """Count one rate-limit hit, report every tenth one, then back off for 1 second."""
    global api_limit_hits

    api_limit_hits += 1
    if (api_limit_hits % 10) == 0:
        print(f"API limit hit {api_limit_hits} times")
    wait(1)


def repeat_if_hit_api_limit(f):
    """Decorator that keeps calling `f` until it returns without being rate limited.

    Behaviour of the returned wrapper:

    * `RateLimitError`, or an `SDKError` whose `status_code` is 429: the hit is
      counted in `api_limit_hits` and the call is retried after 1 second.
    * `SDKError` with any other status code: re-raised to the caller.
    * Any other `Exception`: printed, then retried after 60 seconds. There is
      no retry cap, so a persistent error is retried indefinitely.

    `KeyboardInterrupt` and `SystemExit` are not `Exception` subclasses and
    therefore always propagate.

    Args:
        f: The callable to wrap.

    Returns:
        A wrapper with the same call signature, name and docstring as `f`.
    """
    # Local import keeps this cell self-contained.
    import functools

    @functools.wraps(f)  # preserve f.__name__ / f.__doc__ on the wrapper
    def wrapper(*args, **kw):
        while True:
            try:
                return f(*args, **kw)
            except RateLimitError:
                _record_api_limit_hit()
            except SDKError as e:
                if e.status_code != 429:
                    # Not a rate limit: let the caller see the original error.
                    raise
                _record_api_limit_hit()
            except Exception as e:
                print("repeat_if_hit_api_limit -> unknown error", e)
                wait(60)

    return wrapper

Sleep duration: 0


In [3]:
# Number of requests issued so far; used to pick the next client in turn.
request_id = 0


@repeat_if_hit_api_limit
def query_model(messages):
    """Send `messages` to the chat model and return the completion response.

    Clients are used in rotation: each call (including a retry triggered by
    the decorator) advances `request_id` and therefore moves on to the next
    entry of `clients`.
    """
    global request_id

    slot = request_id % len(clients)
    # Advance the counter before the request so that a failed call is retried
    # with the following client.
    request_id += 1
    return clients[slot].chat.complete(model=model, messages=messages)

In [4]:
import os
import sys

# Put the parent of the current working directory on the import path so that
# the local `utils` package can be imported from this notebook.
dir2 = os.path.abspath(os.curdir)
dir1 = os.path.split(dir2)[0]
if sys.path.count(dir1) == 0:
    sys.path.append(dir1)

import importlib

import utils.prompt as prompt

# Required to purge the module cache and use the latest version after an update
importlib.reload(prompt)

<module 'utils.prompt' from '/Users/aigoncharov/dev/sktech/phi-4/utils/prompt.py'>

In [None]:
import ast
import csv
import os.path
import re

import pandas as pd
from tqdm import tqdm

import utils.prompt as prompt

# Complexity labels the model is allowed to answer with.
difficulty = [
    "middle_school",
    "high_school",
    "undergraduate",
    "postgraduate",
    "phd",
]
# Ratings the judge is allowed to give: the integers 1 through 10.
ratings = [*range(1, 11)]

# Counters of model answers that could not be turned into a valid label / rating.
invalid_complexities = invalid_ratings = 0


def model_as_judge(index, system_prompt, user_prompt, answer):
    """Ask the model to rate `answer` and store the rating for row `index`.

    The judge is shown the instructions (`system_prompt`), the question
    (`user_prompt`) and the assistant's `answer`, and is asked to finish with a
    rating formatted as ``[[N]]``. The first such bracketed number is parsed.

    Side effects:
        * On a rating contained in `ratings`, writes it to the module-level
          `df` at ``df.at[index, "masj_rating"]``.
        * Otherwise increments the module-level `invalid_ratings`; when no
          ``[[N]]`` marker is found at all, the raw response is printed too.

    Args:
        index: Row label in the module-level `df` to update.
        system_prompt: Instructions that were given to the assistant.
        user_prompt: The question that was given to the assistant.
        answer: The assistant's response to be judged.

    Returns:
        None.
    """
    global invalid_ratings

    chat_response = query_model(
        [
            {
                "role": "system",
                "content": 'Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user request displayed below. Your evaluation should consider factors such as the following all the settings in the system prompt, correspondences to the context of the user, the helpfulness, relevance and accuracy. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:"Rating: [[6]]".',
            },
            {
                "role": "user",
                "content": f"""
                [Instructions for Assistant]
                {system_prompt}
                [End of Instructions for Assistant]

                [Question]
                {user_prompt}
                [End of Question]

                [The Start of Assistant’s Answer]
                {answer}
                [The End of Assistant’s Answer]
                """,
            },
        ]
    )
    response = chat_response.choices[0].message.content

    # Explicit "no match" handling instead of a bare `except:`. Only a missing
    # or non-text response counts as unparsable; KeyboardInterrupt and genuine
    # errors (e.g. a failing DataFrame write) are no longer swallowed.
    match = re.search(r"\[\[(\d+?)\]\]", response) if isinstance(response, str) else None
    if match is None:
        print(f"Could not extract rating from response:\n{response}\n")
        invalid_ratings += 1
        return

    rating_int = int(match.group(1))
    if rating_int in ratings:
        df.at[index, "masj_rating"] = rating_int
    else:
        # A number was found but it lies outside the allowed scale.
        invalid_ratings += 1


@repeat_if_hit_api_limit
def estimate_complextiy_with_model(index, system_prompt, user_prompt):
    """Ask the model for the complexity of a question and store it for row `index`.

    The model receives `system_prompt` as the system message and `user_prompt`
    wrapped in ``[Question Start]`` / ``[Question End]`` markers. The text
    inside the first ``[[...]]`` marker of the reply is taken as the label.

    Side effects:
        * On a label contained in `difficulty`, writes it to the module-level
          `df` at ``df.at[index, "masj_complexity"]``.
        * Otherwise increments the module-level `invalid_complexities`; when no
          ``[[...]]`` marker is found at all, the raw response is printed too.

    Note:
        `query_model` already retries on its own; the decorator on this
        function additionally retries the whole function if anything else in
        its body raises an `Exception`.

    Args:
        index: Row label in the module-level `df` to update.
        system_prompt: Instructions describing the rating task.
        user_prompt: The question (with options) to be rated.

    Returns:
        The raw text of the model's reply.
    """
    global invalid_complexities

    chat_response = query_model(
        [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": f"""
                [Question Start]
                {user_prompt}
                [Question End]
                """,
            },
        ]
    )
    response = chat_response.choices[0].message.content

    # Explicit "no match" handling instead of a bare `except:`. Only a missing
    # or non-text response counts as unparsable; KeyboardInterrupt and genuine
    # errors are no longer silently converted into "invalid complexity".
    match = re.search(r"\[\[(.+?)\]\]", response) if isinstance(response, str) else None
    if match is None:
        print(f"Could not extract complexity from response:\n{response}\n")
        invalid_complexities += 1
    elif match.group(1) in difficulty:
        df.at[index, "masj_complexity"] = match.group(1)
    else:
        # A marker was found but its content is not one of the allowed labels.
        invalid_complexities += 1

    return response


# A checkpoint is written whenever the current row label is a multiple of this.
DUMP_EVERY = 100


def estimate_dataset(df, get_question_from_row, get_options_from_row, out_filename):
    """Rate the complexity of every question in `df` and judge each rating.

    For every row that does not yet hold both a valid `masj_complexity` and a
    valid `masj_rating`, the model is first asked for a complexity label and
    then asked (as a judge) to rate that answer. Rows that are already complete
    are skipped, which makes the function resumable from a previous checkpoint.

    The frame is written to `out_filename` (tab-separated, no quoting)
    periodically, once more at the end, and -- via ``finally`` -- also when the
    loop is aborted by an exception or a keyboard interrupt, so that progress
    made since the last periodic checkpoint is not lost.

    NOTE: the helpers `estimate_complextiy_with_model` and `model_as_judge`
    write their results into the module-level `df`, so `df` passed here must be
    that same object.

    Args:
        df: DataFrame to process; the `masj_complexity` and `masj_rating`
            columns are added in place when missing.
        get_question_from_row: Callable taking a row and returning the value
            passed as first argument to `prompt.get_user_prompt`.
        get_options_from_row: Callable taking a row and returning the value
            passed as second argument to `prompt.get_user_prompt`.
        out_filename: Path of the TSV checkpoint / result file.

    Returns:
        The same `df`, updated.
    """

    def _dump():
        # Single definition of the on-disk format used for every checkpoint.
        df.to_csv(out_filename, sep="\t", quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\", index=False)

    if "masj_complexity" not in df.columns:
        df["masj_complexity"] = ""
    if "masj_rating" not in df.columns:
        df["masj_rating"] = 0

    # The system prompt does not depend on the row, so build it once.
    complexity_system_prompt = f'You are an expert in the topic of the question. Please act as an impartial judge and evaluate the complexity of the multiple-choice question with options below. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must not answer the question. You must rate the question complexity by strictly following the scale: {", ".join(difficulty)}. You must return the complexity by strictly following this format: "[[complexity]]", for example: "Complexity: [[middle_school]]".'

    try:
        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            # Already fully processed in an earlier run: nothing to do.
            if df.at[index, "masj_complexity"] in difficulty and df.at[index, "masj_rating"] in ratings:
                continue

            complexity_user_prompt = prompt.get_user_prompt(get_question_from_row(row), get_options_from_row(row))

            response_complexity = estimate_complextiy_with_model(index, complexity_system_prompt, complexity_user_prompt)
            wait()

            model_as_judge(index, complexity_system_prompt, complexity_user_prompt, response_complexity)
            wait()

            if index % DUMP_EVERY == 0:
                _dump()
                print(f"Over {index} iterations we hit {api_limit_hits} API limits")
    finally:
        # Runs on normal completion as well as on interruption / failure.
        _dump()

    print(
        f"Processed dataset {out_filename}. Total entries: {df.shape[0]}. Invalid complexities: {invalid_complexities}. Invalid ratings: {invalid_ratings}"
    )
    return df


# Input dataset (without extension) and the files derived from it.
ORIGINAL_DATASET = "../data/mmlu_pro_stem"
original_filename = f"{ORIGINAL_DATASET}.tsv"
out_filename = f"{ORIGINAL_DATASET}_w_maj_complexity.tsv"

# Resume from a previously written result file when there is one; otherwise
# start from the original dataset. The result file is read back with the same
# quoting / escaping options that are used when it is written.
df = (
    pd.read_csv(out_filename, sep="\t", header=0, quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\")
    if os.path.isfile(out_filename)
    else pd.read_csv(original_filename, sep="\t", header=0)
)
# For a quick trial run on a few rows only:
# df = df.head(10)


estimate_dataset(
    df=df,
    # The question text is used as is.
    get_question_from_row=lambda record: record["question"],
    # The options column holds a Python literal stored as text; parse it.
    get_options_from_row=lambda record: ast.literal_eval(record["options"]),
    out_filename=out_filename,
)

 67%|██████▋   | 8093/12032 [10:19<6:32:46,  5.98s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the concept of divided government and the political dynamics involved. The response highlights that answering the question requires an understanding of political science concepts, which is accurate and relevant to the context of the question. The response is helpful in setting the stage for evaluating the complexity of the question.

The assistant correctly identifies that the question deals with political concepts that are typically covered in high school civics or government classes, making it accessible to students at that level while still requiring some foundational knowledge.

Complexity: [[high_school]]



 67%|██████▋   | 8101/12032 [11:06<6:04:21,  5.56s/it]

Over 8100 iterations we hit 0 API limits


 68%|██████▊   | 8132/12032 [13:48<5:34:29,  5.15s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's complexity. It accurately identifies that the question requires diagnostic reasoning and knowledge of medical management principles, which are typically introduced in high school biology or health sciences and further developed in undergraduate medical courses. The response objectively evaluates the question's complexity based on the prerequisite knowledge needed to answer it.

However, the assistant's complexity rating of "[[high_school]]" seems to be an underestimation. The question describes a detailed clinical scenario with specific medical terminology and requires knowledge of differential diagnoses and medical interventions, which are typically covered in undergraduate medical education rather than high school.

Given the medical nature of the question and the specific knowledge required to answer it, the complexity rating should be higher.

Rating: [[underg

 68%|██████▊   | 8146/12032 [15:05<6:40:44,  6.19s/it]

Could not extract rating from response:
The response provided by the AI assistant is thorough and well-reasoned. It breaks down the steps required to solve the problem, highlighting the need for a solid understanding of physics and engineering mechanics. The explanation is clear and corresponds well to the context of the user's request. It is helpful in outlining the knowledge and skills necessary to address the question, and it is relevant to the topic at hand. The assessment of the question's complexity as being at the undergraduate level is accurate, given the advanced concepts and calculations involved.

Rating: 9



 68%|██████▊   | 8201/12032 [20:09<9:55:18,  9.32s/it]

Over 8200 iterations we hit 0 API limits


 68%|██████▊   | 8234/12032 [23:16<5:12:14,  4.93s/it] 

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the statistical concepts involved in the question, highlighting the interrelationships between Type I and Type II errors, power, and sample size. The response accurately contextualizes the question within the framework of a clinical study and correctly identifies the level of statistical knowledge required to answer the question. The assessment that the question is likely to be answered correctly by a university student who has taken a course in statistics, but may be challenging for a high school student, is reasonable and well-justified.

Rating: 9



 69%|██████▉   | 8301/12032 [29:14<4:50:57,  4.68s/it]

Over 8300 iterations we hit 0 API limits


 69%|██████▉   | 8304/12032 [29:30<5:12:59,  5.04s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the context and requirements of the question. It accurately identifies that the question pertains to professional ethics in psychology and the need to understand APA guidelines. The response is helpful and relevant as it outlines the necessary knowledge base (ethical guidelines and professional standards) to address the question without giving away the answer. This level of understanding is typically covered in undergraduate psychology programs, particularly in courses related to ethics and professional practice.

Complexity: [[undergraduate]]



 69%|██████▉   | 8319/12032 [30:45<4:24:27,  4.27s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's requirements and the level of understanding needed to answer it. The question involves foundational marketing concepts, which are typically introduced in high school business or economics courses. The assistant accurately assesses that the question requires knowledge of basic marketing terms and their applications, making it suitable for a high school student who has been introduced to these concepts.

Complexity: [[high_school]]



 70%|██████▉   | 8401/12032 [37:51<4:51:16,  4.81s/it]

Over 8400 iterations we hit 0 API limits


 70%|███████   | 8471/12032 [44:02<5:13:04,  5.28s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's complexity. It accurately identifies that the question requires an understanding of economic concepts such as marginal revenue, marginal cost, average product, and total revenue. The response also notes that the question assumes familiarity with these concepts and the ability to analyze them critically, which is typically expected at the undergraduate level in economics. The response is helpful, relevant, and accurate in its assessment.

Rating: 10

Complexity: [[undergraduate]]



 71%|███████   | 8501/12032 [46:46<4:28:56,  4.57s/it]

Over 8500 iterations we hit 0 API limits


 71%|███████▏  | 8601/12032 [55:33<5:34:43,  5.85s/it]

Over 8600 iterations we hit 0 API limits


 72%|███████▏  | 8653/12032 [59:52<4:21:31,  4.64s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to answer the question. The response correctly identifies that the question deals with economic markets and the characteristics of perfect competition, which is a topic typically covered in high school economics courses. The assistant objectively assesses the complexity of the question without providing the answer, adhering to the instructions given.

Complexity: [[high_school]]



 72%|███████▏  | 8668/12032 [1:01:07<4:11:30,  4.49s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the principles required to understand and answer the question. The question involves understanding the factors that influence demand curves, which is a fundamental concept in economics typically introduced in high school economics courses. The response accurately identifies that the question requires knowledge of demand curves, the differences between shifts in demand and changes in quantity demanded, and the impacts of various factors on consumer demand. This level of understanding is generally expected of high school students who have studied economics.

Complexity: [[high_school]]



 72%|███████▏  | 8680/12032 [1:02:15<5:34:57,  6.00s/it]

Could not extract rating from response:
The response provided by the AI assistant is thorough and well-reasoned. It breaks down the evaluation into clear factors: topic familiarity, vocabulary, cognitive demand, and the number of options. Each factor is explained in a way that objectively supports the conclusion that the question is suited to an undergraduate level. The response is helpful, relevant, and accurate, providing a clear rationale for the complexity rating.

[[rating]]: 9



 72%|███████▏  | 8695/12032 [1:03:34<4:22:12,  4.71s/it]

Could not extract rating from response:
The assistant's response provides a clear and objective evaluation of the question's complexity. The explanation highlights that the question requires an understanding of economic models and their limitations, which is not immediately obvious but does not demand highly specialized knowledge. The options provided are straightforward, making the question accessible to those with a high school level of education. The assistant correctly identifies that the question goes beyond middle school knowledge but does not reach the depth typically associated with undergraduate or higher levels of study.

Complexity: [[high_school]]



 72%|███████▏  | 8701/12032 [1:04:10<5:24:52,  5.85s/it]

Over 8700 iterations we hit 0 API limits


 73%|███████▎  | 8759/12032 [1:08:42<4:12:57,  4.64s/it]

Could not extract rating from response:
The assistant's explanation is clear and concise, accurately describing the economic concepts and mathematical calculations required to solve the problem. The question indeed involves understanding real Gross National Product (GNP) and using price indices to adjust for inflation, which are typically covered in undergraduate economics courses. The assistant objectively assessed the complexity of the question without providing the answer, adhering to the instructions given.

Complexity: [[undergraduate]]



 73%|███████▎  | 8801/12032 [1:12:30<4:24:21,  4.91s/it]

Over 8800 iterations we hit 0 API limits


 74%|███████▍  | 8901/12032 [1:21:20<4:51:05,  5.58s/it]

Over 8900 iterations we hit 0 API limits


 74%|███████▍  | 8925/12032 [1:23:46<5:21:30,  6.21s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's requirements and the economic concepts involved. The response highlights that answering the question necessitates an understanding of various market structures and pricing strategies, which are typically covered in undergraduate-level economics courses. The explanation is relevant and accurate, effectively setting the context for evaluating the question's complexity.

Complexity: [[undergraduate]]



 74%|███████▍  | 8937/12032 [1:25:03<4:47:17,  5.57s/it]

Could not extract rating from response:
The explanation provided by the assistant is clear and objective. It accurately assesses the question's complexity by considering the biological knowledge and logical reasoning required to answer it. The assistant correctly identifies that the question does not demand specialized knowledge but rather a basic understanding of biology, which is typically covered in high school curricula. The mention of the wide range of options adding some complexity is also a valid point.

Rating: 9/10

Complexity: [[high_school]]



 75%|███████▍  | 8984/12032 [1:29:26<4:37:27,  5.46s/it]

Could not extract complexity from response:
This question requires an understanding of economic systems and the role of prices within them. It involves comparing and contrasting two different economic models, understanding how prices are determined in each, and evaluating the consequences of these differences. This type of analysis is typically covered in high school economics courses or introductory undergraduate economics courses.
The options provided vary in their accuracy and relevance, requiring a solid foundational knowledge of economics to evaluate correctly.



 75%|███████▍  | 9001/12032 [1:31:02<3:53:20,  4.62s/it]

Over 9000 iterations we hit 0 API limits


 76%|███████▌  | 9101/12032 [1:40:46<3:42:19,  4.55s/it]

Over 9100 iterations we hit 0 API limits


 76%|███████▌  | 9132/12032 [1:43:35<4:02:35,  5.02s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge and concepts required to address the question. The response highlights that the question involves understanding fiscal policy, the impact of tax cuts on savings, and the concept of the marginal propensity to consume (MPC). It also mentions the need to apply these concepts to a specific scenario, which is a common requirement in undergraduate-level economics courses. The explanation is relevant, accurate, and helpful in understanding the complexity of the question.

Complexity: [[undergraduate]]



 76%|███████▋  | 9201/12032 [1:49:45<3:53:25,  4.95s/it]

Over 9200 iterations we hit 0 API limits


 77%|███████▋  | 9258/12032 [1:55:00<3:21:58,  4.37s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question involves understanding solid-state chemistry, specifically the calculation of theoretical density from unit cell parameters. This topic is typically covered in undergraduate-level chemistry or materials science courses. The response correctly identifies that solving the problem requires knowledge of the formula for theoretical density, unit cell dimensions, the number of atoms in the unit cell, and atomic mass.

Complexity: [[undergraduate]]



 77%|███████▋  | 9274/12032 [1:56:21<3:19:24,  4.34s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's complexity. The response highlights that the question requires an understanding of specific evolutionary processes and their applications to ecological relationships, which is accurate. The question indeed necessitates familiarity with various evolutionary concepts and their definitions, as well as how these concepts manifest in real-world biological interactions. This level of understanding is typically expected at the undergraduate level in biology or related fields.

Complexity: [[undergraduate]]



 77%|███████▋  | 9282/12032 [1:57:13<4:13:58,  5.54s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the complexities involved in the question. The question indeed requires an understanding of criminal law, specifically the concepts of intent, recklessness, and the use of deadly weapons in relation to murder charges. The response highlights the need for critical thinking, legal knowledge, and the ability to interpret and apply legal principles to a given scenario, which are typically skills developed at the undergraduate level in law or related fields.

Complexity: [[undergraduate]]



 77%|███████▋  | 9296/12032 [1:58:39<3:53:53,  5.13s/it]

Could not extract rating from response:
The assistant's response provides a clear and objective evaluation of the question's complexity. The explanation highlights the need for a solid understanding of historical events and military interventions, specifically those involving NATO. This level of knowledge is typically expected at the undergraduate level, where students are likely to have studied these topics in more depth than at the high school level, but the question does not require the advanced analytical or theoretical frameworks typically expected at the postgraduate or PhD levels.

Complexity: [[undergraduate]]



 77%|███████▋  | 9301/12032 [1:59:05<3:40:48,  4.85s/it]

Over 9300 iterations we hit 0 API limits


 77%|███████▋  | 9312/12032 [2:00:07<35:05,  1.29it/s]  


KeyboardInterrupt: 