In [1]:
import os
from mistralai import Mistral

# Comma-separated list of Mistral API keys, read from the environment.
api_keys = os.environ["MISTRAL_API_KEYS"]
model = "mistral-large-2411"

# One client per key. Whitespace around each key is stripped and empty entries
# are dropped, so values such as "key1, key2," do not create clients with
# blank or malformed keys.
clients = [
    Mistral(api_key=api_key)
    for api_key in (raw_key.strip() for raw_key in api_keys.split(","))
    if api_key
]

# Fail immediately instead of letting later code divide by len(clients) == 0.
if not clients:
    raise ValueError("MISTRAL_API_KEYS does not contain any API key")

# Check that the API answers for every configured client by sending one short
# prompt and printing the reported model name with the start of the reply.
for client in clients:
    chat_response = client.chat.complete(
        model=model,
        messages=[{"role": "user", "content": "What is the best French cheese?"}],
        max_tokens=10,
    )
    first_choice = chat_response.choices[0]
    print(chat_response.model, first_choice.message.content)

mistral-large-2411 Choosing the "best" French cheese can be
mistral-large-2411 Choosing the "best" French cheese can be
mistral-large-2411 Determining the "best" French cheese can


In [6]:
from openai import RateLimitError
from mistralai import SDKError
from time import sleep

# Default pause (in seconds) used by wait(); it shrinks as more clients are
# configured: 1.2 for fewer than two clients, 0.5 for two, 0 for three or more.
client_count = len(clients)
if client_count >= 3:
    SLEEP_DURATION = 0
elif client_count == 2:
    SLEEP_DURATION = 0.5
else:
    SLEEP_DURATION = 1.2

print("Sleep duration:", SLEEP_DURATION)


def wait(duration=SLEEP_DURATION):
    """Sleep for ``duration`` seconds.

    The default is the value ``SLEEP_DURATION`` had when this function was
    defined; rebinding ``SLEEP_DURATION`` afterwards does not change it.
    """
    sleep(duration)


# Running total of rate-limit errors seen by all decorated functions.
api_limit_hits = 0


def _register_api_limit_hit():
    """Count one rate-limit error, report every tenth one, then pause for a second."""
    global api_limit_hits

    api_limit_hits += 1
    if (api_limit_hits % 10) == 0:
        print(f"API limit hit {api_limit_hits} times")
    wait(1)


def repeat_if_hit_api_limit(f):
    """Decorate ``f`` so that it is called again until it returns.

    Retry policy of the returned wrapper:

    * ``RateLimitError`` or an ``SDKError`` with status code 429: the hit is
      counted in ``api_limit_hits`` and the call is retried after 1 second.
    * ``SDKError`` with any other status code: re-raised to the caller.
    * Any other ``Exception``: printed, then retried after 60 seconds. There is
      no upper bound on the number of retries.

    The wrapper keeps the name, docstring and other metadata of ``f``.
    """
    # Imported locally so the decorator does not rely on other cells' imports.
    from functools import wraps

    @wraps(f)
    def wrapper(*args, **kw):
        while True:
            try:
                return f(*args, **kw)
            except RateLimitError:
                _register_api_limit_hit()
            except SDKError as e:
                # Only HTTP 429 counts as a rate limit; everything else is the
                # caller's problem and keeps its original traceback.
                if e.status_code != 429:
                    raise
                _register_api_limit_hit()
            except Exception as e:
                print("repeat_if_hit_api_limit -> unknown error", e)
                wait(60)

    return wrapper

Sleep duration: 0


In [7]:
# Number of requests attempted so far; used to pick the next client in turn.
request_id = 0


@repeat_if_hit_api_limit
def query_model(messages):
    """Send ``messages`` to the chat model and return the completion response.

    Clients are used in round-robin order: each call (including retries made
    by the decorator) advances ``request_id`` and therefore moves on to the
    next entry of ``clients``.
    """
    global request_id
    slot = request_id % len(clients)
    request_id += 1
    return clients[slot].chat.complete(model=model, messages=messages)

In [8]:
import os, sys

# Put the parent of the current working directory on the import path so that
# the `utils` package imported below can be resolved.
dir2 = os.path.abspath("")
dir1 = os.path.dirname(dir2)
if dir1 not in sys.path:
    sys.path.append(dir1)

import utils.prompt as prompt

import importlib

# Reload utils.prompt so that the code below uses the module's current source
# rather than a copy cached from an earlier import in this kernel session.
importlib.reload(prompt)

<module 'utils.prompt' from '/Users/aigoncharov/dev/sktech/phi-4/utils/prompt.py'>

In [9]:
import pandas as pd
import ast
import csv
from tqdm import tqdm
import re
import os.path

import utils.prompt as prompt

# Complexity labels accepted from the model.
difficulty = [
    "middle_school",
    "high_school",
    "undergraduate",
    "postgraduate",
    "phd",
]
# Integer ratings accepted from the judge: 1 through 10 inclusive.
ratings = list(range(1, 11))

# Counters of model answers that could not be turned into a valid value.
invalid_complexities = invalid_ratings = 0


def model_as_judge(index, system_prompt, user_prompt, answer):
    """Ask the model to grade ``answer`` and store the grade for row ``index``.

    The judge sees the instructions (``system_prompt``), the question
    (``user_prompt``) and the assistant's ``answer``, and is asked to finish
    with a rating written as ``[[N]]``.

    Side effects:
        * A rating found in ``ratings`` is written to
          ``df.at[index, "masj_rating"]``. ``df`` is the module-level name,
          not a parameter of this function.
        * ``invalid_ratings`` is incremented when no ``[[N]]`` marker is found
          (the response is also printed) or when N is outside ``ratings``.

    Returns:
        None.
    """
    global invalid_ratings

    chat_response = query_model(
        [
            {
                "role": "system",
                "content": 'Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user request displayed below. Your evaluation should consider factors such as the following all the settings in the system prompt, correspondences to the context of the user, the helpfulness, relevance and accuracy. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:"Rating: [[6]]".',
            },
            {
                "role": "user",
                "content": f"""
                [Instructions for Assistant]
                {system_prompt}
                [End of Instructions for Assistant]

                [Question]
                {user_prompt}
                [End of Question]

                [The Start of Assistant’s Answer]
                {answer}
                [The End of Assistant’s Answer]
                """,
            },
        ]
    )
    response = chat_response.choices[0].message.content

    # Detect a missing marker explicitly rather than with a bare `except:`,
    # which would also swallow KeyboardInterrupt and hide unrelated errors.
    match = re.search(r"\[\[(\d+?)\]\]", response) if isinstance(response, str) else None
    if match is None:
        print(f"Could not extract rating from response:\n{response}\n")
        invalid_ratings += 1
        return

    rating_int = int(match.group(1))
    if rating_int in ratings:
        df.at[index, "masj_rating"] = rating_int
    else:
        invalid_ratings += 1


def estimate_complextiy_with_model(index, system_prompt, user_prompt):
    """Ask the model for the complexity of a question and store it for row ``index``.

    The model is expected to finish its answer with a label written as
    ``[[label]]``.

    This function is intentionally not wrapped in a retry decorator:
    ``query_model`` already retries API failures, and retrying the whole
    function would re-issue the request whenever a local (non-API) error
    occurred while parsing or storing the answer.

    Side effects:
        * A label found in ``difficulty`` is written to
          ``df.at[index, "masj_complexity"]``. ``df`` is the module-level
          name, not a parameter of this function.
        * ``invalid_complexities`` is incremented when no ``[[...]]`` marker is
          found (the response is also printed) or when the label is not in
          ``difficulty``.

    Returns:
        The raw text of the model's answer.
    """
    global invalid_complexities

    chat_response = query_model(
        [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": f"""
                [Question Start]
                {user_prompt}
                [Question End]
                """,
            },
        ]
    )
    response = chat_response.choices[0].message.content

    # Detect a missing marker explicitly rather than with a bare `except:`,
    # which would also swallow KeyboardInterrupt and hide unrelated errors.
    match = re.search(r"\[\[(.+?)\]\]", response) if isinstance(response, str) else None
    if match is None:
        print(f"Could not extract complexity from response:\n{response}\n")
        invalid_complexities += 1
        return response

    complexity = match.group(1)
    if complexity in difficulty:
        df.at[index, "masj_complexity"] = complexity
    else:
        invalid_complexities += 1

    return response


# The dataset is written to disk whenever the row index is a multiple of this value.
DUMP_EVERY = 100


def estimate_dataset(df, get_question_from_row, get_options_from_row, out_filename):
    """Annotate every row of ``df`` with a model-estimated complexity and a judge rating.

    Rows that already hold a valid ``masj_complexity`` and ``masj_rating`` are
    skipped, so an interrupted run can be resumed from a previously written file.

    Args:
        df: DataFrame to annotate; the two ``masj_*`` columns are added if missing.
        get_question_from_row: callable returning the question for a row.
        get_options_from_row: callable returning the answer options for a row.
        out_filename: path of the TSV file the annotated data is written to.

    Returns:
        ``df`` itself.

    NOTE(review): ``estimate_complextiy_with_model`` and ``model_as_judge``
    store their results through the module-level name ``df``, so this function
    only behaves as intended when that name refers to the same object as the
    ``df`` argument.
    """

    def dump():
        df.to_csv(out_filename, sep="\t", quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\", index=False)

    if "masj_complexity" not in df.columns:
        df["masj_complexity"] = ""
    if "masj_rating" not in df.columns:
        df["masj_rating"] = 0

    # The system prompt does not depend on the row, so build it once.
    complexity_system_prompt = f'You are an expert in the topic of the question. Please act as an impartial judge and evaluate the complexity of the multiple-choice question with options below. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must not answer the question. You must rate the question complexity by strictly following the scale: {", ".join(difficulty)}. You must return the complexity by strictly following this format: "[[complexity]]", for example: "Complexity: [[middle_school]]".'

    try:
        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            if df.at[index, "masj_complexity"] in difficulty and df.at[index, "masj_rating"] in ratings:
                continue

            complexity_user_prompt = prompt.get_user_prompt(get_question_from_row(row), get_options_from_row(row))

            response_complexity = estimate_complextiy_with_model(
                index, complexity_system_prompt, complexity_user_prompt
            )
            wait()

            model_as_judge(index, complexity_system_prompt, complexity_user_prompt, response_complexity)
            wait()

            if index % DUMP_EVERY == 0:
                dump()
                print(f"Over {index} iterations we hit {api_limit_hits} API limits")
    finally:
        # Write the file on every exit path (normal completion, error, manual
        # interrupt) so that work done since the last periodic dump is kept.
        dump()

    print(
        f"Processed dataset {out_filename}. Total entries: {df.shape[0]}. Invalid complexities: {invalid_complexities}. Invalid ratings: {invalid_ratings}"
    )
    return df


ORIGINAL_DATASET = "../data/mmlu_pro_stem"
original_filename = f"{ORIGINAL_DATASET}.tsv"
out_filename = f"{ORIGINAL_DATASET}_w_maj_complexity.tsv"

# Continue from the annotated file when it exists; otherwise start from the
# original dataset. The annotated file is read with the same quoting settings
# that are used when it is written.
resuming = os.path.isfile(out_filename)
read_options = {"sep": "\t", "header": 0}
if resuming:
    read_options.update(quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\")
df = pd.read_csv(out_filename if resuming else original_filename, **read_options)
# df = df.head(10)


estimate_dataset(
    df=df,
    get_question_from_row=lambda row: row["question"],
    # Options are stored as the text of a Python literal and parsed back here.
    get_options_from_row=lambda row: ast.literal_eval(row["options"]),
    out_filename=out_filename,
)

 58%|█████▊    | 7011/12032 [01:08<03:07, 26.80it/s] 

Could not extract rating from response:
The assistant's response correctly identifies the concepts required to solve the problem, namely thermodynamics and calculus. The question involves a polytropic process described by the relation \( PV^{1.4} = \text{constant} \), which is typically covered in undergraduate-level physics or engineering courses. The need to integrate pressure with respect to volume to calculate work done indicates a level of mathematical complexity suitable for undergraduate students. The explanation is clear, relevant, and accurately reflects the complexity of the question.

Complexity: [[undergraduate]]



 58%|█████▊    | 7018/12032 [01:46<12:41,  6.58it/s]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The problem involves understanding and applying fundamental principles of aerodynamics, specifically the concepts of lift and drag, which are typically covered in undergraduate-level engineering or physics courses. Additionally, the question requires the application of the lift equation and unit conversions, which are skills expected at the undergraduate level.

Complexity: [[undergraduate]]



 59%|█████▊    | 7058/12032 [05:00<6:44:43,  4.88s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's complexity. The response highlights that the question requires an understanding of educational psychology and classroom dynamics, which are typically introduced in high school psychology courses or advanced educational courses. The response also notes that the question asks for a judgment call on the least relevant factor, indicating a level of critical thinking and application of knowledge.

However, the question's specific focus on a first-grade student's behavior and the need to evaluate the relevance of various factors to this behavior suggest that it may require a more advanced understanding of educational psychology and classroom management than what is typically covered in high school. This type of question is more likely to be encountered in undergraduate courses in education or psychology, where students are expected to apply theoretical knowledge to spec

 59%|█████▉    | 7101/12032 [09:13<7:32:43,  5.51s/it] 

Over 7100 iterations we hit 0 API limits


 60%|█████▉    | 7201/12032 [18:35<9:00:36,  6.71s/it] 

Over 7200 iterations we hit 0 API limits


 60%|█████▉    | 7211/12032 [19:25<6:49:22,  5.09s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the steps required to solve the problem, highlighting the mathematical operations and financial concepts involved. The response effectively breaks down the question into its component parts, demonstrating an understanding of the complexity of the task. The explanation is relevant, accurate, and helpful in understanding the problem's requirements.

Rating: 9



 60%|██████    | 7237/12032 [21:57<8:13:55,  6.18s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to answer the question. The response highlights that the question assumes an understanding of assembly language and its relationship to machine code, which is typically covered in undergraduate computer science or related programs. The detailed options require careful consideration, indicating a level of complexity that goes beyond high school but is generally within the scope of undergraduate studies.

Complexity: [[undergraduate]]



 60%|██████    | 7243/12032 [22:28<6:30:34,  4.89s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise evaluation of the question's complexity. The explanation highlights the specific medical knowledge required to answer the question, including understanding the symptoms and diagnostic imaging findings related to Graves' disease. The response also notes the need to differentiate among various potential diagnoses, which adds to the question's complexity.

The assistant correctly identifies that this level of medical knowledge and diagnostic reasoning is typically expected at the undergraduate level in medical education.

Complexity: [[undergraduate]]



 60%|██████    | 7262/12032 [24:14<8:29:25,  6.41s/it]

Could not extract rating from response:
The assistant's response provides a clear and objective evaluation of the question's complexity. The explanation highlights the need for significant medical knowledge to analyze the patient's history, symptoms, and medications, which is accurate given the context of the question. The response also correctly identifies that the question requires an understanding of various medical conditions and their potential symptoms or side effects, which is relevant to the complexity assessment.

However, I am not here to evaluate the assistant's response. I am here to evaluate the complexity of the multiple-choice question.

The question describes a complex medical scenario involving a patient with a history of rheumatoid arthritis experiencing new symptoms such as drop attacks, fatigue, and weight loss. The question requires a deep understanding of various medical conditions, their symptoms, and potential side effects of medications. The options provided co

 61%|██████    | 7301/12032 [28:13<7:50:32,  5.97s/it] 

Over 7300 iterations we hit 0 API limits


 61%|██████    | 7320/12032 [29:45<5:44:05,  4.38s/it]

Could not extract rating from response:
The assistant's response effectively evaluates the complexity of the question by considering the context and the level of knowledge required to answer it. The question deals with anthropological or archaeological concepts, specifically related to forager societies, which are typically covered in undergraduate studies. The response is helpful, relevant, and accurate in assessing the complexity of the question.

**Complexity of the question:** [[undergraduate]]



 61%|██████    | 7349/12032 [32:17<6:26:29,  4.95s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge and skills required to solve the problem. The question involves understanding basic principles of bankruptcy and proportionate distribution, as well as performing simple mathematical calculations. This level of understanding is typically expected at the high school level, where students are introduced to real-world applications of mathematics and basic financial concepts.

Complexity: [[high_school]]



 61%|██████▏   | 7394/12032 [36:45<5:59:48,  4.65s/it] 

Could not extract rating from response:
The assistant's response correctly identifies the key elements required to understand and answer the question. The question involves basic financial literacy, specifically understanding life insurance policies, premium payments, and death benefits. These are topics typically introduced in high school personal finance or mathematics courses. The question does not require advanced mathematical calculations or specialized knowledge beyond what is generally taught at the high school level.

Complexity: [[high_school]]



 62%|██████▏   | 7401/12032 [37:26<7:54:55,  6.15s/it]

Over 7400 iterations we hit 0 API limits


 62%|██████▏   | 7408/12032 [38:09<7:54:24,  6.16s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the medical knowledge required to assess the risks associated with combining nicotine patches and bupropion therapy for smoking cessation. The response highlights the need to understand the pharmacological effects of these treatments, their potential side effects, and the relevance of the patient's medical history. This level of detail and medical knowledge is typically covered in undergraduate medical or health sciences programs.

Rating: [[undergraduate]]



 62%|██████▏   | 7424/12032 [39:44<7:11:21,  5.62s/it]

Could not extract rating from response:
The response provided by the AI assistant is generally well-reasoned and relevant to the context of the user's request. The assistant correctly identifies that the question presents a complex medical scenario requiring an understanding of multiple medical specialties. The explanation highlights the need for clinical knowledge to determine the most appropriate next step in management, which is accurate and helpful.

However, the complexity rating of "[[undergraduate]]" might be a bit too conservative. Given the specific details of the patient's symptoms, vital signs, and the need to apply advanced clinical knowledge to determine the next step in management, this question might be more appropriately rated at a higher level of complexity. The scenario involves critical thinking and decision-making skills that are typically honed in more advanced stages of medical education.

Rating: [[postgraduate]]



 62%|██████▏   | 7444/12032 [41:40<9:09:10,  7.18s/it]

Could not extract complexity from response:
This question involves understanding the principles of contract law, specifically the concepts of contract integration, merger clauses, and the parol evidence rule. To answer this question correctly, one must understand how these principles interact when determining whether oral agreements can be admitted as evidence in court. This level of legal analysis is typically taught in undergraduate law courses or business law courses, which are often part of the undergraduate curriculum.



 62%|██████▏   | 7501/12032 [46:55<7:32:56,  6.00s/it]

Over 7500 iterations we hit 0 API limits


 63%|██████▎   | 7523/12032 [48:55<6:45:35,  5.40s/it]

Could not extract rating from response:
The assistant's response provides a clear and objective evaluation of the question's complexity. The explanation highlights the specific topics and skills required to answer the question, such as understanding overhead distribution, ratios, and proportions, which are typically covered in accounting and business courses. The response also notes that the question involves mathematical calculations and an understanding of how overhead costs are allocated, which is more advanced than middle school or high school level curriculum. This explanation is helpful, relevant, and accurate, and it effectively corresponds to the context of the user's request.

Rating: 10

Complexity: [[undergraduate]]



 63%|██████▎   | 7601/12032 [56:00<7:02:03,  5.72s/it]

Over 7600 iterations we hit 0 API limits


 64%|██████▍   | 7701/12032 [1:05:27<5:55:23,  4.92s/it] 

Over 7700 iterations we hit 0 API limits


 65%|██████▍   | 7801/12032 [1:14:23<5:36:39,  4.77s/it]

Over 7800 iterations we hit 0 API limits


 65%|██████▍   | 7820/12032 [1:15:57<5:42:14,  4.88s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's requirements and the underlying theory behind the code-based phonic approach to reading instruction. The response accurately highlights the need to analyze each sentence option based on the given phonic sounds, which requires a good understanding of phonics and early reading instruction methods. This level of analysis and application of theoretical knowledge is typically expected at the undergraduate level.

Complexity: [[undergraduate]]



 66%|██████▌   | 7901/12032 [1:23:45<6:29:52,  5.66s/it] 

Over 7900 iterations we hit 0 API limits


 66%|██████▌   | 7951/12032 [1:28:21<5:22:05,  4.74s/it]

Could not extract rating from response:
The explanation provided by the assistant is clear and accurate. It correctly identifies the key concepts involved in solving the problem, such as economic equivalent uniform annual cost comparisons, time value of money, and depreciation. The assistant also accurately describes the steps needed to solve the problem, including calculating present values, converting them to equivalent uniform annual costs, and comparing them. This explanation is helpful and relevant to the context of the user's request.

The assistant correctly rates the complexity of the question as [[undergraduate]].



 66%|██████▋   | 7978/12032 [1:30:32<5:40:21,  5.04s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the cognitive demands and concepts required to address the question. The question indeed involves understanding psychological concepts related to interests, career success, and persistence, which are typically covered in undergraduate-level psychology or education courses. The response objectively evaluates the complexity of the question without providing an answer, adhering to the instructions given.

Rating the complexity of the question on the given scale:

Complexity: [[undergraduate]]



 66%|██████▋   | 8001/12032 [1:32:46<7:07:37,  6.37s/it]

Over 8000 iterations we hit 0 API limits


 67%|██████▋   | 8003/12032 [1:33:13<46:55,  1.43it/s]  


KeyboardInterrupt: 