In [4]:
import os
from mistralai import Mistral

# The key is read from the environment so it never has to appear in the notebook.
api_key = os.environ["MISTRAL_API_KEY"]
model = "mistral-large-2411"

client = Mistral(api_key=api_key)

# Smoke test: send one tiny prompt and print which model answered and what it said.
ping_messages = [{"role": "user", "content": "What is the best French cheese?"}]
chat_response = client.chat.complete(model=model, messages=ping_messages, max_tokens=10)
print(chat_response.model, chat_response.choices[0].message.content)

mistral-large-2411 Determining the "best" French cheese can


In [5]:
from openai import RateLimitError
from mistralai import SDKError
from time import sleep


def wait(seconds=1.5):
    """Block the current thread for a short pause between API calls.

    Args:
        seconds: How long to sleep, in seconds. Defaults to 1.5, which is the
            pause every existing zero-argument call keeps getting.

    Returns:
        None.
    """
    sleep(seconds)


def repeat_if_hit_api_limit(f):
    """Decorator that re-invokes ``f`` for as long as the API reports rate limiting.

    A call is retried (after ``wait()``) when it raises ``RateLimitError`` or an
    ``SDKError`` whose ``status_code`` is 429. Any other ``SDKError`` is re-raised
    unchanged, and every other exception type propagates untouched.

    Note: there is no retry cap, so the wrapper loops until the call succeeds or
    fails with a non-rate-limit error.

    Args:
        f: The callable to protect.

    Returns:
        A wrapper with the same call signature, name and docstring as ``f`` that
        returns whatever ``f`` returns.
    """
    from functools import wraps  # local import: keeps the fix inside this block

    @wraps(f)  # preserve f.__name__ / f.__doc__ on the decorated function
    def wrapper(*args, **kw):
        while True:
            try:
                return f(*args, **kw)
            except RateLimitError:
                wait()
            except SDKError as e:
                if e.status_code != 429:
                    # Not a rate-limit problem: re-raise as-is.
                    raise
                wait()

    return wrapper

In [None]:
import os, sys

# Make the parent of the current working directory importable
# (presumably so the `utils` package imported below can be found).
dir2 = os.path.abspath("")
dir1 = os.path.dirname(dir2)
if dir1 not in sys.path:
    sys.path.append(dir1)

import pandas as pd
import ast
import csv
from tqdm import tqdm
import re

import utils.prompt as prompt

import importlib

# Required to purge the module cache and use the latest version after an update
importlib.reload(prompt)

# Complexity labels that are accepted when parsing the model's reply.
difficulty = [
    "middle_school",
    "high_school",
    "undergraduate",
    "postgraduate",
    "phd",
]
# Accepted judge scores: the integers 1 through 10.
ratings = list(range(1, 11))

# Running tallies of replies that did not yield a valid label / score.
invalid_complexities = 0
invalid_ratings = 0


@repeat_if_hit_api_limit
def model_as_judge(client, model, index, system_prompt, user_prompt, answer):
    """Ask the model to grade an assistant answer and record the 1-10 score.

    The judge is shown the assistant's instructions, the question and the
    answer, and is told to finish with a marker such as ``[[6]]``. The first
    ``[[<digits>]]`` marker in the reply is parsed as the score.

    Side effects:
        * On a valid score, writes it to ``df.at[index, "masj_rating"]``.
          ``df`` here is the module-level DataFrame, not a parameter.
        * Otherwise increments the module-level ``invalid_ratings`` counter.
          A reply with no marker is additionally printed for inspection.

    Args:
        client: Object exposing ``chat.complete(model=..., messages=...)``.
        model: Model identifier forwarded to ``chat.complete``.
        index: Row label in ``df`` that receives the score.
        system_prompt: Instructions that were given to the assistant being judged.
        user_prompt: The question that was put to the assistant.
        answer: The assistant's reply that is being graded.

    Returns:
        None.
    """
    global invalid_ratings

    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "system",
                "content": 'Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user request displayed below. Your evaluation should consider factors such as the following all the settings in the system prompt, correspondences to the context of the user, the helpfulness, relevance and accuracy. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:"Rating: [[6]]".',
            },
            {
                "role": "user",
                "content": f"""
                [Instructions for Assistant]
                {system_prompt}
                [End of Instructions for Assistant]

                [Question]
                {user_prompt}
                [End of Question]

                [The Start of Assistant’s Answer]
                {answer}
                [The End of Assistant’s Answer]
                """,
            },
        ],
    )
    response = chat_response.choices[0].message.content

    # Only a missing or marker-less reply counts as an extraction failure.
    # Unrelated errors (e.g. from the DataFrame write, or Ctrl-C) are no longer
    # swallowed by a bare ``except`` and misreported as a parsing problem.
    match = re.search(r"\[\[(\d+?)\]\]", response) if isinstance(response, str) else None
    if match is None:
        print(f"Could not extract rating from response:\n{response}\n")
        invalid_ratings += 1
        return

    rating_int = int(match.group(1))
    if rating_int in ratings:
        df.at[index, "masj_rating"] = rating_int
    else:
        # A number was found but it is outside the accepted 1-10 scale.
        invalid_ratings += 1


@repeat_if_hit_api_limit
def estimate_complextiy_with_model(client, model, index, system_prompt, user_prompt):
    """Ask the model for a question's complexity label and record it.

    The first ``[[...]]`` marker in the reply is taken as the label and is
    accepted only if it is one of the entries in the module-level ``difficulty``
    list.

    Side effects:
        * On an accepted label, writes it to ``df.at[index, "masj_complexity"]``.
          ``df`` here is the module-level DataFrame, not a parameter.
        * Otherwise increments the module-level ``invalid_complexities``
          counter. A reply with no marker is additionally printed.

    Args:
        client: Object exposing ``chat.complete(model=..., messages=...)``.
        model: Model identifier forwarded to ``chat.complete``.
        index: Row label in ``df`` that receives the label.
        system_prompt: System message sent to the model.
        user_prompt: Question text, wrapped in question start/end markers
            before being sent.

    Returns:
        The raw reply content from the model, whether or not a label was
        extracted from it.
    """
    global invalid_complexities

    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": f"""
                [Question Start]
                {user_prompt}
                [Question End]
                """,
            },
        ],
    )
    response = chat_response.choices[0].message.content

    # Only a missing or marker-less reply counts as an extraction failure.
    # Unrelated errors (e.g. from the DataFrame write, or Ctrl-C) are no longer
    # swallowed by a bare ``except`` and misreported as a parsing problem.
    match = re.search(r"\[\[(.+?)\]\]", response) if isinstance(response, str) else None
    if match is None:
        print(f"Could not extract complexity from response:\n{response}\n")
        invalid_complexities += 1
    elif match.group(1) in difficulty:
        df.at[index, "masj_complexity"] = match.group(1)
    else:
        # A marker was found but its content is not a known label.
        invalid_complexities += 1

    return response


# Number of newly processed rows between intermediate saves of the dataset.
DUMP_EVERY = 100


def estimate_dataset(df, client, get_question_from_row, get_options_from_row, out_filename):
    """Label every row of ``df`` with a model-estimated complexity and a judge score.

    Rows that already hold a valid ``masj_complexity`` and ``masj_rating`` are
    skipped, so an interrupted run can be resumed from a saved file. For each
    remaining row the model is asked for a complexity label, then asked to
    grade that reply, with a ``wait()`` pause after each call.

    NOTE(review): the two helper functions write into the module-level ``df``
    and this function reads the module-level ``model``; the ``df`` argument is
    therefore expected to be that same module-level object.

    Args:
        df: DataFrame to annotate. The ``masj_complexity`` and ``masj_rating``
            columns are created when absent.
        client: Chat client forwarded to the helper functions.
        get_question_from_row: Callable mapping a row to its question.
        get_options_from_row: Callable mapping a row to its answer options.
        out_filename: Path of the tab-separated file the frame is saved to.

    Returns:
        The same ``df`` object that was passed in.
    """
    if "masj_complexity" not in df.columns:
        df["masj_complexity"] = ""
    if "masj_rating" not in df.columns:
        df["masj_rating"] = 0

    def save():
        # Single definition of the on-disk format for checkpoints and the final dump.
        df.to_csv(out_filename, sep="\t", quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\", index=False)

    # The system prompt does not depend on the row, so build it once.
    complexity_system_prompt = f'You are an expert in the topic of the question. Please act as an impartial judge and evaluate the complexity of the multiple-choice question with options below. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must not answer the question. You must rate the question complexity by strictly following the scale: {", ".join(difficulty)}. You must return the complexity by strictly following this format: "[[complexity]]", for example: "Complexity: [[middle_school]]".'

    processed = 0
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        if df.at[index, "masj_complexity"] in difficulty and df.at[index, "masj_rating"] in ratings:
            continue

        complexity_user_prompt = prompt.get_user_prompt(get_question_from_row(row), get_options_from_row(row))

        response_complexity = estimate_complextiy_with_model(
            client, model, index, complexity_system_prompt, complexity_user_prompt
        )
        wait()

        model_as_judge(client, model, index, complexity_system_prompt, complexity_user_prompt, response_complexity)
        wait()

        # Checkpoint on the count of processed rows rather than the index label:
        # labels may be non-integer or non-contiguous, which made the previous
        # `index % DUMP_EVERY` check fail or never fire.
        processed += 1
        if processed % DUMP_EVERY == 0:
            save()

    save()
    print(
        f"Processed dataset {out_filename}. Total entries: {df.shape[0]}. Invalid complexities: {invalid_complexities}. Invalid ratings: {invalid_ratings}"
    )
    return df


DATASET = "../data/mmlu_pro_stem"
out_filename = f"{DATASET}_w_maj_complexity.tsv"

# The file that results are saved to is also the one loaded here, so rows that
# were already annotated in an earlier run are picked up again.
df = pd.read_csv(
    out_filename,
    sep="\t",
    header=0,
    quoting=csv.QUOTE_NONE,
    quotechar="",
    escapechar="\\",
)
# For a quick trial run, uncomment the next line:
# df = df.head(10)


def _question_of(row):
    """Return the question text stored in a dataset row."""
    return row["question"]


def _options_of(row):
    """Parse the row's ``options`` cell from its Python-literal text form."""
    return ast.literal_eval(row["options"])


estimate_dataset(
    df=df,
    client=client,
    get_question_from_row=_question_of,
    get_options_from_row=_options_of,
    out_filename=out_filename,
)

  8%|▊         | 971/12032 [11:52<26:42:57,  8.70s/it]

Could not extract rating from response:
The assistant's response demonstrates a clear understanding of the economic concepts involved in the question. The explanation highlights that the question requires not just knowledge of terms like monopolistic competition and pure competition, but also the ability to apply these concepts to a hypothetical scenario and consider the market dynamics. This level of analysis is typically expected at the undergraduate level in economics.

Complexity: [[undergraduate]]



  8%|▊         | 972/12032 [11:59<25:41:36,  8.36s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the context and requirements of the question. It accurately identifies that the question relates to economics and business studies, specifically the concept of production efficiency. The response highlights that answering the question correctly requires an understanding of different efficiency concepts, which is typically covered at the undergraduate level in economics or business courses. This evaluation is objective and relevant to the context of the question.

Complexity: [[undergraduate]]



  9%|▊         | 1036/12032 [21:05<27:40:18,  9.06s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to answer the question. The question indeed involves understanding monetary policy, banking operations, and the role of central banks, which are typically covered in undergraduate economics or finance courses. The response does not answer the question but instead evaluates the complexity, which is appropriate given the instructions.

Rating the complexity of the question: [[undergraduate]].



  9%|▉         | 1094/12032 [29:23<22:57:19,  7.56s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's complexity. The response accurately identifies that the question tests knowledge of a fundamental concept in statistics, which is typically introduced at the high school level or in introductory undergraduate statistics courses. The explanation is relevant, accurate, and helpful in understanding the context and complexity of the question.

The assistant correctly rates the complexity of the question as [[high_school]].



  9%|▉         | 1117/12032 [32:58<28:24:30,  9.37s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the principles required to understand and answer the question. The question indeed involves understanding Newton's Second Law of Motion and the concept of slope in a graph, which are typically covered in high school physics. The response also correctly identifies that the student must differentiate between inertial mass and gravitational mass, adding a layer of complexity that is still within the high school curriculum.

Complexity: [[high_school]]



 10%|▉         | 1164/12032 [39:41<25:37:28,  8.49s/it]

Could not extract rating from response:
The assistant's response correctly identifies the core concept needed to solve the problem, which is the physics formula for work done, W = m * g * h. The response also accurately explains the variables involved and the steps required to solve the problem. This question indeed aligns with high school physics curriculum, where students are introduced to such mechanical concepts and calculations.

Complexity: [[high_school]]



 10%|█         | 1231/12032 [49:50<25:40:36,  8.56s/it]

Could not extract complexity from response:
To evaluate the complexity of this question, we need to consider the principles involved and the level of understanding required to solve it. The question deals with the concept of an object falling under gravity, which is typically covered in introductory physics courses. The velocity of an object falling from rest can be determined using the equation derived from kinematics: v² = 2gh.

To solve this question, one needs to:
1. Understand the concept of free fall and the acceleration due to gravity.
2. Apply the kinematic equation to find the final velocity.
3. Perform basic mathematical calculations.

This level of physics and mathematics is typically introduced in high school physics courses. Therefore, the complexity of the question is:

Complexity: "high_school"



 11%|█         | 1312/12032 [1:01:34<25:50:17,  8.68s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question involves understanding basic Python syntax and the built-in `sum()` function, which is typically covered in introductory programming courses. The task of summing the elements of a list is fundamental and does not require advanced knowledge.

Complexity: [[middle_school]]



 11%|█         | 1334/12032 [1:04:38<22:42:15,  7.64s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question requires knowledge of reliability engineering concepts, specifically the formula for availability, which involves understanding mean time between failures (MTBF) and mean time to repair (MTTR). This topic is typically covered in undergraduate-level engineering or computer science courses. The calculations involved are straightforward but require a foundational understanding of these concepts.

Complexity: [[undergraduate]]



 11%|█         | 1336/12032 [1:04:55<24:54:52,  8.39s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's requirements and the knowledge needed to answer it. The question involves translating a simple English sentence into predicate logic, which requires understanding of variables, predicates, and the conventions of predicate logic. The response accurately identifies that this task is typically covered in undergraduate-level courses, making the complexity rating appropriate.

Rating: [[undergraduate]]



 12%|█▏        | 1437/12032 [1:19:15<23:07:35,  7.86s/it]

Could not extract rating from response:
The assistant's response provides a clear and objective evaluation of the complexity of the multiple-choice question. The explanation highlights the analytical thinking and understanding of logical concepts required to answer the question, which is typically covered in an introductory logic course. The response is helpful, relevant, and accurate in the context of evaluating the question's complexity. The assistant correctly identifies that the question goes beyond basic knowledge recall and involves applying knowledge to a specific scenario, which is characteristic of undergraduate-level complexity.

Complexity: [[undergraduate]]



 12%|█▏        | 1498/12032 [1:28:18<26:49:26,  9.17s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The problem involves understanding transmission line theory and applying the telegrapher's equations, which are typically covered in undergraduate-level electrical engineering courses. The response is clear, concise, and directly addresses the complexity of the question without providing the answer, as instructed.

Complexity: [[undergraduate]]



 13%|█▎        | 1541/12032 [1:34:51<24:55:07,  8.55s/it]

Could not extract rating from response:
The assistant's response correctly identifies the key concepts involved in the question, namely AC motors, pole numbers, slip, and the relationship between these factors. The response also acknowledges the need to apply specific formulas and understand the behavior of motors in different configurations. This evaluation is objective and relevant to the context of the user's request.

The assistant accurately assesses the complexity of the question as [[undergraduate]].



 13%|█▎        | 1569/12032 [1:38:49<23:33:35,  8.11s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the concepts involved in the question, specifically mentioning the Joule-Thomson coefficient and the thermodynamic principles related to throttling processes. The response correctly identifies that understanding these principles is crucial for solving the problem. However, the complexity rating of "high_school" seems to be an underestimation. The calculation of the Joule-Thomson coefficient and the understanding of throttling processes typically fall within the scope of undergraduate-level thermodynamics courses.

Rating: Complexity: [[undergraduate]]



 13%|█▎        | 1615/12032 [1:45:53<22:32:39,  7.79s/it]

Could not extract rating from response:
The assistant's response accurately assesses the complexity of the question. The question involves basic financial concepts (stock prices and discounts), simple arithmetic operations (calculating an average and applying a percentage discount), and converting fractions to decimals. These are all skills typically taught and reinforced in high school mathematics and finance courses. The response is clear, concise, and relevant to the task at hand.

Complexity: [[high_school]]



 13%|█▎        | 1619/12032 [1:46:26<23:51:21,  8.25s/it]

Could not extract rating from response:
This evaluation is not of the AI assistant response to the question, but of the complexity of the multiple-choice question. The assistant correctly identified that the question requires an understanding of basic algebra and business cost structures, specifically fixed and variable costs. The question does not require advanced mathematical concepts or deep economic theory, making it suitable for a high school level of complexity.

Complexity: [[high_school]]



 13%|█▎        | 1621/12032 [1:46:42<23:35:33,  8.16s/it]

Could not extract rating from response:
The explanation provided by the assistant is clear and concise, accurately describing the mathematical concepts required to solve the problem. The question involves understanding percentage increases and performing basic arithmetic, which are typically covered in middle school mathematics curricula. The assistant correctly identifies that the student needs to calculate a 1% increase on the current tax amount, which involves multiplying by 1.01. This evaluation is relevant, accurate, and helpful in understanding the complexity of the question.

Complexity: [[middle_school]]



 14%|█▍        | 1679/12032 [1:54:40<20:36:56,  7.17s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to answer the question. The question indeed requires an understanding of tax structures and the concept of regressive taxes, which are typically introduced in high school economics or social studies curricula. The question does not require advanced mathematical calculations or deep theoretical knowledge, making it accessible to a high school student with a basic understanding of economics.

Complexity: [[high_school]]



 14%|█▍        | 1702/12032 [1:57:52<22:12:22,  7.74s/it]