In [25]:
import os
from mistralai import Mistral

# The key is read from the environment so it never appears in the notebook;
# a missing variable raises KeyError right here.
api_key = os.environ["MISTRAL_API_KEY"]
model = "mistral-large-2411"

client = Mistral(api_key=api_key)

# Smoke test: request a very short completion to confirm that the key and the
# model name are accepted before starting any long-running work.
chat_response = client.chat.complete(
    model=model,
    max_tokens=10,
    messages=[{"role": "user", "content": "What is the best French cheese?"}],
)
print(chat_response.model, chat_response.choices[0].message.content)

mistral-large-2411 Determining the "best" French cheese can


In [26]:
from openai import RateLimitError
from mistralai import SDKError
from time import sleep


def wait(duration=1.5):
    """Pause execution for ``duration`` seconds (1.5 when omitted)."""
    sleep(duration)


def repeat_if_hit_api_limit(f):  # (1)
    """Decorator: keep calling ``f`` until it returns, pausing between attempts.

    Retry policy, as implemented below:

    * ``RateLimitError`` (openai) -> pause with ``wait()`` and retry.
    * ``SDKError`` (mistralai) with ``status_code == 429`` -> pause with
      ``wait()`` and retry; any other ``SDKError`` is re-raised to the caller.
    * Any other ``Exception`` -> the error is printed, then the call is retried
      after ``wait(60)``.

    NOTE: there is no upper bound on the number of attempts, so an error that
    never goes away keeps the loop running until the kernel is interrupted.

    Args:
        f: The callable to protect.

    Returns:
        A wrapper with the same call signature that returns whatever ``f``
        returns on its first successful attempt.
    """
    import functools  # local import so this block does not depend on other cells

    @functools.wraps(f)  # preserve f's __name__ / __doc__ on the wrapper
    def wrapper(*args, **kw):  # (2)
        while True:
            try:
                return f(*args, **kw)
            except RateLimitError:
                wait()
            except SDKError as e:
                if e.status_code == 429:
                    wait()
                else:
                    # Bare `raise` re-raises the active exception unchanged.
                    raise
            except Exception as e:
                print("repeat_if_hit_api_limit -> unknown error", e)
                wait(60)

    return wrapper

In [None]:
import os, sys

# Put the parent of the current working directory on the import path
# (presumably so the `utils` package imported below can be resolved).
dir2 = os.path.abspath("")
dir1 = os.path.dirname(dir2)
if dir1 not in sys.path:
    sys.path.append(dir1)

import pandas as pd
import ast
import csv
from tqdm import tqdm
import re
import os.path

import utils.prompt as prompt

import importlib

# Required to purge the module cache and use the latest version after an update
importlib.reload(prompt)

# Complexity labels that are accepted when parsing the model's answer.
difficulty = [
    "middle_school",
    "high_school",
    "undergraduate",
    "postgraduate",
    "phd",
]
# Ratings that are accepted when parsing the judge's answer: integers 1..10.
ratings = list(range(1, 11))

# Running counts of responses that could not be parsed or were out of range.
invalid_complexities = 0
invalid_ratings = 0


@repeat_if_hit_api_limit
def model_as_judge(client, model, index, system_prompt, user_prompt, answer):
    """Ask the model to grade ``answer`` and record the rating for row ``index``.

    The judge is shown the assistant's instructions, the question and the
    assistant's answer, and is asked to finish with ``[[N]]``. When ``N`` parses
    as an integer contained in the module-level ``ratings`` list it is written
    to ``df.at[index, "masj_rating"]`` (``df`` is the module-level DataFrame).
    Otherwise the module-level ``invalid_ratings`` counter is incremented.

    Args:
        client: Object exposing ``chat.complete(model=..., messages=...)``.
        model: Model name passed through to ``client.chat.complete``.
        index: Row label in ``df`` that receives the rating.
        system_prompt: Instructions that were given to the assistant being judged.
        user_prompt: The question that was put to the assistant.
        answer: The assistant's response to evaluate.

    Returns:
        None. Results are reported through ``df`` and ``invalid_ratings``.
    """
    global invalid_ratings

    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "system",
                "content": 'Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user request displayed below. Your evaluation should consider factors such as the following all the settings in the system prompt, correspondences to the context of the user, the helpfulness, relevance and accuracy. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:"Rating: [[6]]".',
            },
            {
                "role": "user",
                "content": f"""
                [Instructions for Assistant]
                {system_prompt}
                [End of Instructions for Assistant]

                [Question]
                {user_prompt}
                [End of Question]

                [The Start of Assistant’s Answer]
                {answer}
                [The End of Assistant’s Answer]
                """,
            },
        ],
    )
    response = chat_response.choices[0].message.content
    # print(response)

    try:
        # `re.search` returns None when no "[[digits]]" marker is present; the
        # resulting AttributeError is handled below as an unparseable response.
        rating = re.search("\\[\\[(\\d+?)\\]\\]", response).group(1)
        # print(rating)
        rating_int = int(rating)
        if rating_int in ratings:
            df.at[index, "masj_rating"] = rating_int
        else:
            invalid_ratings += 1
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
        # SystemExit are not swallowed and a long run can still be interrupted.
        print(f"Could not extract rating from response:\n{response}\n")
        invalid_ratings += 1


@repeat_if_hit_api_limit
def estimate_complextiy_with_model(client, model, index, system_prompt, user_prompt):
    """Ask the model for a complexity label and record it for row ``index``.

    The reply is searched for a ``[[label]]`` marker. When the label is one of
    the entries of the module-level ``difficulty`` list it is written to
    ``df.at[index, "masj_complexity"]`` (``df`` is the module-level DataFrame).
    Otherwise the module-level ``invalid_complexities`` counter is incremented.

    Args:
        client: Object exposing ``chat.complete(model=..., messages=...)``.
        model: Model name passed through to ``client.chat.complete``.
        index: Row label in ``df`` that receives the complexity label.
        system_prompt: System message sent to the model.
        user_prompt: Question text, wrapped in ``[Question Start]`` /
            ``[Question End]`` markers before being sent.

    Returns:
        The raw text of the model's reply, whether or not a label was extracted.
    """
    global invalid_complexities

    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": f"""
                [Question Start]
                {user_prompt}
                [Question End]
                """,
            },
        ],
    )
    response = chat_response.choices[0].message.content
    # print(response)

    try:
        # `re.search` returns None when no "[[...]]" marker is present; the
        # resulting AttributeError is handled below as an unparseable response.
        complexity = re.search("\\[\\[(.+?)\\]\\]", response).group(1)
        # print(complexity)

        if complexity in difficulty:
            df.at[index, "masj_complexity"] = complexity
        else:
            invalid_complexities += 1
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
        # SystemExit are not swallowed and a long run can still be interrupted.
        print(f"Could not extract complexity from response:\n{response}\n")
        invalid_complexities += 1

    return response


# Checkpoint interval, in rows, used by estimate_dataset when it periodically
# saves intermediate results to the output file.
DUMP_EVERY = 100


def estimate_dataset(df, client, get_question_from_row, get_options_from_row, out_filename):
    """Label every row of ``df`` with a complexity and a judge rating, saving as it goes.

    For each row that does not already hold a valid ``masj_complexity`` and
    ``masj_rating``, the model is asked for a complexity label and is then
    asked to judge that reply. ``wait()`` is called after each of the two
    requests. The frame is written to ``out_filename`` periodically and once
    more at the end, so an interrupted run can be resumed from that file.

    NOTE: the helper functions write their results into the module-level ``df``
    and use the module-level ``model``; the ``df`` passed here must therefore be
    that same object for the results to appear in the saved file.

    Args:
        df: DataFrame to annotate. The ``masj_complexity`` and ``masj_rating``
            columns are created when missing.
        client: Chat client forwarded to the helper functions.
        get_question_from_row: Callable mapping a row to its question.
        get_options_from_row: Callable mapping a row to its answer options.
        out_filename: Path of the tab-separated output file.

    Returns:
        The same ``df`` object.
    """

    def save_checkpoint():
        # Single place for the TSV settings used by both the periodic and the final save.
        df.to_csv(out_filename, sep="\t", quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\", index=False)

    if "masj_complexity" not in df.columns:
        df["masj_complexity"] = ""
    if "masj_rating" not in df.columns:
        df["masj_rating"] = 0

    # The system prompt does not depend on the row, so build it once.
    complexity_system_prompt = f'You are an expert in the topic of the question. Please act as an impartial judge and evaluate the complexity of the multiple-choice question with options below. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must not answer the question. You must rate the question complexity by strictly following the scale: {", ".join(difficulty)}. You must return the complexity by strictly following this format: "[[complexity]]", for example: "Complexity: [[middle_school]]".'

    # `position` counts rows from 0 regardless of the index labels, so the
    # checkpoint cadence also works for non-integer or filtered indexes. For a
    # default RangeIndex it equals the index label.
    for position, (index, row) in enumerate(tqdm(df.iterrows(), total=df.shape[0])):
        # Rows that already carry valid results (e.g. from a resumed run) are skipped.
        if df.at[index, "masj_complexity"] in difficulty and df.at[index, "masj_rating"] in ratings:
            continue

        complexity_user_prompt = prompt.get_user_prompt(get_question_from_row(row), get_options_from_row(row))

        response_complexity = estimate_complextiy_with_model(
            client, model, index, complexity_system_prompt, complexity_user_prompt
        )
        wait()

        model_as_judge(client, model, index, complexity_system_prompt, complexity_user_prompt, response_complexity)
        wait()

        if position % DUMP_EVERY == 0:
            save_checkpoint()

    save_checkpoint()
    print(
        f"Processed dataset {out_filename}. Total entries: {df.shape[0]}. Invalid complexities: {invalid_complexities}. Invalid ratings: {invalid_ratings}"
    )
    return df


ORIGINAL_DATASET = "../data/mmlu_pro_stem"
original_filename = ORIGINAL_DATASET + ".tsv"
out_filename = ORIGINAL_DATASET + "_w_maj_complexity.tsv"

# Load the output of an earlier run when that file already exists; otherwise
# start from the original dataset.
df = pd.read_csv(
    original_filename if not os.path.isfile(out_filename) else out_filename,
    header=0,
    sep="\t",
    escapechar="\\",
    quotechar="",
    quoting=csv.QUOTE_NONE,
)
# Uncomment to try the pipeline on a small slice first:
# df = df.head(10)


estimate_dataset(
    df,
    client,
    get_question_from_row=lambda r: r["question"],
    get_options_from_row=lambda r: ast.literal_eval(r["options"]),
    out_filename=out_filename,
)

 24%|██▍       | 2904/12032 [04:06<1:19:13,  1.92it/s]

Could not extract complexity from response:
This question is assessing knowledge of chemical bonding and the octet rule, which states that atoms tend to form bonds so that they each have eight electrons in their valence shells. To answer this question, one needs to understand how to draw Lewis structures and count electrons around the central atom. This topic is typically introduced in high school chemistry courses, but may also be reviewed in undergraduate general chemistry. The addition of the non-standard compounds requires a deeper understanding of octet rule limitations.



 25%|██▍       | 2950/12032 [10:31<21:44:21,  8.62s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge and skills required to solve the given problem. The response highlights the need to understand and apply the divergence theorem, compute the divergence of a vector field, set up and evaluate a volume integral, and understand three-dimensional geometry. These are all topics typically covered in undergraduate-level mathematics courses, specifically in multivariable calculus or advanced calculus. The response is helpful, relevant, and accurate in its assessment of the question's complexity.

Rating: 10



 25%|██▍       | 3000/12032 [18:01<20:46:42,  8.28s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the question's requirements and the mathematical concept being tested. The question involves understanding rounding rules, specifically rounding to the nearest hundred, which is typically covered in middle school mathematics curricula. The response accurately identifies that the question requires evaluating pairs of numbers to determine which ones round to 1,500 when rounded to the nearest hundred. This evaluation is straightforward and does not require advanced mathematical knowledge beyond basic rounding principles.

Complexity: [[middle_school]]



 25%|██▍       | 3001/12032 [18:08<20:06:41,  8.02s/it]

Could not extract rating from response:
The assistant's response effectively evaluates the complexity of the question by considering the mathematical concepts involved. The question requires understanding how to handle subtraction with negative numbers, specifically the concept that subtracting a negative is equivalent to adding a positive. This concept is usually introduced and reinforced in middle school mathematics.

The explanation is clear, concise, and directly relevant to the question's content. It accurately reflects the educational level at which this topic is typically taught.

Complexity: [[middle_school]]



 26%|██▌       | 3080/12032 [29:52<20:51:51,  8.39s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge required to address the question. The response highlights that understanding the question involves knowledge of fiscal policy, aggregate demand, and macroeconomic principles, which are typically covered in undergraduate-level economics courses. The assistant objectively evaluates the complexity of the question without answering it, adhering to the instructions provided.

Rating: Complexity: [[undergraduate]]



 26%|██▌       | 3131/12032 [37:49<21:24:16,  8.66s/it]

Could not extract rating from response:
The assistant's response provides a clear and concise explanation of the knowledge and calculations required to answer the question. The response correctly identifies that the question involves understanding the straight-line method of depreciation, which is typically covered in high school accounting or finance courses. The response also mentions the mathematical operations needed (basic arithmetic and percentages), which are generally within the scope of high school mathematics.

The assistant's evaluation is objective and relevant to the question's context, providing a reasonable assessment of the question's complexity.

Rating: 9/10

Complexity: [[high_school]]



 28%|██▊       | 3352/12032 [1:09:04<24:23:36, 10.12s/it]