In [1]:
import os

from mistralai import Mistral

# Comma-separated Mistral API keys, e.g. "key_a,key_b"; one client is created per key.
api_keys = os.environ["MISTRAL_API_KEYS"]
model = "mistral-large-2411"

# Strip whitespace around each key and drop empty entries (e.g. from a trailing
# comma) so that no client is created with an unusable key.
clients = [
    Mistral(
        api_key=api_key.strip(),
    )
    for api_key in api_keys.split(",")
    if api_key.strip()
]
if not clients:
    # Later code computes `request_id % len(clients)`, so fail early and clearly.
    raise ValueError("MISTRAL_API_KEYS does not contain any API key")

for client in clients:
    # Check API is alive
    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "user",
                "content": "What is the best French cheese?",
            },
        ],
        max_tokens=10,
    )
    print(chat_response.model, chat_response.choices[0].message.content)

mistral-large-2411 Choosing the "best" French cheese can be
mistral-large-2411 Choosing the "best" French cheese can be
mistral-large-2411 Determining the "best" French cheese can


In [9]:
from time import sleep

from mistralai import SDKError
from openai import RateLimitError

# Default pause between consecutive requests, in seconds.
# The more clients are configured, the shorter the pause.
if len(clients) >= 3:
    SLEEP_DURATION = 0.2
elif len(clients) == 2:
    SLEEP_DURATION = 0.5
else:
    SLEEP_DURATION = 1.2

print("Sleep duration:", SLEEP_DURATION)


def wait(duration=SLEEP_DURATION):
    """Pause execution for ``duration`` seconds.

    The default is the value ``SLEEP_DURATION`` had when this function was
    defined; later changes to ``SLEEP_DURATION`` do not affect it.
    """
    sleep(duration)


# Rate-limit hit counters, keyed by the client's position in `clients`.
api_limit_hits_by_client_ids = {}


def init_api_limits() -> None:
    """Reset the rate-limit hit counter of every configured client to zero."""
    global api_limit_hits_by_client_ids

    api_limit_hits_by_client_ids = dict.fromkeys(range(len(clients)), 0)


# Number of requests started so far; also selects the next client round-robin.
request_id = 0


def _record_api_limit_hit():
    """Count one rate-limit response against the client that caused it, then back off."""
    # query_model advances `request_id` *before* sending the request, so the
    # client that was just rate-limited is the previous one, not the current one.
    client_id = (request_id - 1) % len(clients)
    api_limit_hits_by_client_ids[client_id] += 1

    total_hits = sum(api_limit_hits_by_client_ids.values())
    if (total_hits % 10) == 0:
        print(f"API limit hit {total_hits} times. Details: {api_limit_hits_by_client_ids}")
    wait(2)


def repeat_if_hit_api_limit(f):
    """Decorator that retries ``f`` until it returns instead of raising.

    Behavior per exception raised by ``f``:
    - ``RateLimitError`` or ``SDKError`` with ``status_code == 429``: the hit is
      recorded in ``api_limit_hits_by_client_ids`` and the call is retried
      after a 2 second pause.
    - ``SDKError`` with any other status code: re-raised to the caller.
    - any other ``Exception``: printed, then the call is retried after a
      60 second pause (indefinitely).

    Returns:
        A wrapper with the same call signature as ``f``.
    """
    import functools

    @functools.wraps(f)
    def wrapper(*args, **kw):
        while True:
            try:
                return f(*args, **kw)
            except RateLimitError:
                _record_api_limit_hit()
            except SDKError as e:
                if e.status_code != 429:
                    # Bare raise keeps the original traceback.
                    raise
                _record_api_limit_hit()
            except Exception as e:
                print("repeat_if_hit_api_limit -> unknown error", e)
                wait(60)

    return wrapper


@repeat_if_hit_api_limit
def query_model(messages):
    """Send ``messages`` to the chat model through the next client in round-robin order.

    The global ``request_id`` is advanced before the request is issued, so a
    retry triggered by the decorator goes to the following client.

    Returns:
        The response object returned by ``client.chat.complete``.
    """
    global request_id

    selected_client = clients[request_id % len(clients)]
    request_id += 1
    return selected_client.chat.complete(model=model, messages=messages)

Sleep duration: 0.2


In [10]:
import os
import sys

# Add the parent of the current working directory to the import path so that
# the `utils` package imported below can be resolved from there.
dir2 = os.path.abspath("")  # absolute path of the current working directory
dir1 = os.path.dirname(dir2)  # its parent directory
if dir1 not in sys.path:
    sys.path.append(dir1)

import importlib

import utils.prompt as prompt

# Required to purge the module cache and use the latest version after an update
importlib.reload(prompt)

<module 'utils.prompt' from '/Users/aigoncharov/dev/sktech/phi-4/utils/prompt.py'>

In [None]:
import csv
import os.path
import re

import pandas as pd
from tqdm import tqdm

import utils.prompt as prompt

# difficulty = ["middle_school", "high_school", "undergraduate", "postgraduate", "phd"]
# Scores the judge is allowed to give: the integers 1 through 10.
ratings = list(range(1, 11))

# Counters of unusable model answers; estimate_dataset resets them on each run.
invalid_complexities = invalid_ratings = 0

# Names of the result columns written into the dataset.
FIELD_NUM_COMPLEXITY, FIELD_NUM_RATING = "masj_num_complexity", "masj_num_rating"
FIELD_NOMINAL_COMPLEXITY, FIELD_NOMINAL_RATING = "masj_nominal_complexity", "masj_nominal_rating"


def model_as_judge(df, index, result_field_name, system_prompt, user_prompt, answer):
    """Ask the model to rate ``answer`` and store the rating in ``df``.

    The judge is shown the original instructions (``system_prompt``), the
    question (``user_prompt``) and the assistant's ``answer``, and is asked to
    finish with a rating formatted as ``[[N]]``.

    Args:
        df: Table that receives the result via ``df.at[index, result_field_name]``.
        index: Row label of the entry being judged.
        result_field_name: Column that receives the integer rating.
        system_prompt: Instructions the judged assistant was given.
        user_prompt: Question the judged assistant was asked.
        answer: The judged assistant's response.

    Side effects:
        Prints the judge's full response. When no rating can be extracted, or
        the rating is not one of ``ratings``, ``df`` is left unchanged and the
        global ``invalid_ratings`` counter is incremented.
    """
    global invalid_ratings

    chat_response = query_model(
        [
            {
                "role": "system",
                "content": 'Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user request displayed below. Your evaluation should consider factors such as the following all the settings in the system prompt, correspondences to the context of the user, the helpfulness, relevance and accuracy. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:"Rating: [[6]]".',
            },
            {
                "role": "user",
                "content": f"""
                [Instructions for Assistant]
                {system_prompt}
                [End of Instructions for Assistant]

                [Question]
                {user_prompt}
                [End of Question]

                [The Start of Assistant’s Answer]
                {answer}
                [The End of Assistant’s Answer]

                You must rate the assistant's response on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:"Rating: [[6]]"
                """,
            },
        ]
    )
    response = chat_response.choices[0].message.content
    print(response)

    # Handle the "no usable rating" cases explicitly instead of with a bare
    # `except:`, which would also swallow KeyboardInterrupt and unrelated errors.
    rating_int = None
    match = re.search(r"\[\[(\d+?)\]\]", response) if isinstance(response, str) else None
    if match is not None:
        try:
            rating_int = int(match.group(1))
        except ValueError:
            rating_int = None

    if rating_int is None:
        print(f"Could not extract rating from response:\n{response}\n")
        invalid_ratings += 1
    elif rating_int in ratings:
        df.at[index, result_field_name] = rating_int
    else:
        # A rating was found but lies outside the allowed scale.
        invalid_ratings += 1


def estimate_complextiy_with_model(system_prompt, user_prompt):
    """Ask the model to estimate the complexity of a question.

    Args:
        system_prompt: Instructions describing how complexity must be reported.
        user_prompt: The question text; it is wrapped in
            ``[Question Start]`` / ``[Question End]`` markers.

    Returns:
        The raw text of the model's reply. Extracting and validating the
        ``[[...]]`` value is done by the ``validate_*_complexity`` functions.
    """
    chat_response = query_model(
        [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": f"""
                [Question Start]
                {user_prompt}
                [Question End]
                """,
            },
        ]
    )
    # The code that used to follow this return was unreachable and is removed.
    return chat_response.choices[0].message.content


def validate_numerical_complexity(df, index, response):
    """Extract a numerical complexity from ``response`` and store it in ``df``.

    The value is read from the first ``[[...]]`` group of ``response`` and must
    be a number between 0.0 and 1.0 inclusive.

    Args:
        df: Table that receives the value via ``df.at[index, FIELD_NUM_COMPLEXITY]``.
        index: Row label of the entry being validated.
        response: Raw model reply.

    Side effects:
        When no valid value can be extracted, ``df`` is left unchanged, a
        message is printed and the global ``invalid_complexities`` counter is
        incremented.
    """
    global invalid_complexities

    complexity = None
    match = re.search(r"\[\[(.+?)\]\]", response) if isinstance(response, str) else None
    if match is not None:
        try:
            complexity = float(match.group(1))
        except ValueError:
            complexity = None

    # Explicit range check instead of `assert` (asserts are stripped under -O);
    # the chained comparison also rejects NaN.
    if complexity is not None and 0.0 <= complexity <= 1.0:
        df.at[index, FIELD_NUM_COMPLEXITY] = complexity
        return

    print(f"Could not extract numerical complexity response:\n{response}\n")
    invalid_complexities += 1


def validate_nominal_complexity(df, index, valid_complexities, response):
    """Extract a nominal complexity label from ``response`` and store it in ``df``.

    The label is read from the first ``[[...]]`` group of ``response`` and must
    be a member of ``valid_complexities``.

    Args:
        df: Table that receives the label via ``df.at[index, FIELD_NOMINAL_COMPLEXITY]``.
        index: Row label of the entry being validated.
        valid_complexities: Collection of accepted labels.
        response: Raw model reply.

    Side effects:
        When no valid label can be extracted, ``df`` is left unchanged, a
        message is printed and the global ``invalid_complexities`` counter is
        incremented.
    """
    global invalid_complexities

    match = re.search(r"\[\[(.+?)\]\]", response) if isinstance(response, str) else None
    # Explicit membership check instead of `assert` inside a bare `except:`,
    # which would also swallow KeyboardInterrupt and unrelated errors.
    if match is not None and match.group(1) in valid_complexities:
        df.at[index, FIELD_NOMINAL_COMPLEXITY] = match.group(1)
        return

    print(f"Could not extract nominal complexity response:\n{response}\n")
    invalid_complexities += 1


# Write intermediate results to disk after this many processed rows.
DUMP_EVERY = 50


def _read_tsv(path, raw_quoting):
    """Read a tab-separated file with a header row.

    With ``raw_quoting`` the file is parsed without quote handling and with
    backslash escapes, matching the format ``_write_tsv`` produces.
    """
    if raw_quoting:
        return pd.read_csv(
            path,
            sep="\t",
            header=0,
            quoting=csv.QUOTE_NONE,
            quotechar="",
            escapechar="\\",
        )
    return pd.read_csv(path, sep="\t", header=0)


def _write_tsv(df, path):
    """Write ``df`` as a tab-separated file without quoting, using backslash escapes."""
    df.to_csv(path, sep="\t", quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\", index=False)


def _row_is_complete(df, index):
    """Tell whether all four result fields of row ``index`` already hold valid values."""
    num_complexity = df.at[index, FIELD_NUM_COMPLEXITY]
    return (
        isinstance(num_complexity, float)
        and num_complexity >= 0.0
        and num_complexity <= 1.0
        and df.at[index, FIELD_NUM_RATING] in ratings
        and df.at[index, FIELD_NOMINAL_COMPLEXITY] in prompt.valid_nominal_complexities
        and df.at[index, FIELD_NOMINAL_RATING] in ratings
    )


def estimate_dataset(in_filename, out_filename, get_question_from_row, get_options_from_row, original_separators=True):
    """Estimate numerical and nominal complexity for every row and have each estimate judged.

    If ``out_filename`` already exists it is loaded instead of ``in_filename``
    and rows whose four result fields are already valid are skipped, so an
    interrupted run can be resumed by calling the function again.

    Args:
        in_filename: Tab-separated input dataset with a header row.
        out_filename: Tab-separated file that receives the dataset plus the
            four result columns.
        get_question_from_row: Callable taking a row (as yielded by
            ``DataFrame.iterrows``) and returning the question passed to
            ``prompt.get_user_prompt``.
        get_options_from_row: Callable taking a row and returning the options
            passed to ``prompt.get_user_prompt``.
        original_separators: When true, ``in_filename`` is read with pandas'
            default quoting; otherwise without quote handling and with
            backslash escapes.

    Returns:
        The DataFrame with the result columns filled in.

    Side effects:
        Resets the global ``invalid_complexities`` / ``invalid_ratings``
        counters and the API-limit counters. Writes ``out_filename`` every
        ``DUMP_EVERY`` processed rows and once more when the loop ends,
        including when it ends with an exception or a keyboard interrupt.
    """
    global invalid_complexities, invalid_ratings

    if os.path.isfile(out_filename):
        df = _read_tsv(out_filename, raw_quoting=True)
    else:
        df = _read_tsv(in_filename, raw_quoting=not original_separators)

    invalid_complexities = 0
    invalid_ratings = 0
    init_api_limits()

    # Placeholder values are deliberately invalid, so untouched rows get processed.
    for field, placeholder in (
        (FIELD_NUM_COMPLEXITY, -1.0),
        (FIELD_NUM_RATING, 0),
        (FIELD_NOMINAL_COMPLEXITY, ""),
        (FIELD_NOMINAL_RATING, 0),
    ):
        if field not in df.columns:
            df[field] = placeholder

    meaningful_iteration = 0
    try:
        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            if _row_is_complete(df, index):
                continue

            meaningful_iteration += 1

            complexity_user_prompt = prompt.get_user_prompt(get_question_from_row(row), get_options_from_row(row))

            response_num_complexity = estimate_complextiy_with_model(
                prompt.estimate_numerical_complexity_system_prompt, complexity_user_prompt
            )
            validate_numerical_complexity(df, index, response_num_complexity)
            wait()

            model_as_judge(
                df,
                index,
                FIELD_NUM_RATING,
                prompt.estimate_numerical_complexity_system_prompt,
                complexity_user_prompt,
                response_num_complexity,
            )
            wait()

            response_nominal_complexity = estimate_complextiy_with_model(
                prompt.estimate_nominal_complexity_system_prompt, complexity_user_prompt
            )
            validate_nominal_complexity(df, index, prompt.valid_nominal_complexities, response_nominal_complexity)
            wait()

            model_as_judge(
                df,
                index,
                FIELD_NOMINAL_RATING,
                prompt.estimate_nominal_complexity_system_prompt,
                complexity_user_prompt,
                response_nominal_complexity,
            )
            wait()

            if meaningful_iteration % DUMP_EVERY == 0:
                _write_tsv(df, out_filename)
                total_hits = sum(api_limit_hits_by_client_ids.values())
                print(f"Over {meaningful_iteration} iterations we hit {total_hits} API limits")
    finally:
        # Always persist what has been computed so far: without this, an
        # interruption loses every row processed since the last periodic dump.
        _write_tsv(df, out_filename)

    print(
        f"Processed dataset {out_filename}. Total entries: {df.shape[0]}. Invalid complexities: {invalid_complexities}. Invalid ratings: {invalid_ratings}"
    )
    return df

In [23]:
# # MMLU
# import ast

# estimate_dataset(
#     in_filename="../data/mmlu_pro_stem_w_maj_w_entropyphi4.tsv",
#     out_filename="../data/mmlu_pro_stem_w_numerical_maj_w_entropyphi4.tsv",
#     get_question_from_row=lambda row: row["question"],
#     get_options_from_row=lambda row: ast.literal_eval(row["options"]),
# )

In [24]:
# # ARC CH


# def get_options_arc(row):
#     try:
#         options_len = int(row["leng"])
#         options_str = row["text"]
#         options_str_without_newline = options_str.replace("\n", "")
#         options_str_without_brackets = options_str_without_newline[1:-1]
#         options_split = options_str_without_brackets.split("' '")
#         # Remove leading and trailing quotes from first and last options
#         options_split[0] = options_split[0][1:]
#         options_split[-1] = options_split[-1][:-1]
#         # print(options_split, options_len)
#         assert len(options_split) == options_len
#         for option in options_split:
#             assert len(option) > 0
#         return options_split
#     except AssertionError as e:
#         print(f"get_options_arc: {row['id']} -> AssertionError: {e}")
#         raise e


# estimate_dataset(
#     in_filename="../data/arc_ch_train_w_maj_complexity.tsv",
#     out_filename="../data/arc_ch_train_w_numerical_maj_complexity.tsv",
#     get_question_from_row=lambda row: row["question"],
#     get_options_from_row=get_options_arc,
# )
# estimate_dataset(
#     in_filename="../data/arc_ch_test_w_maj_complexity.tsv",
#     out_filename="../data/arc_ch_test_w_numerical_maj_complexity.tsv",
#     get_question_from_row=lambda row: row["question"],
#     get_options_from_row=get_options_arc,
# )
# estimate_dataset(
#     in_filename="../data/arc_ch_validation_w_maj_complexity.tsv",
#     out_filename="../data/arc_ch_validation_w_numerical_maj_complexity.tsv",
#     get_question_from_row=lambda row: row["question"],
#     get_options_from_row=get_options_arc,
# )

In [25]:
# # SCIQ
# from random import shuffle


# def get_options_sciq(row):
#     try:
#         options_len = 4
#         correct_option = row["correct"]
#         other_options = [row["incorrect1"], row["incorrect2"], row["incorrect3"]]
#         all_options = [correct_option] + other_options
#         shuffle(all_options)

#         assert len(all_options) == options_len
#         for option in all_options:
#             assert len(option) > 0
#         return all_options
#     except AssertionError as e:
#         print(f"get_options_sciq: {row['id']} -> AssertionError: {e}")
#         raise e


# estimate_dataset(
#     in_filename="../data/sciq_train.tsv",
#     out_filename="../data/sciq_train_w_maj_complexity.tsv",
#     get_question_from_row=lambda row: row["question"],
#     get_options_from_row=get_options_sciq,
# )
# estimate_dataset(
#     in_filename="../data/sciq_test.tsv",
#     out_filename="../data/sciq_test_w_maj_complexity.tsv",
#     get_question_from_row=lambda row: row["question"],
#     get_options_from_row=get_options_sciq,
# )
# estimate_dataset(
#     in_filename="../data/sciq_validation.tsv",
#     out_filename="../data/sciq_validation_w_maj_complexity.tsv",
#     get_question_from_row=lambda row: row["question"],
#     get_options_from_row=get_options_sciq,
# )


In [None]:
# GPQA
from random import shuffle


def get_options_gpqa(row):
    """Return the four answer options of a GPQA row in random order.

    Args:
        row: Mapping-like row providing ``correct``, ``incorrect1``,
            ``incorrect2`` and ``incorrect3``; ``id`` is used only for the
            error message.

    Returns:
        A list with the four options, shuffled with ``random.shuffle``.

    Raises:
        AssertionError: If any option is empty. The row id is printed first.
    """
    try:
        all_options = [row["correct"], row["incorrect1"], row["incorrect2"], row["incorrect3"]]
        shuffle(all_options)

        # Raise explicitly rather than via `assert`, which is stripped under -O.
        for option in all_options:
            if len(option) == 0:
                raise AssertionError(f"empty option among {all_options}")
        return all_options
    except AssertionError as e:
        # `.get` keeps a missing id from masking the AssertionError with a KeyError.
        print(f"get_options_gpqa: {row.get('id')} -> AssertionError: {e}")
        raise


# Run the complexity estimation and judging over the GPQA train split.
# Results are written to `out_filename`; if that file already exists,
# estimate_dataset loads it and skips rows that are already complete.
estimate_dataset(
    in_filename="../data/gpqa_train.tsv",
    out_filename="../data/gpqa_w_both_maj_complexities_v3.tsv",
    get_question_from_row=lambda row: row["question"],
    get_options_from_row=get_options_gpqa,
)

  0%|          | 0/448 [00:00<?, ?it/s]

0.55
The assistant's response is a single numerical value, "[[0.55]]", which is meant to represent the complexity of the given multiple-choice question. The scale provided in the instructions ranges from 0 to 1, with specific intervals corresponding to different educational levels. The response "[[0.55]]" falls within the "graduate" level range (0.5-0.75), indicating that the question is deemed complex and suitable for graduate-level understanding.

The response is concise and directly addresses the task of evaluating the question's complexity. It does not provide any additional information or answer the question itself, which aligns with the instructions given. The rating is relevant to the context and seems accurate based on the detailed and specialized nature of the question.

Taking into account the helpfulness, relevance, and accuracy of the response in the context of the instructions provided:

Rating: [[9]]
graduate
The assistant's response is concise and directly addresses the 

  0%|          | 1/448 [00:15<1:56:06, 15.58s/it]

0.45
The assistant's response is a single number, 0.45, which is meant to represent the complexity of the given question. The question involves understanding quantum states, their energies, and lifetimes, which are typically topics covered in undergraduate physics courses. The response of 0.45 falls within the undergraduate range (0.25-0.5) as per the given scale, which seems appropriate for the context of the question.

However, the response does not provide any explanation or context for how this complexity was determined, which makes it less helpful for understanding the evaluation process. Additionally, the response does not address the actual content of the question or the options provided, which limits its relevance to the user's request.

Considering these factors, the helpfulness, relevance, and accuracy of the response are somewhat limited.

Rating: [[4]]
graduate
The assistant's response is concise and directly addresses the task of evaluating the complexity of the question. 

  0%|          | 2/448 [00:27<1:41:13, 13.62s/it]

0.45
The assistant's response is a single numerical value, "0.45," which is meant to represent the complexity of the given multiple-choice question. The response is concise and directly addresses the task of evaluating the question's complexity. However, it lacks any explanation or justification for why this specific complexity rating was chosen, which makes it difficult to assess the accuracy or relevance of the rating. The helpfulness of the response is limited because it does not provide any context or reasoning that could aid the user in understanding the evaluation process.

Given these factors, the response is relevant to the task but lacks depth and justification.

Rating: [[4]]
graduate
The assistant's response was concise and directly addressed the task of evaluating the complexity of the question. The question involves organic chemistry reactions, including a Grignard reaction and an oxidation, which are typically covered in undergraduate organic chemistry courses. However, t

  1%|          | 3/448 [00:38<1:30:43, 12.23s/it]

0.6
The assistant's response is a single number, "0.6," which is meant to represent the complexity of the question. The response is concise and directly addresses the task of evaluating the complexity of the question. However, it lacks any explanation or justification for why this complexity rating was chosen, which makes it difficult to assess the accuracy or relevance of the rating.

Given the instructions and the context, the response is somewhat helpful but could be significantly improved with additional context or explanation. The lack of explanation makes it hard to judge the accuracy of the complexity rating.

Rating: [[4]]
graduate
The assistant's response is concise and directly addresses the task of evaluating the complexity of the question. The question involves determining the optical activity of several chemical compounds, which requires a good understanding of organic chemistry and stereochemistry. This level of knowledge is typically covered in graduate-level courses, ma

  1%|          | 4/448 [00:55<1:44:36, 14.14s/it]

0.65
The assistant's response is a single numerical value, "0.65," which is supposed to represent the complexity of the question. The task required the assistant to evaluate the complexity of the question and provide a rating based on a specific scale. The response is concise and directly addresses the task, but it lacks any explanation or context, which makes it difficult to assess the reasoning behind the rating. However, the response does provide a rating that falls within the specified range, which is helpful in understanding the perceived complexity of the question.

Rating: [[6]]
graduate
The assistant's response is concise and directly addresses the task of evaluating the complexity of the question. The response provided a complexity rating of "graduate," which aligns with the advanced nature of the question. The question involves concepts from surface science and materials engineering, specifically dealing with contact angles and the Cassie-Baxter state, which are typically cov

  1%|          | 5/448 [01:08<1:42:39, 13.90s/it]

0.65
The assistant was asked to evaluate the complexity of a multiple-choice question related to the area of a pseudosphere given a specific metric. The assistant provided a complexity rating of 0.65, which falls within the "graduate" level range (0.5-0.75).

To evaluate the quality of the assistant's response:

1. **Correspondence to the Context**: The assistant correctly understood the task and provided a complexity rating, which is what was asked for.
2. **Helpfulness**: The response is helpful as it gives a clear complexity rating that aligns with the scale provided in the instructions.
3. **Relevance**: The response is relevant to the request, as it directly addresses the complexity of the question.
4. **Accuracy**: The complexity rating of 0.65 seems reasonable given the advanced nature of the question, which involves understanding pseudospheres and metrics, typically covered in graduate-level courses.

Given these factors, the response is objective, relevant, and accurate.

Rati

  1%|▏         | 6/448 [01:20<1:37:31, 13.24s/it]

0.6
The assistant's response is concise and directly addresses the task of evaluating the complexity of the question. The response provides a numerical complexity rating of 0.6, which falls within the "graduate" level category according to the given scale. This rating seems reasonable given the context of the question, which involves understanding organic chemistry reactions and interpreting NMR spectra, topics typically covered in graduate-level chemistry courses.

However, the assistant did not provide any explanation for the rating, which makes it difficult to assess the thoroughness and accuracy of the evaluation. Additionally, the instructions specified that the assistant should not answer the question, and the assistant correctly adhered to this guideline.

Considering the helpfulness, relevance, and accuracy within the constraints given, the response is partially effective but lacks the explanatory component that would make it more comprehensive.

Rating: [[5]]
graduate
The assi

  2%|▏         | 7/448 [01:33<1:38:19, 13.38s/it]


KeyboardInterrupt: 