In [1]:
# !pip install plotly kaleido datasets nbformat -U -q

In [58]:
import os

import datasets
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import login


load_dotenv(override=True)
login(os.getenv("HF_TOKEN"))

pd.set_option("max_colwidth", None)

OUTPUT_DIR = "../../output"

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [59]:
eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")["validation"]
eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"})
eval_df = pd.DataFrame(eval_ds)


The repository for gaia-benchmark/GAIA contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/gaia-benchmark/GAIA
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.

Using the latest cached version of the module from /Users/aymeric/.cache/huggingface/modules/datasets_modules/datasets/gaia-benchmark--GAIA/ec492fe4320ee795b1aed6bb46229c5f693226b0f1316347501c24b4baeee005 (last modified on Tue May 28 10:04:32 2024) since it couldn't be found locally at gaia-benchmark/GAIA, or remotely on the Hugging Face Hub.


# 1. Load all results

In [284]:
import glob


results = []
for f in glob.glob(f"{OUTPUT_DIR}/validation/*.jsonl"):
    df = pd.read_json(f, lines=True)
    df["agent_name"] = f.split("/")[-1].split(".")[0]
    results.append(df)

result_df = pd.concat(results)
result_df["prediction"] = result_df["prediction"].fillna("No prediction")

In [None]:
import re
from collections import Counter

from scripts.gaia_scorer import check_close_call, question_scorer


result_df["is_correct"] = result_df.apply(lambda x: question_scorer(x["prediction"], x["true_answer"]), axis=1)
result_df["is_near_correct"] = result_df.apply(
    lambda x: check_close_call(x["prediction"], x["true_answer"], x["is_correct"]),
    axis=1,
)

result_df["count_steps"] = result_df["intermediate_steps"].apply(len)


def find_attachment(question):
    matches = eval_df.loc[eval_df["question"].apply(lambda x: x in question), "file_name"]

    if len(matches) == 0:
        return "Not found"
    file_path = matches.values[0]

    if isinstance(file_path, str) and len(file_path) > 0:
        return file_path.split(".")[-1]
    else:
        return "None"


result_df["attachment_type"] = result_df["question"].apply(find_attachment)


def extract_tool_calls(code):
    regex = r"\b(\w+)\("
    function_calls = [el for el in re.findall(regex, code) if el.islower()]

    function_call_counter = Counter(function_calls)
    return function_call_counter


def sum_tool_calls(steps):
    total_count = Counter()
    for step in steps:
        if "llm_output" in step:
            total_count += extract_tool_calls(step["llm_output"])

    return total_count


from datetime import datetime


def get_durations(row):
    # start_datetime = datetime.strptime(row['start_time'], "%Y-%m-%d %H:%M:%S")
    # end_datetime = datetime.strptime(row['end_time'], "%Y-%m-%d %H:%M:%S")

    duration_timedelta = row["end_time"] - row["start_time"]
    return int(duration_timedelta.total_seconds())


result_df["duration"] = result_df.apply(get_durations, axis=1)
# result_df["tool_calls"] = result_df["intermediate_steps"].apply(sum_tool_calls)


Answer lists have different lengths, returning False.



String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to numb

In [286]:
def get_thoughts(x):
    try:
        output = x[0]["task"]
        for y in x[1:]:
            try:
                if "observation" in y:
                    output += y["llm_output"] + "\nObservation:" + y["observation"]
                else:
                    output += y["llm_output"] + r"\Error:" + str(y["error"])
            except Exception:
                pass
        return output
    except Exception:
        return None


result_df["thoughts"] = result_df["intermediate_steps"].apply(lambda x: get_thoughts(x))

In [287]:
result_df["agent_name"].value_counts()

agent_name
07-apr-refacto_system_prompt_noplanningfacts         165
generate-traces-03-apr-noplanning                    165
07-apr-refacto_system_prompt_noplanning              165
07-apr-refacto_system_prompt_noplanning_qwen         164
generate-traces-03-apr                               164
07-apr-refacto_system_prompt                         163
07-apr-refacto_system_prompt_trueplanningfacts       161
07-apr-refacto_system_prompt_trueplanningnofacts     160
07-apr-refacto_system_prompt_trueplanningnewfacts     97
Name: count, dtype: int64

# 2. Inspect specific runs

In [288]:
sel_df = result_df
# sel_df = sel_df.loc[
#     (result_df["agent_name"].isin(list_versions))
# ]
sel_df = sel_df.reset_index(drop=True)
display(sel_df["agent_name"].value_counts())
sel_df = sel_df.drop_duplicates(subset=["agent_name", "question"])
display(sel_df.groupby("agent_name")[["task"]].value_counts())
print("Total length:", len(sel_df), "- is complete:", len(sel_df) == 165)

agent_name
07-apr-refacto_system_prompt_noplanningfacts         165
generate-traces-03-apr-noplanning                    165
07-apr-refacto_system_prompt_noplanning              165
07-apr-refacto_system_prompt_noplanning_qwen         164
generate-traces-03-apr                               164
07-apr-refacto_system_prompt                         163
07-apr-refacto_system_prompt_trueplanningfacts       161
07-apr-refacto_system_prompt_trueplanningnofacts     160
07-apr-refacto_system_prompt_trueplanningnewfacts     97
Name: count, dtype: int64

agent_name                                         task
07-apr-refacto_system_prompt                       2       84
                                                   1       53
                                                   3       26
07-apr-refacto_system_prompt_noplanning            2       86
                                                   1       53
                                                   3       26
07-apr-refacto_system_prompt_noplanning_qwen       2       86
                                                   1       53
                                                   3       25
07-apr-refacto_system_prompt_noplanningfacts       2       86
                                                   1       53
                                                   3       26
07-apr-refacto_system_prompt_trueplanningfacts     2       84
                                                   1       51
                                                   3       26
07-apr-refacto

Total length: 1404 - is complete: False


In [None]:
display("Average score:", sel_df.groupby("agent_name")[["is_correct"]].mean().round(3))
display(
    sel_df.groupby(["agent_name", "task"])[["is_correct", "is_near_correct", "count_steps", "question", "duration"]]
    .agg(
        {
            "is_correct": "mean",
            "is_near_correct": "mean",
            "count_steps": "mean",
            "question": "count",
            "duration": "mean",
        }
    )
    .rename(columns={"question": "count"})
)

'Average score:'

Unnamed: 0_level_0,is_correct
agent_name,Unnamed: 1_level_1
07-apr-refacto_system_prompt,0.356
07-apr-refacto_system_prompt_noplanning,0.358
07-apr-refacto_system_prompt_noplanning_qwen,0.177
07-apr-refacto_system_prompt_noplanningfacts,0.424
07-apr-refacto_system_prompt_trueplanningfacts,0.41
07-apr-refacto_system_prompt_trueplanningnewfacts,0.423
07-apr-refacto_system_prompt_trueplanningnofacts,0.4
generate-traces-03-apr,0.36
generate-traces-03-apr-noplanning,0.394


Unnamed: 0_level_0,Unnamed: 1_level_0,is_correct,is_near_correct,count_steps,count,duration
agent_name,task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
07-apr-refacto_system_prompt,1,0.471698,0.509434,7.150943,53,80.433962
07-apr-refacto_system_prompt,2,0.345238,0.357143,9.892857,84,147.428571
07-apr-refacto_system_prompt,3,0.153846,0.153846,12.807692,26,197.5
07-apr-refacto_system_prompt_noplanning,1,0.528302,0.54717,8.245283,53,108.54717
07-apr-refacto_system_prompt_noplanning,2,0.313953,0.325581,9.244186,86,168.22093
07-apr-refacto_system_prompt_noplanning,3,0.153846,0.153846,11.846154,26,301.423077
07-apr-refacto_system_prompt_noplanning_qwen,1,0.264151,0.264151,4.792453,53,282.528302
07-apr-refacto_system_prompt_noplanning_qwen,2,0.174419,0.174419,3.965116,86,407.686047
07-apr-refacto_system_prompt_noplanning_qwen,3,0.0,0.0,5.04,25,406.56
07-apr-refacto_system_prompt_noplanningfacts,1,0.471698,0.509434,7.981132,53,132.566038


In [290]:
import plotly.express as px


cumulative_df = (
    (
        sel_df.groupby("agent_name")[["is_correct", "is_near_correct"]]
        .expanding(min_periods=1, axis=0, method="single")
        .agg({"is_correct": "mean", "is_near_correct": "count"})
        .reset_index()
    )
    .copy()
    .rename(columns={"is_near_correct": "index"})
)
cumulative_df["index"] = cumulative_df["index"].astype(int) - 1


def find_question(row):
    try:
        res = sel_df.loc[sel_df["agent_name"] == row["agent_name"], "question"].iloc[row["index"]][:50]
        return res
    except Exception:
        return ""


cumulative_df["question"] = cumulative_df.apply(find_question, axis=1)

px.line(
    cumulative_df,
    color="agent_name",
    x="index",
    y="is_correct",
    hover_data="question",
)

# 3. Dive deeper into one run

In [12]:
sel_df = result_df.loc[result_df["agent_name"] == o1]
print(len(sel_df))

165


### Count errors

In [13]:
import numpy as np


error_types = [
    "AgentParsingError",
    "AgentExecutionError",
    "AgentMaxIterationsError",
    "AgentGenerationError",
]
sel_df[error_types] = 0
sel_df["Count steps"] = np.nan


def count_errors(row):
    if isinstance(row["intermediate_steps"], list):
        row["Count steps"] = len(row["intermediate_steps"])
        for step in row["intermediate_steps"]:
            if isinstance(step, dict) and "error" in step:
                try:
                    row[str(step["error"]["error_type"])] += 1
                except Exception:
                    pass
    return row


sel_df = sel_df.apply(count_errors, axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [14]:
import plotly.express as px


aggregate_errors = (
    sel_df.groupby(["is_correct"])[error_types + ["Count steps"]].mean().reset_index().melt(id_vars=["is_correct"])
)

fig = px.bar(
    aggregate_errors,
    y="value",
    x="variable",
    color="is_correct",
    labels={
        "agent_name": "<b>Model</b>",
        "task": "<b>Level</b>",
        "aggregate_score": "<b>Performance</b>",
        "value": "<b>Average count</b>",
        "eval_score_GPT4": "<b>Score</b>",
    },
)
fig.update_layout(
    height=500,
    width=800,
    barmode="group",
    bargroupgap=0.0,
)
fig.update_traces(textposition="outside")
fig.write_image("aggregate_errors.png", scale=3)
fig.show()

### Inspect result by file extension type

In [15]:
display(
    result_df.groupby(["attachment_type"])[["is_correct", "count_steps", "question"]].agg(
        {"is_correct": "mean", "count_steps": "mean", "question": "count"}
    )
)

Unnamed: 0_level_0,is_correct,count_steps,question
attachment_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0.423799,4.959725,2185
csv,0.0,7.75,16
docx,0.571429,4.904762,21
jpg,0.142857,5.75,28
jsonld,0.0,6.6,15
mp3,0.48,4.5,50
pdb,0.0,4.444444,18
pdf,0.588235,4.137255,51
png,0.216783,4.412587,143
pptx,0.882353,4.058824,17


# 4. Ensembling methods

In [16]:
counts = result_df["agent_name"].value_counts()
long_series = result_df.loc[result_df["agent_name"].isin(counts[counts > 140].index)]

In [17]:
def majority_vote(df):
    df = df[(df["prediction"] != "Unable to determine") & (~df["prediction"].isna()) & (df["prediction"] != "None")]

    answer_modes = df.groupby("question")["prediction"].agg(lambda x: x.mode()[0]).reset_index()
    first_occurrences = (
        df.groupby(["question", "prediction"]).agg({"task": "first", "is_correct": "first"}).reset_index()
    )
    result = answer_modes.merge(first_occurrences, on=["question", "prediction"], how="left")

    return result


def oracle(df):
    def get_first_correct_or_first_wrong(group):
        correct_answers = group[group["is_correct"]]
        if len(correct_answers) > 0:
            return correct_answers.iloc[0]
        return group.iloc[0]

    result = df.groupby("question").apply(get_first_correct_or_first_wrong)

    return result.reset_index(drop=True)


display((long_series.groupby("agent_name")["is_correct"].mean() * 100).round(2))
print(f"Majority score: {majority_vote(long_series)['is_correct'].mean() * 100:.2f}")
print(f"Oracle score: {oracle(long_series)['is_correct'].mean() * 100:.2f}")

agent_name
code_gpt4o_03_february_goodoldtext-unbroken         38.36
code_gpt4o_03_february_magenticbrowser              35.22
code_gpt4o_03_february_magenticbrowser2             36.54
code_gpt4o_03_february_text                         37.58
code_o1_01_february_text                            49.09
code_o1_03_february_ablation-toolcalling-manager    32.73
code_o1_03_february_fix-print-outputs               51.83
code_o1_03_february_fix-print-outputs2              55.77
code_o1_03_february_goodoldtext-unbroken            53.42
code_o1_03_february_remove-navigational             53.66
code_o1_03_february_text_high-reasoning-effort      48.48
code_o1_04_february_submission                      49.38
code_o1_04_february_submission5                     55.15
code_o3-mini_03_february_remove-navigational        29.09
Name: is_correct, dtype: float64

Majority score: 58.18
Oracle score: 72.73






### Submit

In [18]:
agent_run = "code_o1_04_february_submission5.jsonl"
df = pd.read_json(f"output/validation/{agent_run}", lines=True)
df = df[["task_id", "prediction", "intermediate_steps"]]
df = df.rename(columns={"prediction": "model_answer", "intermediate_steps": "reasoning_trace"})

In [19]:
df.to_json("submission.jsonl", orient="records", lines=True)