# Full human friendly log - including model response and ground truth side-by-side
- Restart the kernel, then run only the first cell to see the logs.
- Choose your model in your first cell 
E.g. `model = "gpt-4o-mini-2024-07-18-FC"  # TODO: Input here`
- Choose your category
E.g. `category = "base"  # TODO: Input here`

Then open the text output of that cell.

Ignore the rest of the cells. 

This will produce a human-friendly log for that model over the entire category (i.e., 200 entries).

Dependency: the `score` and `score_full` directories provided by @HuanzhiMao. If you want an updated version of these, ask @HuanzhiMao.

In [22]:
import os
import sys

from collections import defaultdict
from copy import deepcopy
import textwrap
from typing import List, Dict, Any
from collections import Counter
import json

# Directory names used to build the "../<dir>/<model>/..." input paths below.
score_full = "score_full"  # without error logs
score = "score"  # with error logs, and actual scores

# Select the model to inspect; keep exactly one assignment active.
# model = "gpt-4o-mini-2024-07-18-FC"  # TODO: Input here
model = "gpt-4o-2024-08-06-FC"  # TODO: Input here

available_category_list = ["base", "miss_func",
                           "miss_param", "composite", "long_context"]

# category = "base"  # TODO: Input here
category = "miss_func"  # TODO: Input here

# Validate the selection before it is used to pick any input file.
assert category in available_category_list, f"category {category} not in {available_category_list}"

# Maps entry id -> that entry's "missed_function" record. It is always defined
# and stays empty for every category other than "miss_func".
miss_func_info = {}
if category == "miss_func":
    with open("../data/BFCL_v3_multi_turn_miss_func.json", encoding="utf-8") as f:
        # One JSON document per line; blank lines are skipped.
        miss_func_info = {
            item["id"]: item["missed_function"]
            for item in (json.loads(line) for line in f if line.strip())
        }

INPUT_FILE = f"../{score_full}/{model}/BFCL_v3_multi_turn_{category}_score.json"
INPUT_ERROR_FILE = f"../{score}/{model}/BFCL_v3_multi_turn_{category}_score.json"

print(INPUT_FILE)
print(INPUT_ERROR_FILE)

# with open(INPUT_FILE, "r") as f:
#     data = [json.loads(line) for line in f.readlines()[1:]]

with open(INPUT_ERROR_FILE, "r", encoding="utf-8") as f:
    # The first line is skipped on purpose -- presumably a summary/header
    # record rather than a log entry; TODO confirm against the score files.
    data = [json.loads(line) for line in f.readlines()[1:] if line.strip()]


# Append the parent of the current working directory to the import path.
notebook_dir = os.getcwd()
parent_dir = os.path.dirname(notebook_dir)
sys.path.append(parent_dir)


# Phrases searched for (case-insensitively, in this order) in a model's
# textual response; the first one found triggers a MODEL_FAILED_KEYWORD warning.
MODEL_FAILED_KEYWORDS = [
    "issue",
    "couldn't", "could not",
    "can't", "cannot",
    "does not", "doesn't",
    "sorry", "apologize",
]


def process_json_log(data: List[Dict[str, Any]]) -> None:
    """Print the full report for a list of log entries.

    Each entry gets its formatted side-by-side view followed by its
    function-call statistics; the failure-mode summary over all entries is
    printed once at the end.

    Args:
        data: The log entries to report on.
    """
    per_entry_printers = (print_formatted_info, print_model_result_stats)
    for entry in data:
        for printer in per_entry_printers:
            printer(entry)

    print_failure_mode_analysis(data)


def print_dict(d: Dict[str, Any], indent: int = 0) -> None:
    """Print a (possibly nested) dict as an indented outline.

    Every key is printed quoted and followed by a colon. A dict value is
    printed recursively four columns deeper; any other value is printed on its
    own line as ``- value``, also four columns deeper.

    Args:
        d: The mapping to print.
        indent: Number of leading spaces for the keys of ``d``.
    """
    key_pad = " " * indent
    leaf_pad = " " * (indent + 4)
    for name in d:
        entry = d[name]
        print(f'{key_pad}"{name}":')
        if not isinstance(entry, dict):
            print(f"{leaf_pad}- {entry}")
            continue
        print_dict(entry, indent + 4)


# API class names that get_main_api() recognises as "main" APIs.
main_apis = [
    "GorillaFileSystem",
    "VehicleControlAPI",
    "TradingBot",
    "TravelAPI",
]


def get_main_api(involved_apis: List[str]) -> str:
    """Return the first entry of ``involved_apis`` that is listed in ``main_apis``.

    Args:
        involved_apis: API class names, checked in the given order.

    Returns:
        The first matching name, or ``"N/A"`` when none of them is a main API.
    """
    return next((name for name in involved_apis if name in main_apis), "N/A")


def _print_wrapped_pair(call_number, left_lines, right_lines, column_width):
    """Print two pre-wrapped line lists as one numbered, two-column row.

    The call number is shown only on the first physical line; the shorter
    side is padded with empty strings.
    """
    for k in range(max(len(left_lines), len(right_lines))):
        left = left_lines[k] if k < len(left_lines) else ""
        right = right_lines[k] if k < len(right_lines) else ""
        call_count = str(call_number) if k == 0 else ""
        print(f"{call_count:5} | {left.ljust(column_width)}| {right}")


def print_formatted_info(log_entry: Dict[str, Any]) -> None:
    """Print a human-readable, side-by-side report for one log entry.

    The report consists of a header (id, error details and type when an error
    is recorded, model name, main API class, test category), the raw initial
    config, and then one section per question turn showing:

    * the question text (``N/A`` for an empty turn),
    * the model's items next to the ground-truth answers; every item of a
      model turn except the last is shown as a call,
    * the last item of the model turn as the textual response when it is a
      string, with a warning if it contains one of ``MODEL_FAILED_KEYWORDS``,
    * the execution responses of the model next to the ground truth, with
      warnings when either side contains an error.

    Reads the module-level names ``category``, ``MODEL_FAILED_KEYWORDS`` and,
    when ``category`` contains ``"miss_func"``, ``miss_func_info``.

    Args:
        log_entry: One decoded log record. If it has no ``"model_result_raw"``
            key, a warning is printed and nothing else is reported.
    """
    log_idx = log_entry.get('id', 'N/A')
    try:
        model_results = log_entry["model_result_raw"]
    except KeyError as e:
        print(f"❗️🔟WARNING:MODEL_RESULT:UNEXPECTED_FORMAT {e}, check the model result for correctness")
        return
    possible_answers = log_entry["possible_answer"]
    prompt = log_entry["prompt"]
    questions = prompt["question"]
    initial_config = prompt["initial_config"]
    involved_apis = prompt["involved_classes"]
    execution_responses = log_entry["execution_result"]
    error = log_entry.get("error", "N/A")
    error_type = log_entry.get("error_type", "N/A")
    print(f"🆔: {log_idx}")
    if error != "N/A":
        # Only a dict-shaped error can carry a "details" entry; a str or list
        # error would make the subscript below fail.
        if isinstance(error, dict) and "details" in error:
            print(f"❗️❗️❗️Error Details: {json.dumps(error['details'])}")
        print(f"❗️❗️❗️Error Type: {error_type}")
    print(f"Model Name: {log_entry.get('model_name', 'N/A')}")
    print(f"Main API Classes: {get_main_api(involved_apis)}")
    print(f"Test Category: {log_entry.get('test_category', 'N/A')}")
    print("# 🥩Raw Initial Config (Generated)")
    try:
        for key, value in initial_config.items():
            print(f"API: {key}")
            print("    Initial Config:")
            print_dict(value, indent=8)
    except Exception as e:
        # Best effort: a malformed config is reported, not fatal.
        print(f"❗️7️⃣WARNING:INITIAL_CONFIG:UNEXPECTED_FORMAT {e}, check the initial config for correctness")

    # Resolve the miss_func annotation once. The first key of the record is
    # read as a turn number; entries without a record are simply not annotated.
    missed_function = None
    missed_turn = None
    if "miss_func" in category:
        missed_function = miss_func_info.get(log_idx)
        if missed_function:
            missed_turn = int(next(iter(missed_function)))

    print("\n# Model Results and Possible Answers")

    column_width = 80
    # Width covers both columns plus the call-number gutter.
    rule = "-" * (column_width * 2 + 13)
    max_turns = len(questions)
    print(f"Max Turns: {max_turns}")
    for i, question in enumerate(questions):
        turn_no = i + 1
        print(f"\nTurn {turn_no}:")
        if missed_turn is not None:
            if i == missed_turn - 1:
                print(f"🌟 GT missed_function: {missed_function}")
            elif i == missed_turn:
                print(f"🫵 GT missed_function: {missed_function}")
        print(rule)
        if question:
            print(f"Question: {question[0]['content']}")
        else:
            print("Question: N/A")
        print(rule)
        print("Calls | Model Response".ljust(column_width + 8) +
              "| Possible Answer (Human Labeled Ground Truth)")
        print(rule)

        # Only print the model function calls and possible answers if the turn is not empty
        if i < len(model_results):
            model_turn = model_results[i]
            possible_turn = possible_answers[i] if i < len(possible_answers) else []

            # Everything except the last item is shown as a call; the last
            # item is handled separately as the textual response.
            model_calls = model_turn[:-1]

            for j in range(max(len(model_calls), len(possible_turn))):
                model_lines = []
                possible_lines = []

                if j < len(model_calls):
                    item = model_calls[j]
                    if isinstance(item, dict):
                        # Show every name/argument pair of the dict, not just the last one.
                        func_call = ", ".join(
                            f"{func}({args})" for func, args in item.items())
                        model_lines = textwrap.wrap(
                            func_call, width=column_width - 1)
                    else:
                        model_lines = textwrap.wrap(
                            str(item), width=column_width - 1)

                if j < len(possible_turn):
                    possible_lines = textwrap.wrap(
                        possible_turn[j], width=column_width - 1)

                _print_wrapped_pair(j + 1, model_lines, possible_lines, column_width)

            # Print the full textual response after the side-by-side comparison.
            # An empty turn has no last item to inspect.
            model_textual_response = model_turn[-1] if model_turn else None
            if isinstance(model_textual_response, str):
                print("\nModel's textual response:")
                for line in textwrap.wrap(model_textual_response, width=column_width * 2):
                    print(line)

                lowered_response = model_textual_response.lower()
                for keyword in MODEL_FAILED_KEYWORDS:
                    if keyword.lower() in lowered_response:
                        print(f"❗️1️⃣WARNING:MODEL_RESPONSE:MODEL_FAILED_KEYWORD '{keyword}' found in response for index {log_idx} turn {turn_no} model text response")
                        break
            # Print warning if there is no model response
            if len(model_turn) <= 1:
                print(f"❗️2️⃣WARNING:MODEL_RESPONSE:NO_MODEL_RESPONSE No model response for index {log_idx} turn {turn_no}")

        else:
            print("DEBUGGING", i, len(model_results), len(possible_answers))
        print("\nExecution Responses:")
        print(rule)
        print("Call | Model Generated".ljust(
            column_width + 8) + "| Human Ground Truth")
        print(rule)

        if i < len(execution_responses):
            model_responses = execution_responses[i].get('model', [])
            ground_truth_responses = execution_responses[i].get('ground_truth', [])

            max_responses = max(len(model_responses),
                                len(ground_truth_responses))

            for j in range(max_responses):
                model_response = model_responses[j] if j < len(
                    model_responses) else ""
                ground_truth_response = ground_truth_responses[j] if j < len(
                    ground_truth_responses) else ""

                _print_wrapped_pair(
                    j + 1,
                    textwrap.wrap(str(model_response), width=column_width - 1),
                    textwrap.wrap(str(ground_truth_response), width=column_width - 1),
                    column_width,
                )

                if j < max_responses - 1:
                    # Separator between calls
                    print(rule)
            if is_response_with_error(model_responses):
                print(f"❗️3️⃣WARNING:EXECUTION_RESPONSE:ERROR_IN_EXECUTION_RESPONSE Error in execution response for index {log_idx} turn {turn_no}")

            if is_response_with_error(ground_truth_responses):
                print(f"❗️4️⃣WARNING:EXECUTION_RESPONSE:ERROR_IN_GROUND_TRUTH_RESPONSE Error in ground truth response for index {log_idx} turn {turn_no}")

        print("\n" + "=" * (column_width * 2 + 13))
        # print("Execution States:")
        # print("-" * (column_width * 2 + 13))
        # print("Model Generated".ljust(column_width) + "| Human Ground Truth")
        # print("-" * (column_width * 2 + 13))

        # if i < len(execution_states):
        #     model_state = execution_states[i][0].get('model instance', {})
        #     ground_truth_state = execution_states[i][0].get(
        #         'ground truth instance', {})

        #     model_lines = textwrap.wrap(str(model_state), width=column_width-1)
        #     ground_truth_lines = textwrap.wrap(
        #         str(ground_truth_state), width=column_width-1)

        #     max_lines = max(len(model_lines), len(ground_truth_lines))

        #     for j in range(max_lines):
        #         model_line = model_lines[j] if j < len(model_lines) else ""
        #         ground_truth_line = ground_truth_lines[j] if j < len(
        #             ground_truth_lines) else ""
        #         print(f"{model_line.ljust(column_width)}| {ground_truth_line}")

        # print("\n" + "=" * (column_width * 2 + 13))  # Separator between turns


def is_response_with_error(responses):
    """Tell whether any response in ``responses`` looks like an error.

    A string counts as an error when it contains "error" in any letter case;
    a dict counts when it has an ``"error"`` key. Values of any other type
    are ignored.

    Returns:
        True if at least one response matches, otherwise False.
    """
    def _looks_like_error(candidate):
        if isinstance(candidate, str):
            return "error" in candidate.lower()
        if isinstance(candidate, dict):
            return "error" in candidate
        return False

    return any(_looks_like_error(candidate) for candidate in responses)


def print_model_result_stats(log_entry: Dict[str, Any]) -> None:
    """Print per-turn function-call statistics for one log entry.

    For the model result and for the possible answers this prints the number
    of calls per turn and their average, maximum and minimum. A warning is
    printed when the number of question turns differs from the number of
    possible-answer turns.

    Args:
        log_entry: One decoded log record. If reading ``"model_result_raw"``
            fails, a warning is printed and nothing else is reported.
    """
    try:
        model_results = log_entry["model_result_raw"]
    except Exception as e:
        print(f"❗️🔟WARNING:MODEL_RESULT:UNEXPECTED_FORMAT {e}, check the model result for correctness")
        return
    possible_answers = log_entry["possible_answer"]
    questions = log_entry["prompt"]["question"]

    def count_function_calls(turn):
        # A dict item is one call; a list item contributes one call per dict
        # it contains; anything else contributes nothing.
        total = 0
        for step in turn:
            if isinstance(step, dict):
                total += 1
            elif isinstance(step, list):
                total += len([call for call in step if isinstance(call, dict)])
        return total

    model_turn_lengths = list(map(count_function_calls, model_results))
    possible_answer_turn_lengths = list(map(len, possible_answers))

    print("Model Result Statistics:")
    print(f"Total turns: {len(model_results)}")
    print(f"Model function calls per turn: {model_turn_lengths}")
    print(f"Possible answer function calls per turn: {possible_answer_turn_lengths}")

    n_questions = len(questions)
    n_answers = len(possible_answers)
    if n_questions != n_answers:
        print(f"❗️5️⃣WARNING:HUMAN_LABELER Number of question turns ({n_questions}) does not match number of human labeled possible answers ({n_answers})")

    if not model_turn_lengths:
        print("No model function calls recorded.")
    else:
        model_mean = sum(model_turn_lengths) / len(model_turn_lengths)
        print(f"Average model function calls per turn: {model_mean:.2f}")
        print(f"Max model function calls in a turn: {max(model_turn_lengths)}")
        print(f"Min model function calls in a turn: {min(model_turn_lengths)}")

    if not possible_answer_turn_lengths:
        print("No possible answer function calls recorded.")
    else:
        answer_mean = sum(possible_answer_turn_lengths) / len(possible_answer_turn_lengths)
        print(f"Average possible answer function calls per turn: {answer_mean:.2f}")
        print(f"Max possible answer function calls in a turn: {max(possible_answer_turn_lengths)}")
        print(f"Min possible answer function calls in a turn: {min(possible_answer_turn_lengths)}")

    print("\n" + "=" * 80 + "\n")


def print_failure_mode_analysis(data: List[Dict[str, Any]]) -> None:
    """Print how often each error type occurs and how many errors each main API has.

    The first section lists every ``error_type`` value (``"N/A"`` when the key
    is absent) with its count and its share of all entries. The second section
    counts, per main API class, the entries that carry an error.

    An entry carries an error when its ``error`` or ``error_type`` is anything
    other than ``"N/A"``; a missing key is treated as ``"N/A"``, so entries
    without error information are not counted.

    Args:
        data: The log entries to summarise.
    """
    error_types = [log_entry.get("error_type", "N/A") for log_entry in data]
    error_counts = Counter(error_types)
    total_entries = len(error_types)

    print("Failure Mode Analysis:")
    for error_type, count in error_counts.items():
        percentage = (count / total_entries) * 100
        print(f"{error_type}: {count} occurrences ({percentage:.2f}%)")

    # Count errors for each API class
    api_error_counts = defaultdict(int)
    for log_entry in data:
        # Default to "N/A" so that a missing key is not mistaken for an error.
        error = log_entry.get("error", "N/A")
        error_type = log_entry.get("error_type", "N/A")
        if error != "N/A" or error_type != "N/A":
            involved_apis = log_entry["prompt"]["involved_classes"]
            api_error_counts[get_main_api(involved_apis)] += 1

    print("\nErrors per API class:")
    for api_class, count in api_error_counts.items():
        print(f"{api_class}: {count} errors")


# Entry point of this cell: print the report for every entry in the
# module-level `data` list.
if __name__ == "__main__":
    process_json_log(data)

../score_full/gpt-4o-2024-08-06-FC/BFCL_v3_multi_turn_miss_func_score.json
../score/gpt-4o-2024-08-06-FC/BFCL_v3_multi_turn_miss_func_score.json
🆔: multi_turn_miss_func_0
❗️❗️❗️Error Details: {"root": {"model": "<Directory: workspace, Parent: None, Contents: {'document': <Directory: document, Parent: workspace, Contents: {'final_report.pdf': <<File: final_report.pdf, Content: Year2024 This is the final report content including budget analysis and other sections.>>, 'previous_report.pdf': <<File: previous_report.pdf, Content: Year203 This is the previous report content with different budget analysis.>>}>, 'archive': <Directory: archive, Parent: workspace, Contents: {}>, 'temp': <Directory: temp, Parent: workspace, Contents: {}>}>", "ground_truth": "<Directory: workspace, Parent: None, Contents: {'document': <Directory: document, Parent: workspace, Contents: {'previous_report.pdf': <<File: previous_report.pdf, Content: Year203 This is the previous report content with different budget ana

# [DEPRECATED] Single File (for gpt-4o log views)

In [16]:
import json 
V = 14
category = "multi_turn_base"
BFCL_VERSION = 3
INPUT_FILE = f"BFCL_v{BFCL_VERSION}_{category}_score_v{V}_full.json" # with logs
INPUT_ERROR_FILE = f"BFCL_v{BFCL_VERSION}_{category}_score_v{V}.json" # with error
print(INPUT_FILE)
print(INPUT_ERROR_FILE)

# Both files hold one JSON document per line. The first line is skipped on
# purpose -- presumably a summary/header record; TODO confirm.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f.readlines()[1:] if line.strip()]

with open(INPUT_ERROR_FILE, "r", encoding="utf-8") as f:
    data_error = [json.loads(line) for line in f.readlines()[1:] if line.strip()]

# Index the invalid records by id in a single pass. setdefault keeps the first
# invalid record seen for an id, matching a first-match linear scan.
first_failure_by_id = {}
for entry_error in data_error:
    if not entry_error.get("valid", True):
        first_failure_by_id.setdefault(entry_error["id"], entry_error)

# Add a new field to data if there is error and error_type. Some entries might be missing in data_error
for entry in data:
    failure = first_failure_by_id.get(entry["id"])
    if failure is None:
        entry["error"] = "N/A"
        entry["error_type"] = "N/A"
    else:
        entry["error"] = failure["error"]
        entry["error_type"] = failure["error_type"]

BFCL_v3_multi_turn_base_score_v14_full.json
BFCL_v3_multi_turn_base_score_v14.json


{'id': 'multi_turn_base_3',
 'model_name': 'gpt-4o-2024-08-06-FC',
 'test_category': 'multi_turn_base',
 'valid': False,
 'error': ['Model was force-terminated during inference phase. The length of the model result turns (2) does not match the length of the ground truth turns (4).'],
 'error_type': 'multi_turn:force_terminated',
 'prompt': {'id': 'multi_turn_base_3',
  'question': [[{'role': 'user',
     'content': 'I am alex. Go into directory name after my name and list all the visible and hidden contents in the current directory now, please.'}],
   [{'role': 'user',
     'content': "Move one of the 'log.txt' files into a new directory 'archive'."}],
   [{'role': 'user',
     'content': "Investigate within 'log.txt' for the occurrence of the keyword 'Error'."}],
   [{'role': 'user', 'content': 'Finally, show the last 20 lines the file.'}]],
  'initial_config': {'GorillaFileSystem': {'root': {'alex': {'type': 'directory',
      'contents': {'workspace': {'type': 'directory',
        '

In [8]:
import json
from collections import Counter
from typing import List, Dict, Any
import textwrap
import sys
import os
import os
from copy import deepcopy
from collections import defaultdict

# Append the parent of the current working directory to the import path;
# presumably this is what lets the project import that follows resolve -- TODO confirm.
notebook_dir = os.getcwd()
parent_dir = os.path.dirname(notebook_dir)
sys.path.append(parent_dir)

from api_interface.default_scenario import DEFAULT_SCENARIO

# Phrases searched for (case-insensitively, in this order) in a model's
# textual response; the first one found triggers a MODEL_FAILED_KEYWORD warning.
MODEL_FAILED_KEYWORDS = [
    "issue",
    "couldn't", "could not",
    "can't", "cannot",
    "does not", "doesn't",
    "further assistance",
    "sorry", "apologize",
]

def process_json_log(data: List[Dict[str, Any]]) -> None:
    """Report on every log entry, then summarise the failure modes.

    Args:
        data: The log entries; each one is printed in formatted form and then
            followed by its function-call statistics.
    """
    for record in data:
        print_formatted_info(record)
        print_model_result_stats(record)

    # The summary covers the whole list, so it runs once after the loop.
    print_failure_mode_analysis(data)


def print_dict(d: Dict[str, Any], indent: int = 0) -> None:
    """Print a (possibly nested) dict as an indented outline.

    Keys are printed quoted with a trailing colon at ``indent`` spaces. Dict
    values recurse four columns deeper; every other value is printed as
    ``- value`` four columns deeper.

    Args:
        d: The mapping to print.
        indent: Number of leading spaces for the keys of ``d``.
    """
    child_indent = indent + 4
    for key, value in d.items():
        print(" " * indent, f'"{key}":', sep="")
        if isinstance(value, dict):
            print_dict(value, child_indent)
            continue
        print(" " * child_indent, f"- {value}", sep="")
# API class names that get_main_api() recognises as "main" APIs.
main_apis = [
    "GorillaFileSystem",
    "VehicleControlAPI",
    "TradingBot",
    "TravelAPI",
]


def get_main_api(involved_apis: List[str]) -> str:
    """Pick the main API out of the classes involved in an entry.

    Args:
        involved_apis: API class names, checked in the given order.

    Returns:
        The first name that appears in ``main_apis``, or ``"N/A"`` if none does.
    """
    matches = [api for api in involved_apis if api in main_apis]
    if matches:
        return matches[0]
    return "N/A"

def _print_wrapped_pair(call_number, left_lines, right_lines, column_width):
    """Print two pre-wrapped line lists as one numbered, two-column row.

    The call number is shown only on the first physical line; the shorter
    side is padded with empty strings.
    """
    for k in range(max(len(left_lines), len(right_lines))):
        left = left_lines[k] if k < len(left_lines) else ""
        right = right_lines[k] if k < len(right_lines) else ""
        call_count = str(call_number) if k == 0 else ""
        print(f"{call_count:5} | {left.ljust(column_width)}| {right}")


def print_formatted_info(log_entry: Dict[str, Any]) -> None:
    """Print a human-readable, side-by-side report for one log entry.

    Entries without a ``"log"`` key are skipped silently. Otherwise the report
    consists of a header (id, error, model name, main API class, test
    category), the raw initial config, the processed config (a copy of
    ``DEFAULT_SCENARIO`` with the entry's initial config laid over it), the
    error again, and then one section per question turn showing:

    * the question text (``N/A`` for an empty turn),
    * the model's items next to the ground-truth answers; every item of a
      model turn except the last is shown as a call,
    * the last item of the model turn as the textual response when it is a
      string, with a warning if it contains one of ``MODEL_FAILED_KEYWORDS``,
    * the execution responses and the execution states of the model next to
      the ground truth.

    Args:
        log_entry: One decoded log record.
    """
    if "log" not in log_entry:
        return
    log_idx = log_entry.get('id', 'N/A')
    model_results = log_entry["model_result"]
    possible_answers = log_entry["possible_answer"]
    prompt = log_entry["prompt"]
    questions = prompt["question"]
    initial_config = prompt["initial_config"]
    involved_apis = prompt["involved_classes"]
    # The log is read as alternating records: even positions are execution
    # states, odd positions are execution responses.
    execution_states = log_entry["log"][::2]
    execution_responses = log_entry["log"][1::2]
    error = log_entry.get("error", "N/A")
    error_type = log_entry.get("error_type", "N/A")
    print(f"🆔: {log_idx}")
    if error != "N/A":
        print(f"❗️❗️❗️Error: {error}")
        print(f"❗️❗️❗️Error Type: {error_type}")
    print(f"Model Name: {log_entry.get('model_name', 'N/A')}")
    print(f"Main API Classes: {get_main_api(involved_apis)}")
    print(f"Test Category: {log_entry.get('test_category', 'N/A')}")
    print("# 🥩Raw Initial Config (Generated)")
    try:
        for key, value in initial_config.items():
            print(f"API: {key}")
            print("    Initial Config:")
            print_dict(value, indent=8)
    except Exception as e:
        # Best effort: a malformed config is reported, not fatal.
        print(f"❗️7️⃣WARNING:INITIAL_CONFIG:UNEXPECTED_FORMAT {e}, check the initial config for correctness")

    print("# 🔨Processed Config (Defaults with Raw Initial Config)")
    # Work on a copy so the shared defaults are never modified.
    processed_config = deepcopy(DEFAULT_SCENARIO)
    # Overwrite the default initial config with the log_entry initial config for the APIs that are involved
    for api_name in involved_apis:
        current_api_config = processed_config[api_name]
        if api_name in initial_config:
            overrides = initial_config[api_name]
            for key in current_api_config:
                if key in overrides:  # Only overwrite if the key exists in the initial config
                    current_api_config[key] = overrides[key]
        elif current_api_config:
            # Reported only when there are default keys to fall back on.
            print(f"6️⃣INFO:INITIAL_CONFIG:API_NOT_FOUND {api_name} not found in initial config, use default config")
        print(f"API: {api_name}")
        print("    Processed Initial Config:")
        print_dict(current_api_config, indent=8)
    print("# Error")
    print(f"Error: {error}")
    print(f"Error Type: {error_type}")

    print("\n# Model Results and Possible Answers")

    column_width = 80
    # Width covers both columns plus the call-number gutter.
    rule = "-" * (column_width * 2 + 13)
    turn_separator = "\n" + "=" * (column_width * 2 + 13)
    for i, question in enumerate(questions):
        turn_no = i + 1
        print(f"\nTurn {turn_no}:")
        print(rule)

        # An empty question turn has no message to show.
        if question:
            print(f"Question: {question[0]['content']}")
        else:
            print("Question: N/A")

        print(rule)
        print("Calls | Model Response".ljust(column_width + 8) + "| Possible Answer (Human Labeled Ground Truth)")
        print(rule)

        if i < len(model_results):  # Only print the model function calls and possible answers if the turn is not empty
            model_turn = model_results[i]
            possible_turn = possible_answers[i] if i < len(possible_answers) else []

            # Everything except the last item is shown as a call; the last
            # item is handled separately as the textual response.
            model_calls = model_turn[:-1]

            for j in range(max(len(model_calls), len(possible_turn))):
                model_lines = []
                possible_lines = []

                if j < len(model_calls):
                    item = model_calls[j]
                    if isinstance(item, dict):
                        # Show every name/argument pair of the dict, not just the last one.
                        func_call = ", ".join(f"{func}({args})" for func, args in item.items())
                        model_lines = textwrap.wrap(func_call, width=column_width - 1)
                    else:
                        model_lines = textwrap.wrap(str(item), width=column_width - 1)

                if j < len(possible_turn):
                    possible_lines = textwrap.wrap(possible_turn[j], width=column_width - 1)

                _print_wrapped_pair(j + 1, model_lines, possible_lines, column_width)

            # Print the full textual response after the side-by-side comparison.
            # An empty turn has no last item to inspect.
            model_textual_response = model_turn[-1] if model_turn else None
            if isinstance(model_textual_response, str):
                print("\nModel's textual response:")
                for line in textwrap.wrap(model_textual_response, width=column_width * 2):
                    print(line)

                lowered_response = model_textual_response.lower()
                for keyword in MODEL_FAILED_KEYWORDS:
                    if keyword.lower() in lowered_response:
                        print(f"❗️1️⃣WARNING:MODEL_RESPONSE:MODEL_FAILED_KEYWORD '{keyword}' found in response for index {log_idx} turn {turn_no} model text response")
                        break
            # Print warning if there is no model response
            if len(model_turn) <= 1:
                print(f"❗️2️⃣WARNING:MODEL_RESPONSE:NO_MODEL_RESPONSE No model response for index {log_idx} turn {turn_no}")

        else:
            print("DEBUGGING", i, len(model_results), len(possible_answers))
        print("\nExecution Responses:")
        print(rule)
        print("Call | Model Generated".ljust(column_width + 8) + "| Human Ground Truth")
        print(rule)

        if i < len(execution_responses):
            model_responses = execution_responses[i][0].get('model response', [])
            ground_truth_responses = execution_responses[i][0].get('ground truth response', [])

            max_responses = max(len(model_responses), len(ground_truth_responses))

            for j in range(max_responses):
                model_response = model_responses[j] if j < len(model_responses) else ""
                ground_truth_response = ground_truth_responses[j] if j < len(ground_truth_responses) else ""

                _print_wrapped_pair(
                    j + 1,
                    textwrap.wrap(str(model_response), width=column_width - 1),
                    textwrap.wrap(str(ground_truth_response), width=column_width - 1),
                    column_width,
                )

                if j < max_responses - 1:
                    print(rule)  # Separator between calls
            if is_response_with_error(model_responses):
                print(f"❗️3️⃣WARNING:EXECUTION_RESPONSE:ERROR_IN_EXECUTION_RESPONSE Error in execution response for index {log_idx} turn {turn_no}")

            if is_response_with_error(ground_truth_responses):
                print(f"❗️4️⃣WARNING:EXECUTION_RESPONSE:ERROR_IN_GROUND_TRUTH_RESPONSE Error in ground truth response for index {log_idx} turn {turn_no}")

        print(turn_separator)
        print("Execution States:")
        print(rule)
        print("Model Generated".ljust(column_width) + "| Human Ground Truth")
        print(rule)

        if i < len(execution_states):
            model_state = execution_states[i][0].get('model instance', {})
            ground_truth_state = execution_states[i][0].get('ground truth instance', {})

            model_lines = textwrap.wrap(str(model_state), width=column_width - 1)
            ground_truth_lines = textwrap.wrap(str(ground_truth_state), width=column_width - 1)

            # States are printed without a call-number gutter.
            for j in range(max(len(model_lines), len(ground_truth_lines))):
                model_line = model_lines[j] if j < len(model_lines) else ""
                ground_truth_line = ground_truth_lines[j] if j < len(ground_truth_lines) else ""
                print(f"{model_line.ljust(column_width)}| {ground_truth_line}")

        print(turn_separator)  # Separator between turns

def is_response_with_error(responses):
    """Tell whether any response in ``responses`` looks like an error.

    A string matches when it contains "error" in any letter case; a dict
    matches when it has an ``"error"`` key. Other value types never match.

    Returns:
        True on the first match, False if nothing matches.
    """
    for entry in responses:
        text_hit = isinstance(entry, str) and "error" in entry.lower()
        dict_hit = isinstance(entry, dict) and "error" in entry
        if text_hit or dict_hit:
            return True
    return False


def print_model_result_stats(log_entry: Dict[str, Any]) -> None:
    """Print per-turn function-call statistics for one log entry.

    Reports the calls per turn for the model result and for the possible
    answers, followed by the average, maximum and minimum of each. A warning
    is printed when the number of question turns differs from the number of
    possible-answer turns.

    Args:
        log_entry: One decoded log record; ``"model_result"``,
            ``"possible_answer"`` and ``"prompt"`` are read from it.
    """
    model_results = log_entry["model_result"]
    possible_answers = log_entry["possible_answer"]
    questions = log_entry["prompt"]["question"]

    def count_function_calls(turn):
        # A dict item is one call; a list item contributes one call per dict
        # it contains; anything else contributes nothing.
        direct = [step for step in turn if isinstance(step, dict)]
        grouped = [
            call
            for step in turn
            if isinstance(step, list)
            for call in step
            if isinstance(call, dict)
        ]
        return len(direct) + len(grouped)

    model_turn_lengths = []
    for turn in model_results:
        model_turn_lengths.append(count_function_calls(turn))
    possible_answer_turn_lengths = []
    for turn in possible_answers:
        possible_answer_turn_lengths.append(len(turn))

    print("Model Result Statistics:")
    print(f"Total turns: {len(model_results)}")
    print(f"Model function calls per turn: {model_turn_lengths}")
    print(f"Possible answer function calls per turn: {possible_answer_turn_lengths}")

    question_count = len(questions)
    answer_count = len(possible_answers)
    if question_count != answer_count:
        print(f"❗️5️⃣WARNING:HUMAN_LABELER Number of question turns ({question_count}) does not match number of human labeled possible answers ({answer_count})")

    if not model_turn_lengths:
        print("No model function calls recorded.")
    else:
        model_average = sum(model_turn_lengths) / len(model_turn_lengths)
        print(f"Average model function calls per turn: {model_average:.2f}")
        print(f"Max model function calls in a turn: {max(model_turn_lengths)}")
        print(f"Min model function calls in a turn: {min(model_turn_lengths)}")

    if not possible_answer_turn_lengths:
        print("No possible answer function calls recorded.")
    else:
        answer_average = sum(possible_answer_turn_lengths) / len(possible_answer_turn_lengths)
        print(f"Average possible answer function calls per turn: {answer_average:.2f}")
        print(f"Max possible answer function calls in a turn: {max(possible_answer_turn_lengths)}")
        print(f"Min possible answer function calls in a turn: {min(possible_answer_turn_lengths)}")

    print("\n" + "=" * 80 + "\n")


def print_failure_mode_analysis(data: List[Dict[str, Any]]) -> None:
    """Print how often each error type occurs and how many errors each main API has.

    The first section lists every ``error_type`` value (``"N/A"`` when the key
    is absent) with its count and its share of all entries. The second section
    counts, per main API class, the entries that carry an error.

    An entry carries an error when its ``error`` or ``error_type`` is anything
    other than ``"N/A"``; a missing key is treated as ``"N/A"``, so entries
    without error information are not counted.

    Args:
        data: The log entries to summarise.
    """
    error_types = [log_entry.get("error_type", "N/A") for log_entry in data]
    error_counts = Counter(error_types)
    total_entries = len(error_types)

    print("Failure Mode Analysis:")
    for error_type, count in error_counts.items():
        percentage = (count / total_entries) * 100
        print(f"{error_type}: {count} occurrences ({percentage:.2f}%)")

    # Count errors for each API class
    api_error_counts = defaultdict(int)
    for log_entry in data:
        # Default to "N/A" so that a missing key is not mistaken for an error.
        error = log_entry.get("error", "N/A")
        error_type = log_entry.get("error_type", "N/A")
        if error != "N/A" or error_type != "N/A":
            involved_apis = log_entry["prompt"]["involved_classes"]
            api_error_counts[get_main_api(involved_apis)] += 1

    print("\nErrors per API class:")
    for api_class, count in api_error_counts.items():
        print(f"{api_class}: {count} errors")


# Entry point of this cell: print the report for every entry in the
# module-level `data` list.
if __name__ == "__main__":    
    process_json_log(data)

🆔: multi_turn_base_0
Model Name: claude-3-5-sonnet-20240620-FC
Main API Classes: GorillaFileSystem
Test Category: multi_turn_base
# 🥩Raw Initial Config (Generated)
API: GorillaFileSystem
    Initial Config:
        "root":
            "workspace":
                "type":
                    - directory
                "contents":
                    "document":
                        "type":
                            - directory
                        "contents":
                            "final_report.pdf":
                                "type":
                                    - file
                                "content":
                                    - Year2024 This is the final report content including budget analysis and other sections.
                            "previous_report.pdf":
                                "type":
                                    - file
                                "content":
                                    - Year203 This 