In [1]:
import json
import pandas as pd
from huanzhi_utils import load_file, write_list_of_dicts_to_file
import os

In [9]:
result_file_path = '/Users/raymondtsao/Documents/BFCL_Visualization_Test/result_score/result'
models = [model for model in os.listdir(result_file_path) if model != '.DS_Store']
print(models)

['Qwen_Qwen2.5-1.5B-Instruct', 'deepseek-ai_DeepSeek-Coder-V2-Instruct-0724', 'palmyra-x-004', 'claude-3-opus-20240229', 'gemini-1.5-pro-002-FC', 'open-mixtral-8x22b-FC', 'gemini-1.5-flash-002', 'openbmb_MiniCPM3-4B', 'gemini-1.0-pro-002', 'ibm-granite_granite-20b-functioncalling', 'Salesforce_xLAM-8x22b-r', 'Nexusflow-Raven-v2', 'mistral-medium-2312', 'Qwen_Qwen2-7B-Instruct', 'Salesforce_xLAM-7b-r', 'Qwen_Qwen2.5-72B-Instruct', 'firefunction-v1-FC', 'deepseek-ai_DeepSeek-Coder-V2-Lite-Instruct', 'meta-llama_Llama-3.1-70B-Instruct', 'mistral-small-2402-FC', 'gemini-1.0-pro-002-FC', 'databricks-dbrx-instruct', 'open-mixtral-8x7b', 'claude-3-opus-20240229-FC', 'gpt-3.5-turbo-0125-FC', 'gemini-1.5-flash-002-FC', 'NousResearch_Hermes-2-Pro-Mistral-7B', 'NousResearch_Hermes-2-Pro-Llama-3-8B', 'google_gemma-2-27b-it', 'MadeAgents_Hammer2.0-0.5b', 'Team-ACE_ToolACE-8B', 'open-mistral-nemo-2407', 'mistral-small-2402', 'gpt-4o-2024-08-06-FC', 'google_gemma-2-2b-it', 'gpt-4o-mini-2024-07-18-FC'

In [25]:
result_standardized = []
import json

def standardize_step_response(input_step_response):
    step_response = {}
    step_response["assistant_response"] = input_step_response[0]
    step_response["handler_response"]  = input_step_response[1]
    step_response["tool_response"] = input_step_response[2:]
    step_response["num_tools"] = len(
        step_response["handler_response"].get("model_response_decoded", [])
    )
    return step_response

def standardize_turn_response(result):
    input_turn_response, end_of_turn_state = result    
    turn_response = {}
    turn_response["end_of_turn_state"] = end_of_turn_state
    turn_response["turn_eval_message"] = input_turn_response["begin_of_turn_query"]
    keys = list(input_turn_response.keys())
    keys.remove("begin_of_turn_query") 
    keys = sorted(keys, key=lambda x: int(x.split("_")[-1])) 
    step_responses = []
    for idx, key in enumerate(keys):
        assert key.endswith(f"_{idx}")
        step_responses.append(input_turn_response[key])
    
    turn_response["step_responses"] = [standardize_step_response(step_response) for step_response in step_responses]
    turn_response["num_steps"] = len(step_responses) - 1
    return turn_response

def standardize_inference_response(result):
    inference_response = {}
    inference_response["id"] = result["id"]
    inference_log = result["inference_log"]
    inference_response["initial_api_state"] = inference_log[0] # a list of states
    num_turns = (len(inference_log) - 1)//2 # the first one is the initial state
    inference_response["num_turns"] = num_turns
    inference_log = inference_log[1:] # remove the initial state
    inference_response["turn_responses"] = [
        standardize_turn_response(inference_log[i : i + 2])
        for i in range(0, len(inference_log), 2)
    ]
    return inference_response

def standardize_inference_response_error(result):
    inference_response = {}
    inference_response["id"] = result["id"]
    inference_response["num_turns"] = 1
    inference_response["turn_responses"] = [
        'Error during inference: Request timed out.'
    ]
    return inference_response

In [27]:
errored_model = []
for model in models:
    result_file_path = f'/Users/raymondtsao/Documents/BFCL_Visualization_Test/result_score/result/{model}/BFCL_v3_multi_turn_base_result.json'
    result_list = load_file(result_file_path)
    for result in result_list:
        try:
            result_standardized.append(standardize_inference_response(result))
        except Exception as e:
            errored_model.append(model)
            try:
                if result['result'] == 'Error during inference: Request timed out.':
                    result_standardized.append(standardize_inference_response_error(result))
            except Exception as e:
                print(f"Error standardizing {model}: {e}")
                continue

In [29]:
models_not_errored = [model for model in models if model not in errored_model]
print(models_not_errored)


['Qwen_Qwen2.5-1.5B-Instruct', 'claude-3-opus-20240229', 'gemini-1.5-pro-002-FC', 'gemini-1.5-flash-002', 'openbmb_MiniCPM3-4B', 'gemini-1.0-pro-002', 'Salesforce_xLAM-8x22b-r', 'Nexusflow-Raven-v2', 'Qwen_Qwen2-7B-Instruct', 'Salesforce_xLAM-7b-r', 'Qwen_Qwen2.5-72B-Instruct', 'firefunction-v1-FC', 'deepseek-ai_DeepSeek-Coder-V2-Lite-Instruct', 'meta-llama_Llama-3.1-70B-Instruct', 'mistral-small-2402-FC', 'databricks-dbrx-instruct', 'claude-3-opus-20240229-FC', 'gpt-3.5-turbo-0125-FC', 'NousResearch_Hermes-2-Pro-Mistral-7B', 'NousResearch_Hermes-2-Pro-Llama-3-8B', 'MadeAgents_Hammer2.0-0.5b', 'Team-ACE_ToolACE-8B', 'open-mistral-nemo-2407', 'mistral-small-2402', 'gpt-4o-2024-08-06-FC', 'google_gemma-2-2b-it', 'gpt-4o-mini-2024-07-18-FC', 'meta-llama_Llama-3.2-1B-Instruct', 'MadeAgents_Hammer2.0-1.5b', 'gemini-1.5-pro-001', 'meta-llama_Llama-3.1-8B-Instruct-FC', 'command-r-plus', 'openbmb_MiniCPM3-4B-FC', 'mistral-large-2407-FC', 'claude-3-5-sonnet-20241022', 'gemini-1.5-flash-001', 'o

In [22]:
result_file_path = f'/Users/raymondtsao/Documents/BFCL_Visualization_Test/result_score/result/{errored_model[1]}/BFCL_v3_multi_turn_base_result.json'
result_list = load_file(result_file_path)

In [23]:
for i, result in enumerate(result_list):
    try:
        s = standardize_inference_response(result)
    except Exception as e:
        print(f"Error standardizing {i}: {e}")
        continue


Error standardizing 0: 'inference_log'
Error standardizing 2: 'inference_log'
Error standardizing 21: 'inference_log'
Error standardizing 39: 'inference_log'
Error standardizing 40: 'inference_log'
Error standardizing 52: 'inference_log'
Error standardizing 56: 'inference_log'
Error standardizing 61: 'inference_log'
Error standardizing 68: 'inference_log'
Error standardizing 69: 'inference_log'
Error standardizing 70: 'inference_log'
Error standardizing 71: 'inference_log'
Error standardizing 72: 'inference_log'
Error standardizing 73: 'inference_log'
Error standardizing 74: 'inference_log'
Error standardizing 75: 'inference_log'
Error standardizing 76: 'inference_log'
Error standardizing 77: 'inference_log'
Error standardizing 78: 'inference_log'
Error standardizing 79: 'inference_log'
Error standardizing 80: 'inference_log'
Error standardizing 81: 'inference_log'
Error standardizing 82: 'inference_log'
Error standardizing 83: 'inference_log'
Error standardizing 84: 'inference_log'
Er

In [24]:
result_list[0]

{'id': 'multi_turn_base_0',
 'result': 'Error during inference: Request timed out.'}

In [11]:
print(f"Model error rate: {len(errored_model)/len(models)}")

Model error rate: 0.2891566265060241
