In [2]:
import json
import pandas as pd
from huanzhi_utils import load_file, write_list_of_dicts_to_file
import os
from const import *

In [3]:
# Keep only the models that have a directory under the score folder.
# The folder is listed once and the names are held in a set, instead of
# re-running os.listdir() for every entry of MODELS while the membership
# test of the comprehension is evaluated.
scored_model_dirs = set(os.listdir('/Users/raymondtsao/Documents/BFCL_Visualization_Test/result_score/score'))
models = [model for model in MODELS if model in scored_model_dirs]
print(len(models))

# Module-level list; the functions visible in this notebook build their own
# local list of the same name, so this one stays empty unless another cell
# fills it.
result_standardized = []
import json  # already imported in the first cell; kept so this cell runs on its own

def standardize_step_response(input_step_response):
    """Turn one positional step record into a keyed dictionary.

    Args:
        input_step_response: Indexable, sliceable sequence with at least two
            elements. Element 0 is stored as the assistant response and
            element 1 as the handler response; element 1 must offer ``.get``
            because it is queried for ``"model_response_decoded"``.
            Everything from index 2 onward is stored as the tool responses.

    Returns:
        dict with the keys ``assistant_response``, ``handler_response``,
        ``tool_response`` and ``num_tools``. ``num_tools`` is the length of
        the handler's ``"model_response_decoded"`` value, or 0 when that key
        is absent.
    """
    assistant_entry, handler_entry = input_step_response[0], input_step_response[1]
    tool_entries = input_step_response[2:]
    decoded_calls = handler_entry.get("model_response_decoded", [])
    return {
        "assistant_response": assistant_entry,
        "handler_response": handler_entry,
        "tool_response": tool_entries,
        "num_tools": len(decoded_calls),
    }

def standardize_turn_response(result):
    """Reshape one raw turn into a dictionary with ordered step responses.

    Args:
        result: Two-item sequence ``(turn_log, end_of_turn_state)``.
            ``turn_log`` is a mapping that holds ``"begin_of_turn_query"``
            plus one entry per step; each step key must end in
            ``_<number>``.

    Returns:
        dict with ``end_of_turn_state``, ``turn_eval_message`` (the value
        stored under ``"begin_of_turn_query"``), ``step_responses`` (each
        step passed through ``standardize_step_response``, ordered by the
        numeric key suffix) and ``num_steps`` (number of step entries
        minus one).

    Raises:
        AssertionError: if the key at sorted position ``i`` does not end
            with ``_i``.
    """
    raw_turn, final_state = result
    standardized = {
        "end_of_turn_state": final_state,
        "turn_eval_message": raw_turn["begin_of_turn_query"],
    }

    # Every key except the query belongs to a step; order by trailing number.
    step_keys = [name for name in raw_turn.keys() if name != "begin_of_turn_query"]
    step_keys.sort(key=lambda name: int(name.split("_")[-1]))

    # Check the numbering of all steps before any of them is standardized.
    ordered_steps = []
    for position, step_key in enumerate(step_keys):
        assert step_key.endswith(f"_{position}")
        ordered_steps.append(raw_turn[step_key])

    standardized["step_responses"] = [
        standardize_step_response(raw_step) for raw_step in ordered_steps
    ]
    standardized["num_steps"] = len(ordered_steps) - 1
    return standardized

def standardize_inference_response(result):
    """Convert one raw result entry into the standardized inference layout.

    ``result["inference_log"]`` is read as follows: the first element is
    the initial API state, and the remaining elements are consumed two at
    a time, each pair being handed to ``standardize_turn_response``.

    Args:
        result: Mapping with the keys ``"id"`` and ``"inference_log"``.

    Returns:
        dict with ``id``, ``initial_api_state``, ``num_turns`` (floor of
        half the log length after the initial state) and
        ``turn_responses`` (one standardized turn per pair).
    """
    entry_id = result["id"]
    full_log = result["inference_log"]

    standardized = {
        "id": entry_id,
        "initial_api_state": full_log[0],  # per the original note: a list of states
    }
    # The initial state is not part of any turn, hence the "- 1".
    standardized["num_turns"] = (len(full_log) - 1) // 2

    turn_records = full_log[1:]  # drop the initial state
    turn_responses = []
    for start in range(0, len(turn_records), 2):
        turn_responses.append(standardize_turn_response(turn_records[start:start + 2]))
    standardized["turn_responses"] = turn_responses
    return standardized

def standardize_inference_response_error(result):
    """Build a placeholder record for an entry treated as a failed inference.

    Only ``result["id"]`` is read. The record always reports one turn whose
    sole response is a fixed timeout message.

    Args:
        result: Mapping with an ``"id"`` key.

    Returns:
        dict with ``id``, ``num_turns`` (always 1) and ``turn_responses``
        (a one-element list holding the fixed error string).
    """
    return {
        "id": result["id"],
        "num_turns": 1,
        "turn_responses": ['Error during inference: Request timed out.'],
    }

31


In [4]:
def get_result_dataframe(
    model,
    category,
    result_root='/Users/raymondtsao/Documents/BFCL_Visualization_Test/result_score/result',
):
    """Load one multi-turn result file and standardize every entry in it.

    Args:
        model: Name of the model directory under ``result_root``.
        category: Multi-turn category used in the file name
            (``BFCL_v3_multi_turn_<category>_result.json``).
        result_root: Directory that contains one sub-directory per model.
            Defaults to the location the notebook was written against.

    Returns:
        A ``pandas.DataFrame`` with one row per standardized entry, or
        ``None`` when the file cannot be loaded or any single entry fails
        to standardize. In the ``None`` case the reason is printed, so a
        failing (model, category) pair can be diagnosed instead of being
        dropped silently.
    """
    result_file_path = f'{result_root}/{model}/BFCL_v3_multi_turn_{category}_result.json'

    # Loading is guarded in the same way get_score_dataframe guards it, so a
    # missing or unreadable file yields None rather than aborting the caller.
    try:
        result_list = load_file(result_file_path)
    except Exception as e:
        print(f"Error loading result file {result_file_path}: {e}")
        return None

    result_standardized = []
    for index, result in enumerate(result_list):
        try:
            result_standardized.append(standardize_inference_response(result))
        except Exception as e:
            # One bad entry still invalidates the whole file (unchanged
            # behaviour), but the entry and the cause are now reported.
            entry_id = result.get("id", index) if isinstance(result, dict) else index
            print(
                f"Error standardizing entry {entry_id!r} in {result_file_path}: "
                f"{type(e).__name__}: {e}"
            )
            return None
    return pd.DataFrame(result_standardized)

def get_score_dataframe(
    model,
    category,
    score_root='/Users/raymondtsao/Documents/BFCL_Visualization_Test/result_score/score',
):
    """Load one multi-turn score file into a DataFrame of per-entry verdicts.

    Args:
        model: Name of the model directory under ``score_root``.
        category: Multi-turn category used in the file name
            (``BFCL_v3_multi_turn_<category>_score.json``).
        score_root: Directory that contains one sub-directory per model.
            Defaults to the location the notebook was written against.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``valid`` and
        ``error_type`` (a list of error-type values per entry), or ``None``
        when loading or parsing fails; the failure is printed.
    """
    def extract_error_types(error_dict):
        """Collect "error_type" values from the top level and from directly nested dicts."""
        error_types = []
        for key, value in error_dict.items():
            if key == "error_type":
                error_types.append(value)
            elif isinstance(value, dict) and "error_type" in value:
                error_types.append(value["error_type"])
        return error_types

    # Built before the try block so the handler below can always name the file.
    score_file_path = f'{score_root}/{model}/BFCL_v3_multi_turn_{category}_score.json'
    try:
        score_list = load_file(score_file_path)
        # First entry in the score json file is an accuracy summary and is
        # skipped; only the per-entry records are tabulated.
        rows = [
            {
                "id": item["id"],
                "valid": item["valid"],
                "error_type": extract_error_types(item["error"]),
            }
            for item in score_list[1:]
        ]
        return pd.DataFrame(rows)
    except Exception as e:
        print(f"Error processing score file {score_file_path}: {e}")
        return None


In [6]:
# Record every (model, category) pair for which either the result file or the
# score file could not be turned into a DataFrame. Both loaders are always
# called, even when the first one already failed.
errored_model_cateogry = []
for model in models:
    for category in ('base', 'long_context', 'miss_func', 'miss_param'):
        result_df = get_result_dataframe(model, category)
        score_df = get_score_dataframe(model, category)
        if not (result_df is not None and score_df is not None):
            errored_model_cateogry.append((model, category))

In [11]:
# Errored pairs as a share of (all entries of MODELS) x (4 categories).
# NOTE(review): the errored pairs were collected over `models` (the filtered
# list), while this denominator uses MODELS - confirm this rate is intended.
print(len(errored_model_cateogry) / (len(MODELS) * 4))

0.15677966101694915


In [8]:
# Errored pairs as a share of (models that were scanned) x (4 categories).
print(len(errored_model_cateogry) / (len(models) * 4))

0.23387096774193547
