# 1. 테스트 데이터 전처리

In [None]:
import re
import json
from typing import List, Dict

import pandas as pd
from datasets import load_dataset, Dataset
from vllm import LLM, SamplingParams

In [None]:
# Load the "train" split of the function-calling dataset from the Hugging Face Hub.
dataset = load_dataset(
    "HJUNN/crypto_function_calling_datasets",
    split="train",
)

In [None]:
# Fraction of the dataset reserved for testing.
test_ratio = 0.2

total_len = len(dataset)
test_size = int(total_len * test_ratio)

# Contiguous, unshuffled split: rows [0, test_size) are the test set and
# rows [test_size, total_len) are the training set.
train_indices = [*range(test_size, total_len)]
test_indices = [*range(test_size)]

In [None]:
def format_conversation(sample):
    """Return *sample* as an OpenAI-style chat record.

    The result is ``{"messages": [...]}``: a system message carrying
    ``sample["system_prompt"]`` followed by every entry of
    ``sample["messages"]`` in their original order.
    """
    system_message = {"role" : "system", "content" : sample["system_prompt"]}
    conversation = [system_message]
    conversation.extend(sample["messages"])
    return {"messages" : conversation}
# Convert the rows of each split to the chat format and wrap the resulting
# list of records in a Dataset object.
train_dataset = Dataset.from_list(
    [format_conversation(dataset[row]) for row in train_indices]
)
test_dataset = Dataset.from_list(
    [format_conversation(dataset[row]) for row in test_indices]
)

In [None]:
def to_chatml(data):
    """Render a conversation as ChatML text.

    *data* is either a dict with a ``"messages"`` key or the iterable of
    messages itself. Each message becomes
    ``<|im_start|>{role}\\n{content}<|im_end|>`` and the rendered messages
    are joined with newlines.
    """
    if isinstance(data, dict) and "messages" in data:
        messages = data.get("messages")
    else:
        messages = data

    return "\n".join(
        f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>"
        for msg in messages
    )

In [None]:
def extract_examples(chatml:str) -> List[Dict[str, str]]:
    """Split a ChatML transcript into one example per assistant turn.

    For every ``<|im_start|>assistant ... <|im_end|>`` span, the example's
    ``"input"`` is all text before that span (stripped) followed by a fresh
    ``<|im_start|>assistant`` header, and its ``"label"`` is the stripped
    text of the assistant turn.
    """
    # The lookahead leaves <|im_end|> unconsumed, so only the turn body is captured.
    assistant_turn = re.compile(r'<\|im_start\|>assistant(.*?)(?=<\|im_end\|>)', re.DOTALL)

    # NOTE(review): the prompt ends right after "assistant" with no trailing
    # newline, whereas to_chatml writes a newline after the role -- confirm
    # this matches the target model's chat template.
    return [
        {
            "input" : chatml[:turn.start()].strip() + '\n<|im_start|>assistant',
            "label" : turn.group(1).strip(),
        }
        for turn in assistant_turn.finditer(chatml)
    ]

In [None]:
# Flatten every test conversation into parallel lists: one prompt and one
# reference label per assistant turn.
prompt_lst = []
label_lst = []

for item in test_dataset:
    turn_examples = extract_examples(to_chatml(item))
    prompt_lst.extend(ex["input"] for ex in turn_examples)
    label_lst.extend(ex["label"] for ex in turn_examples)

In [None]:
# Spot-check the first reference label (the list is named label_lst).
print(label_lst[0])

# 2. 모델 호출

In [None]:
# Decoding settings: temperature 0, at most 2048 new tokens, and generation
# stops at the ChatML end-of-turn marker.
sampling_params = SamplingParams(temperature=0, max_tokens=2048, stop=["<|im_end|>"])

In [None]:
# Instantiate the base model to be evaluated.
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct")

In [None]:
# Generate a completion for every prompt, then keep the stripped text of the
# first candidate of each result.
base_model_outputs = llm.generate(prompt_lst, sampling_params)
base_model_text_results = [sample.outputs[0].text.strip() for sample in base_model_outputs]

In [None]:
# Spot-check one generated completion (index 10).
print(base_model_text_results[10])

# 3. 평가 결과 저장

In [1]:
# Put prompts, reference labels and model outputs side by side, one row per example.
df = pd.DataFrame(
    dict(prompt=prompt_lst, label=label_lst, output=base_model_text_results)
)

# Persist the table without the index column; "utf-8-sig" prefixes the file with a BOM.
df.to_csv("base_model_evaluation_results.csv", index=False, encoding="utf-8-sig")

NameError: name 'pd' is not defined

# 4. 평가

In [None]:
# BFCL 확장 평가
import re
import json

def _load_tool_call(raw):
    """Decode one ``<tool_call>`` payload into a ``(name, arguments)`` pair.

    A missing ``"arguments"`` entry is treated as an empty dict.

    Raises:
        ValueError: if the payload is not valid JSON, is not a JSON object,
            or its ``"arguments"`` entry is not a JSON object.
    """
    call = json.loads(raw)  # json.JSONDecodeError is a ValueError subclass
    if not isinstance(call, dict):
        raise ValueError("tool call payload is not a JSON object")
    arguments = call.get("arguments", {})
    if not isinstance(arguments, dict):
        raise ValueError("tool call arguments are not a JSON object")
    return call.get("name"), arguments


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator > 0 else 0.0


def evaluate_function_calls_bfcl(labels, predictions):
    """Score predicted tool calls against reference labels, BFCL style.

    Labels and predictions are compared pairwise (``zip``, so extra items in
    the longer sequence are ignored). Only the first
    ``<tool_call>...</tool_call>`` span of each text is considered.

    Args:
        labels: sequence of reference texts.
        predictions: sequence of model output texts.

    Returns:
        A dict with these keys:
        - ``tool_accuracy``: share of parsed pairs whose ``name`` matches.
        - ``param_key_accuracy``: label argument keys present in the
          prediction, divided by label keys plus hallucinated keys.
        - ``param_value_accuracy``: mean, over pairs sharing at least one
          key, of the fraction of shared keys with equal values.
        - ``hallucination_rate``: predicted keys absent from the label, per
          pair where both sides contain a tool call.
        - ``over_call_rate`` / ``under_call_rate``: share of ``labels`` where
          only the prediction / only the label contains a tool call.
        - ``format_error_rate``: share of tool-call pairs whose payload could
          not be decoded on either side.
        - ``total_tool_calls``: number of pairs where both sides contain a
          tool call.
        Every rate is 0.0 when its denominator is 0, including for empty input.
    """
    tool_call_pattern = re.compile(r"<tool_call>(.*?)</tool_call>", re.DOTALL)

    tool_correct = tool_total = 0
    key_correct = key_total = 0
    value_score = 0.0
    value_total = 0
    hallucinated_args = 0
    over_calls = under_calls = format_errors = 0
    tool_call_count = 0

    for label, pred in zip(labels, predictions):
        label_match = tool_call_pattern.search(label)
        pred_match = tool_call_pattern.search(pred)

        # ---------- over-call / under-call ----------
        if label_match is None:
            if pred_match is not None:
                over_calls += 1
            continue
        if pred_match is None:
            under_calls += 1
            continue

        tool_call_count += 1

        # ---------- format robustness ----------
        # Undecodable or wrongly-shaped payloads on either side are counted
        # as format errors and excluded from the accuracy metrics.
        try:
            label_name, label_args = _load_tool_call(label_match.group(1))
            pred_name, pred_args = _load_tool_call(pred_match.group(1))
        except (ValueError, RecursionError):
            format_errors += 1
            continue

        # ---------- 1. Tool selection ----------
        tool_total += 1
        if label_name == pred_name:
            tool_correct += 1

        # ---------- 2. Parameter selection ----------
        label_keys = set(label_args)
        pred_keys = set(pred_args)
        shared_keys = label_keys & pred_keys
        extra_keys = pred_keys - label_keys

        # Hallucinated keys enlarge the denominator, so they lower key accuracy.
        key_correct += len(shared_keys)
        key_total += len(label_keys) + len(extra_keys)
        hallucinated_args += len(extra_keys)

        # ---------- 3. Parameter value accuracy (partial scoring) ----------
        if shared_keys:
            matches = sum(label_args[k] == pred_args[k] for k in shared_keys)
            value_score += matches / len(shared_keys)
            value_total += 1

    sample_count = len(labels)

    return {
        "tool_accuracy": _ratio(tool_correct, tool_total),
        "param_key_accuracy": _ratio(key_correct, key_total),
        "param_value_accuracy": _ratio(value_score, value_total),
        "hallucination_rate": _ratio(hallucinated_args, tool_call_count),
        "over_call_rate": _ratio(over_calls, sample_count),
        "under_call_rate": _ratio(under_calls, sample_count),
        "format_error_rate": _ratio(format_errors, tool_call_count),
        "total_tool_calls": tool_call_count,
    }


In [None]:
# Score the saved outputs against the reference labels.
labels = df["label"].tolist()
preds = df["output"].tolist()

results = evaluate_function_calls_bfcl(labels, preds)

# Float metrics are shown as percentages; other values are printed as-is.
for metric, value in results.items():
    rendered = f"{value:.2%}" if isinstance(value, float) else f"{value}"
    print(f"{metric}: {rendered}")
