In [2]:
!pip install vllm==0.8.2 datasets

Collecting vllm==0.8.2
  Downloading vllm-0.8.2-cp38-abi3-manylinux1_x86_64.whl.metadata (27 kB)
Collecting numpy<2.0.0 (from vllm==0.8.2)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting blake3 (from vllm==0.8.2)
  Downloading blake3-1.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm==0.8.2)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting lm-format-enforcer<0.11,>=0.10.11 (from vllm==0.8.2)
  Downloading lm_format_enforcer-0.10.12-py3-none-any.whl.metadata (17 kB)
Collecting llguidance<0.8.0,>=0.7.9 (from vllm==0.8.2)
  Downloading llguidance-0.7.30-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting outlines==0.1.11 (from vllm

# 1. 테스트 데이터 전처리

In [1]:
import re
import json
import pandas as pd
from typing import List, Dict
from datasets import load_dataset, Dataset
from vllm import LLM, SamplingParams

INFO 12-23 13:29:38 [__init__.py:239] Automatically detected platform cuda.


In [15]:
# Load the "train" split of the function-calling dataset from the Hugging Face Hub.
dataset = load_dataset("HJUNN/crypto_function_calling_datasets", split = "train")

In [16]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['tools', 'cid', 'dates', 'messages', 'system_prompt'],
    num_rows: 384
})

In [17]:
# Fraction of the dataset that is held out for testing
test_ratio = 0.2

# Number of held-out rows, derived from the dataset size (int() truncates)
total_len = len(dataset)
test_size = int(total_len * test_ratio)

# The leading rows form the test split; every remaining row goes to training
test_indices = [*range(0, test_size)]
train_indices = [*range(test_size, total_len)]

In [18]:
# Show how many rows ended up in the training split.
len(train_indices)

308

In [27]:
# Converter to the OpenAI-style "messages" layout
def format_conversations(sample):
    """
    Build an OpenAI-style conversation from one dataset row.

    The row's ``system_prompt`` becomes the first message (role "system") and
    the row's own ``messages`` follow in their original order.

    Returns a dict with a single "messages" key.
    """
    system_turn = {"role": "system", "content": sample["system_prompt"]}
    return {"messages": [system_turn] + list(sample["messages"])}


In [28]:
# Convert every selected row to the OpenAI-style layout and wrap each split
# back into a HuggingFace Dataset object
train_dataset = Dataset.from_list(
    [format_conversations(dataset[row]) for row in train_indices]
)
test_dataset = Dataset.from_list(
    [format_conversations(dataset[row]) for row in test_indices]
)

# Report the resulting split sizes
print(f"\n전체 데이터 분할 결과: Train {len(train_dataset)}개, Test {len(test_dataset)}개")


전체 데이터 분할 결과: Train 308개, Test 76개


In [29]:
def to_chatml(data):
    """
    Render a conversation as a ChatML string.

    data : either a list of messages or a dict of the form {"messages": [...]}
    Returns : every message rendered as
              "<|im_start|>{role}\\n{content}<|im_end|>", joined by newlines
    """
    if isinstance(data, dict) and "messages" in data:
        turns = data["messages"]
    else:
        turns = data

    return "\n".join(
        f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>"
        for turn in turns
    )

In [34]:
def extract_examples(chatml: str) -> List[Dict[str, str]]:
    """
    Split a ChatML string into one (input, label) pair per assistant turn.

    'input' is the whole conversation before that assistant turn (stripped)
    followed by a newline and '<|im_start|>assistant'.
    'label' is the stripped content of that assistant turn.
    Assistant turns that are not closed by '<|im_end|>' produce no pair.
    """
    # NOTE(review): the prompt ends right after "assistant" without a trailing
    # newline - confirm this matches the template the model was trained with.
    assistant_turn = re.compile(r'<\|im_start\|>assistant(.*?)(?=<\|im_end\|>)', re.DOTALL)

    return [
        {
            "input": chatml[:hit.start()].strip() + '\n<|im_start|>assistant',
            "label": hit.group(1).strip(),
        }
        for hit in assistant_turn.finditer(chatml)
    ]

In [35]:
# Flatten every test conversation into parallel prompt / label lists,
# one entry per assistant turn
prompt_lst, label_lst = [], []

for item in test_dataset:
    for ex in extract_examples(to_chatml(item)):
        prompt_lst.append(ex['input'])
        label_lst.append(ex['label'])

In [36]:
# Spot-check one generated prompt (index 10).
print(prompt_lst[10])

<|im_start|>system
당신은 가상 자산 챗봇 상담사입니다. 성심성의껏 상담하십시오.

로그인한 사용자의 현재 ID: U002
오늘 날짜: 2025-07-26

# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{"type": "function", "function": {"name": "get_trending_coins", "description": "Get trending coins", "parameters": {"type": "object", "properties": {}}}}
{"type": "function", "function": {"name": "get_crypto_news", "description": "Search latest crypto news from RAG DB", "parameters": {"type": "object", "properties": {"query": {"type": "string"}, "top_k": {"type": "integer", "default": 5}}, "required": ["query"]}}}
{"type": "function", "function": {"name": "get_latest_strategy", "description": "Fetch the latest N trading strategy records", "parameters": {"type": "object", "properties": {"userid": {"type": "string"}, "limit": {"type": "integer", "default": 5}}, "required": ["userid"]}}}
{"type": "function", "function": {"name": "get_mark

In [37]:
# Spot-check the reference label paired with the prompt at index 10.
print(label_lst[10])

<tool_call>
{"name": "search_crypto_term", "arguments": {"query": "샤프 지수", "top_k": 1}}
</tool_call>


# 2. 모델 호출

In [38]:
# Decoding settings for evaluation:
# - temperature = 0: presumably greedy decoding, so runs are repeatable
#   (confirm against the vLLM SamplingParams documentation)
# - max_tokens caps the length of each generated assistant turn
# - generation stops at "<|im_end|>", the turn terminator written by to_chatml
sampling_params = SamplingParams(
    temperature = 0,
    max_tokens = 2048,
    stop = ["<|im_end|>"]
)

In [39]:
# Load the fine-tuned model into a vLLM engine with default engine settings.
# NOTE(review): the recorded run of this cell ended in a CUDA OutOfMemoryError:
# loading the weights took 14.2487 GB on a GPU with 14.74 GiB total capacity,
# and the engine was initialised with max_seq_len=32768. The cells below cannot
# run until this is resolved (larger GPU, or presumably a quantised load and/or
# a smaller max_model_len - verify against the vLLM engine-argument docs).
llm = LLM(model = "HJUNN/Qwen2.5-7B-Instruct-crypto-function-calling")

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


INFO 12-23 14:08:54 [config.py:585] This model supports multiple tasks: {'score', 'generate', 'classify', 'reward', 'embed'}. Defaulting to 'generate'.
INFO 12-23 14:08:54 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.2) with config: model='HJUNN/Qwen2.5-7B-Instruct-crypto-function-calling', speculative_config=None, tokenizer='HJUNN/Qwen2.5-7B-Instruct-crypto-function-calling', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=F

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

INFO 12-23 14:09:02 [cuda.py:239] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 12-23 14:09:02 [cuda.py:288] Using XFormers backend.
INFO 12-23 14:09:03 [parallel_state.py:954] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 12-23 14:09:03 [model_runner.py:1110] Starting to load model HJUNN/Qwen2.5-7B-Instruct-crypto-function-calling...
INFO 12-23 14:09:04 [weight_utils.py:265] Using model weights format ['*.safetensors']


model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

INFO 12-23 14:11:01 [weight_utils.py:281] Time spent downloading weights for HJUNN/Qwen2.5-7B-Instruct-crypto-function-calling: 116.837685 seconds


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 12-23 14:12:05 [loader.py:447] Loading weights took 64.19 seconds
INFO 12-23 14:12:07 [model_runner.py:1146] Model loading took 14.2487 GB and 182.135971 seconds


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 106.12 MiB is free. Process 26723 has 14.63 GiB memory in use. Of the allocated memory 14.47 GiB is allocated by PyTorch, and 54.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Run generation over every prompt in prompt_lst with the sampling settings above.
fine_tuned_output = llm.generate(prompt_lst, sampling_params)

In [None]:
# Keep only the stripped text of the first candidate of every generation result
fine_tuned_text_results = [
    request_output.outputs[0].text.strip()
    for request_output in fine_tuned_output
]

In [None]:
# Spot-check the model output at index 10 (same index as the prompt/label shown earlier).
print(fine_tuned_text_results[10])

# 3. 평가 결과 저장

In [None]:
# Gather each prompt, its reference label and the model output into one table
# (the three lists are parallel, one entry per assistant turn).
df = pd.DataFrame({
    "prompt": prompt_lst,
    "label": label_lst,
    "output": fine_tuned_text_results
})

# Save the table as CSV without the index column; the "utf-8-sig" codec writes
# a BOM, presumably so spreadsheet tools detect UTF-8 for the Korean text.
df.to_csv("evaluation_results.csv", index=False, encoding="utf-8-sig")

In [None]:
# Display the evaluation table.
df

In [None]:
# Print the first 50 label / prediction pairs for a manual side-by-side check
for label, pred in df[['label', 'output']].head(50).itertuples(index=False, name=None):
    print('레이블: ', label)
    print("--" * 50)
    print('모델의 예측: ', pred)
    print("--" * 50)

# 4. 평가

* tool_selection: 함수 이름의 일치 여부를 평가합니다.
* params_selection: 파라미터 키(예: userid)의 일치 여부를 평가합니다.
* params_value_accuracy: 파라미터 값(예: "U002")의 일치 여부를 평가합니다.

In [None]:
def evaluate_function_calls(labels, predictions):
    """
    Score function-calling predictions against reference labels.

    Only samples whose label contains a ``<tool_call>...</tool_call>`` block
    are scored. For both label and prediction the first such block is parsed
    as JSON and the two objects are compared.

    Parameters
    ----------
    labels : list of str
        Reference assistant responses.
    predictions : list of str
        Model responses, paired positionally with ``labels`` (pairs are formed
        with ``zip``, so surplus items of the longer list are ignored).

    Returns
    -------
    dict
        tool_selection : share of scored samples whose function name matches.
        params_selection : share of parameter keys that match. Every key of
            the label counts once (correct when the prediction has it) and
            every key that only the prediction has counts once as wrong.
        params_value_accuracy : share of samples, among those sharing at least
            one parameter key, where all shared keys carry equal values.
        total_samples : number of samples whose label is a tool call.

        A prediction without a tool call, or a pair that cannot be parsed into
        JSON objects, adds one miss to each of the three ratios. A ratio with
        no observations is reported as 0.0.
    """
    metric_names = ('tool_selection', 'params_selection', 'params_value_accuracy')
    results = {name: {'correct': 0, 'total': 0} for name in metric_names}

    def count_as_all_wrong():
        # A sample that cannot be compared is one miss for every metric.
        for name in metric_names:
            results[name]['total'] += 1

    def as_dict(value):
        # Missing / null / non-object arguments are treated as "no parameters".
        return value if isinstance(value, dict) else {}

    # Captures the payload of the first <tool_call> block in a response
    tool_call_pattern = re.compile(r'<tool_call>(.*?)</tool_call>', re.DOTALL)

    # Number of samples whose label is a tool call
    tool_call_count = 0

    for label, pred in zip(labels, predictions):
        label_match = tool_call_pattern.search(label)

        # Samples whose label is not a tool call are out of scope
        if not label_match:
            continue

        tool_call_count += 1

        pred_match = tool_call_pattern.search(pred)
        if not pred_match:
            count_as_all_wrong()
            continue

        try:
            label_json = json.loads(label_match.group(1))
            pred_json = json.loads(pred_match.group(1))
        except json.JSONDecodeError:
            count_as_all_wrong()
            continue

        # Valid JSON that is not an object (e.g. a list) cannot be compared either
        if not isinstance(label_json, dict) or not isinstance(pred_json, dict):
            count_as_all_wrong()
            continue

        # 1. Function name (tool_selection)
        results['tool_selection']['total'] += 1
        if label_json.get('name') == pred_json.get('name'):
            results['tool_selection']['correct'] += 1

        # 2. Parameter keys (params_selection), scored key by key
        label_args = as_dict(label_json.get('arguments'))
        pred_args = as_dict(pred_json.get('arguments'))
        label_params = set(label_args)
        pred_params = set(pred_args)
        common_params = label_params & pred_params

        # Every label key and every extra predicted key is one observation;
        # only label keys that the prediction also has count as correct.
        results['params_selection']['total'] += len(label_params | pred_params)
        results['params_selection']['correct'] += len(common_params)

        # 3. Parameter values (params_value_accuracy), scored per sample and
        #    only when at least one key is shared
        if common_params:
            results['params_value_accuracy']['total'] += 1
            if all(label_args[key] == pred_args[key] for key in common_params):
                results['params_value_accuracy']['correct'] += 1

    # Turn the counters into ratios
    final_results = {
        metric: (counts['correct'] / counts['total'] if counts['total'] > 0 else 0.0)
        for metric, counts in results.items()
    }
    final_results['total_samples'] = tool_call_count

    return final_results

In [1]:
# BFCL 확장 평가
import re
import json

def evaluate_function_calls_bfcl(labels, predictions):
    """
    BFCL-style evaluation of function-calling predictions.

    For both label and prediction the first ``<tool_call>...</tool_call>``
    block is extracted and parsed as JSON. Pairs are formed positionally with
    ``zip``.

    Parameters
    ----------
    labels : list of str
        Reference assistant responses.
    predictions : list of str
        Model responses.

    Returns
    -------
    dict
        tool_accuracy : share of parsed tool-call pairs with matching name.
        param_key_accuracy : matched label keys divided by (label keys +
            keys that only the prediction has).
        param_value_accuracy : mean, over pairs sharing at least one key, of
            the fraction of shared keys whose values are equal.
        hallucination_rate : keys that only the prediction has, per compared
            tool call.
        over_call_rate : predictions that call a tool although the label does
            not, divided by ``len(labels)``.
        under_call_rate : predictions without a tool call although the label
            has one, divided by ``len(labels)``.
        format_error_rate : compared tool calls whose payload could not be
            parsed into a JSON object, per compared tool call.
        total_tool_calls : number of pairs where both sides contain a tool
            call (under-calls are not included).

        Every ratio whose denominator is zero is reported as 0.0.
    """

    def rate(numerator, denominator):
        # Guarded division: empty input must not raise ZeroDivisionError.
        return numerator / denominator if denominator > 0 else 0.0

    def as_dict(value):
        # Missing / null / non-object arguments are treated as "no parameters".
        return value if isinstance(value, dict) else {}

    tool_call_pattern = re.compile(r"<tool_call>(.*?)</tool_call>", re.DOTALL)

    tool_correct = tool_total = 0
    key_correct = key_total = 0
    value_score = 0.0   # sum of per-sample partial scores
    value_total = 0
    hallucinated_args = 0
    over_calls = under_calls = format_errors = 0
    tool_call_count = 0

    for label, pred in zip(labels, predictions):
        label_match = tool_call_pattern.search(label)
        pred_match = tool_call_pattern.search(pred)

        # ---------- over-call / under-call ----------
        if label_match and not pred_match:
            under_calls += 1
            continue

        if not label_match:
            if pred_match:
                over_calls += 1
            continue

        tool_call_count += 1

        # ---------- format robustness ----------
        try:
            label_json = json.loads(label_match.group(1))
            pred_json = json.loads(pred_match.group(1))
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError subclass
            format_errors += 1
            continue

        # Valid JSON that is not an object (e.g. a list) is a format error too
        if not isinstance(label_json, dict) or not isinstance(pred_json, dict):
            format_errors += 1
            continue

        # ---------- 1. Tool selection ----------
        tool_total += 1
        if label_json.get("name") == pred_json.get("name"):
            tool_correct += 1

        # ---------- 2. Parameter selection ----------
        label_args = as_dict(label_json.get("arguments"))
        pred_args = as_dict(pred_json.get("arguments"))
        label_params = set(label_args)
        pred_params = set(pred_args)
        common_params = label_params & pred_params

        # Label keys are scored one by one
        key_total += len(label_params)
        key_correct += len(common_params)

        # Keys invented by the model count as hallucinations and as key misses
        hallucinated = pred_params - label_params
        hallucinated_args += len(hallucinated)
        key_total += len(hallucinated)

        # ---------- 3. Parameter value accuracy (partial scoring) ----------
        if common_params:
            match_cnt = sum(
                1 for key in common_params if label_args[key] == pred_args[key]
            )
            value_score += match_cnt / len(common_params)
            value_total += 1

    # ---------- final BFCL-style metrics ----------
    sample_count = len(labels)
    final_results = {
        "tool_accuracy": rate(tool_correct, tool_total),
        "param_key_accuracy": rate(key_correct, key_total),
        "param_value_accuracy": rate(value_score, value_total),
        "hallucination_rate": rate(hallucinated_args, tool_call_count),
        "over_call_rate": rate(over_calls, sample_count),
        "under_call_rate": rate(under_calls, sample_count),
        "format_error_rate": rate(format_errors, tool_call_count),
        "total_tool_calls": tool_call_count
    }

    return final_results


In [None]:
# Score the saved predictions with the basic evaluator and print each metric:
# ratios as percentages, the sample count as a plain number
labels = df['label'].tolist()
preds = df['output'].tolist()

results_with_errors = evaluate_function_calls(labels, preds)
for metric, value in results_with_errors.items():
    if metric == 'total_samples':
        print(f"{metric}: {value}")
    else:
        print(f"{metric}: {value:.2%}")

In [None]:
# Score the saved predictions with the BFCL-style evaluator and print each
# metric: float ratios as percentages, everything else unformatted
labels = df["label"].to_list()
preds = df["output"].to_list()

results = evaluate_function_calls_bfcl(labels, preds)

for metric, value in results.items():
    print(
        f"{metric}: {value:.2%}" if isinstance(value, float) else f"{metric}: {value}"
    )
