In [40]:
import os
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import recall_score, f1_score, precision_score, accuracy_score

from settings import NAME2IDX

In [4]:
def trans_label(item):
    """Add the integer id of a record's predicted label name.

    Args:
        item: A mutable mapping that has a ``"pred_label"`` entry.

    Returns:
        The same ``item`` object, with ``"pred_int_label"`` set to the value
        ``NAME2IDX`` holds for the predicted name, or ``-1`` when the name is
        not a key of ``NAME2IDX``.
    """
    item["pred_int_label"] = NAME2IDX.get(item["pred_label"], -1)
    return item

In [26]:
def get_dataset(file: str):
    """Load predictions from a JSON file and keep only the mappable ones.

    The file is read with the ``"json"`` loader as the ``"train"`` split,
    every record is passed through ``trans_label``, and records whose
    ``"pred_int_label"`` is ``-1`` are then removed. Anything computed on the
    result therefore covers only the remaining records.

    Args:
        file: Path of the JSON file to load.

    Returns:
        The mapped and filtered dataset.
    """

    def _has_known_label(row):
        # -1 is the marker value for a prediction that could not be mapped.
        return row["pred_int_label"] != -1

    raw_dataset = load_dataset("json", data_files=file, split="train")
    return raw_dataset.map(trans_label).filter(_has_known_label)

In [42]:
def eval_dataset_func(dataset):
    """Score a dataset of predictions with weighted precision, recall and F1.

    Predictions are read from the ``"pred_int_label"`` column. The reference
    column is ``"label"`` when the dataset has a feature of that name and
    ``"labels"`` otherwise.

    Args:
        dataset: Object exposing ``features``, column access by name and
            ``len()``.

    Returns:
        A dict with ``"precision"``, ``"recall"`` and ``"f1"`` (each computed
        with ``average="weighted"``) plus ``"support"``, the number of rows in
        ``dataset``.
    """
    truth_column = "label" if "label" in dataset.features else "labels"
    predicted = dataset["pred_int_label"]
    expected = dataset[truth_column]

    # Every metric is computed on the same two columns with the same averaging.
    scorers = {
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    report = {
        metric: scorer(y_pred=predicted, y_true=expected, average="weighted")
        for metric, scorer in scorers.items()
    }
    report["support"] = len(dataset)
    return report

In [37]:
# Evaluate a single prediction file on its own; the returned metrics dict is
# the cell's displayed result.
eval_dataset_func(get_dataset("output/llm_struct.json"))

{'precision': 0.8382126577315632,
 'recall': 0.8088088088088088,
 'f1': 0.8037425149032076}

In [44]:
# Evaluate every prediction file in the output folder, one result row per file.
data = []

for file in os.listdir(fold := "output"):
    file_name = os.path.join(fold, file)
    # Only regular JSON / JSON-lines files are prediction files; skip
    # sub-directories and any other stray entries in the folder.
    if not (os.path.isfile(file_name) and file.endswith((".json", ".jsonl"))):
        continue
    # Strip only the extension so dotted names (e.g. "llm.v2.json") stay distinct.
    method = os.path.splitext(file)[0]
    data.append({"method": method, **eval_dataset_func(get_dataset(file_name))})

# Rows keep the order in which the folder listing returned the files.
ans_df = pd.DataFrame(data)
ans_df

Unnamed: 0,method,precision,recall,f1,support
0,llm,0.841183,0.807035,0.794224,995
1,vllm_infer,0.841379,0.80981,0.797484,999
2,llm_struct,0.838213,0.808809,0.803743,999


In [None]:
# Manually recorded run time of each LLM inference method (presumably seconds,
# written as minutes * 60 + seconds). Keyed by method name so that a timing
# stays attached to its method whatever order the result rows are in.
PROCESSING_TIME_BY_METHOD = {
    "llm": 26 * 60 + 40,
    "vllm_infer": 41,
    "llm_struct": 22 * 60 + 19,
}
# One entry per row of ans_df, in row order. A method without a recorded
# timing raises KeyError instead of being silently misaligned.
processing_time = [PROCESSING_TIME_BY_METHOD[method] for method in ans_df["method"]]

In [46]:
# Attach the timings as a new column. The list is matched to rows by position,
# so it needs one entry per row of ans_df, in the same order as the rows.
ans_df["processing_time"] = processing_time

In [47]:
# Display the results table, now including the processing_time column.
ans_df

Unnamed: 0,method,precision,recall,f1,support,processing_time
0,llm,0.841183,0.807035,0.794224,995,1600
1,vllm_infer,0.841379,0.80981,0.797484,999,41
2,llm_struct,0.838213,0.808809,0.803743,999,1339
