### 导入相关包

In [3]:
import jieba

### 评价指标计算函数

In [None]:
def calculate_metrics(references, candidates, *, tokenizer=None):
    """Compute macro-averaged token-level precision, recall and F1.

    Each reference/candidate pair is tokenized, and the overlap between the
    two token sequences is counted as a multiset (each token is matched at
    most as many times as it occurs in both sequences). Per-pair scores are
    then averaged over all pairs.

    Args:
        references: Iterable of gold-standard answer strings.
        candidates: Iterable of model answer strings, aligned by position
            with ``references``. Pairs are formed with ``zip``, so surplus
            items in the longer iterable are ignored.
        tokenizer: Optional callable mapping a string to an iterable of
            tokens. Defaults to ``jieba.lcut``.

    Returns:
        A tuple ``(avg_precision, avg_recall, avg_f1)``. All three values
        are ``0.0`` when there are no pairs to score.
    """
    from collections import Counter

    if tokenizer is None:
        tokenizer = jieba.lcut

    precisions = []
    recalls = []
    f1_scores = []

    for reference, candidate in zip(references, candidates):
        ref_tokens = list(tokenizer(reference))
        can_tokens = list(tokenizer(candidate))

        # Multiset intersection: the denominators below count duplicates,
        # so the numerator must count them too. A plain set intersection
        # would under-report matches whenever a token repeats.
        overlap = Counter(ref_tokens) & Counter(can_tokens)
        num_correct = sum(overlap.values())

        # Precision: share of candidate tokens that appear in the reference.
        precision = num_correct / len(can_tokens) if can_tokens else 0.0
        # Recall: share of reference tokens recovered by the candidate.
        recall = num_correct / len(ref_tokens) if ref_tokens else 0.0

        # F1: harmonic mean, defined as 0 when both components are 0.
        if precision + recall > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    # Macro-average over pairs; guard against an empty input.
    avg_precision = sum(precisions) / len(precisions) if precisions else 0.0
    avg_recall = sum(recalls) / len(recalls) if recalls else 0.0
    avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0.0

    return avg_precision, avg_recall, avg_f1



In [None]:
# Gold-standard answers, one string per sample.
references = ["我爱北京天安门", "香蕉是黄色水果"]
# Model answers, aligned by position with the references above.
candidates = ["我爱北京天安门，它很壮观", "香蕉是黄色水果，它富含钾"]

# Score every reference/candidate pair and report the averaged metrics,
# one metric per output line.
avg_precision, avg_recall, avg_f1 = calculate_metrics(references, candidates)
print(
    f"Average Precision: {avg_precision}",
    f"Average Recall: {avg_recall}",
    f"Average F1: {avg_f1}",
    sep="\n",
)