In [6]:
from pathlib import Path
import json

reference_folder = Path("../source/競賽資料集/reference_text")
question_json_path = Path("../source/競賽資料集/dataset/preliminary/questions_example.json")

groundtruth_json_path = Path("../source/競賽資料集/dataset/preliminary/ground_truths_example.json")

with open(question_json_path, "r") as f:
    questions = json.load(f)["questions"]

with open(groundtruth_json_path, "r") as f:
    groundtruths = json.load(f)["ground_truths"]

answer_json_paths = [
    path for path in sorted(Path("../output").glob("answer_v*.json"))
]

answer_jsons = {}
for answer_json_path in answer_json_paths:
    with open(answer_json_path, "r") as f:
        answer_jsons[answer_json_path.stem] = json.load(f)["answers"]

len(answer_jsons)


9

In [29]:
errors = {}
for answer_json_path, answers in answer_jsons.items():
    errors[answer_json_path] = []
    for question, answer, groundtruth in zip(questions, answers, groundtruths):
        assert question["qid"] == answer["qid"] == groundtruth["qid"]
        if answer["retrieve"] != groundtruth["retrieve"]:
            errors[answer_json_path].append(question["qid"])

for answer_json_path, error_qids in errors.items():
    print(f'{answer_json_path}: {len(error_qids)}')

# 計算所有 error_qids 的交集
error_sets = [set(error_qids) for error_qids in errors.values()]
intersection_errors = set.intersection(*error_sets) if error_sets else set()
union_errors = set.union(*error_sets) if error_sets else set()

print(f'intersection error QIDs: {intersection_errors}')
print(f'union error QIDs: {len(union_errors)}')


answer_v2_bm25_ckip: 32
answer_v2_bm25_clean_ckip: 29
answer_v2_bm25_clean_jieba: 34
answer_v2_bm25_jieba: 41
answer_v3_clean_embedding_bge-large-zh-v1.5_500_128: 29
answer_v3_clean_embedding_bge-m3_4096_3072: 37
answer_v3_clean_embedding_bge-m3_512_128: 31
answer_v3_clean_sparse_embedding_bge-m3_4096_3072: 31
answer_v4_clean_rerank_bge-reranker-v2-m3_2048_512: 31
intersection error QIDs: {135, 109, 50, 86, 61}
union error QIDs: 76


In [30]:
# 找出兩兩比較差異最大的 error_qids
error_sets = {path: set(qids) for path, qids in errors.items()}
max_diff = 0
max_pair = (None, None)
max_diff_set = set()

for path1, set1 in error_sets.items():
    for path2, set2 in error_sets.items():
        if path1 != path2:
            diff_set = set1.symmetric_difference(set2)
            if len(diff_set) > max_diff:
                max_diff = len(diff_set)
                max_pair = (path1, path2)
                max_diff_set = diff_set

print(f'Max difference between {max_pair[0]} and {max_pair[1]}: {max_diff}')
print(f'Difference QIDs: {max_diff_set}')

Max difference between answer_v2_bm25_ckip and answer_v3_clean_embedding_bge-m3_4096_3072: 45
Difference QIDs: {2, 4, 6, 8, 10, 11, 12, 15, 16, 144, 19, 23, 30, 38, 40, 42, 43, 47, 49, 51, 53, 58, 59, 64, 66, 67, 68, 69, 70, 72, 73, 75, 76, 79, 89, 90, 92, 93, 94, 95, 96, 97, 98, 100, 124}


In [31]:
print(len(set.union(set(errors['answer_v2_bm25_ckip']), set(errors['answer_v3_clean_embedding_bge-m3_4096_3072']))))
print(len(set.intersection(set(errors['answer_v2_bm25_ckip']), set(errors['answer_v3_clean_embedding_bge-m3_4096_3072']))))


57
12


In [39]:
for path1, set1 in error_sets.items():
    for path2, set2 in error_sets.items():
        if path1 != path2:
            if not ('answer_v2' in path1 and 'answer_v3' in path2):
                continue
            print(f'compare {path1} and {path2}')
            diff_set = set1.symmetric_difference(set2)

            print(f'difference between {path1} and {path2}: {len(diff_set)}')
            print(f'Difference QIDs: {diff_set}')

            print('========================================')
            print()


compare answer_v2_bm25_ckip and answer_v3_clean_embedding_bge-large-zh-v1.5_500_128
difference between answer_v2_bm25_ckip and answer_v3_clean_embedding_bge-large-zh-v1.5_500_128: 29
Difference QIDs: {2, 3, 6, 10, 12, 15, 144, 24, 37, 40, 53, 55, 58, 66, 67, 68, 69, 75, 76, 77, 79, 82, 89, 90, 91, 92, 93, 97, 124}

compare answer_v2_bm25_ckip and answer_v3_clean_embedding_bge-m3_4096_3072
difference between answer_v2_bm25_ckip and answer_v3_clean_embedding_bge-m3_4096_3072: 45
Difference QIDs: {2, 4, 6, 8, 10, 11, 12, 15, 16, 144, 19, 23, 30, 38, 40, 42, 43, 47, 49, 51, 53, 58, 59, 64, 66, 67, 68, 69, 70, 72, 73, 75, 76, 79, 89, 90, 92, 93, 94, 95, 96, 97, 98, 100, 124}

compare answer_v2_bm25_ckip and answer_v3_clean_embedding_bge-m3_512_128
difference between answer_v2_bm25_ckip and answer_v3_clean_embedding_bge-m3_512_128: 35
Difference QIDs: {2, 3, 4, 10, 11, 12, 15, 24, 37, 40, 46, 51, 53, 58, 64, 66, 67, 68, 69, 70, 72, 73, 75, 79, 89, 90, 92, 93, 94, 95, 97, 98, 100, 124, 144}



In [2]:
correct_count = 0
for question, answer, groundtruth in zip(questions, answers, groundtruths):
    assert question["qid"] == answer["qid"] == groundtruth["qid"]
    if answer["retrieve"] != groundtruth["retrieve"]:
        print('========================================')
        print(f'qid {question["qid"]} is wrong')
        print(f'category: {question["category"]}, ground truth: {groundtruth["retrieve"]}, model output: {answer["retrieve"]}')
        category = question["category"]
        print(f'---------- query ----------')
        print(question["query"])
        groundtruth_reference = reference_folder / f'{category}/{groundtruth["retrieve"]}.txt'
        print(f'---------- ground truth reference {category}/{answer["retrieve"]} ----------')
        with open(groundtruth_reference, "r") as f:
            print(f.read())
        answer_reference = reference_folder / f'{category}/{answer["retrieve"]}.txt'
        print(f'---------- answer reference {category}/{answer["retrieve"]} ----------')
        with open(answer_reference, "r") as f:
            print(f.read())
    else:
        correct_count += 1




qid 2 is wrong
category: insurance, ground truth: 428, model output: 258
---------- query ----------
本公司應在效力停止日前多少天以書面通知要保人？
---------- ground truth reference insurance/258 ----------
**page 0**
南山人壽威美鑽美元利率變動型終身壽險（定期給付型）_SYUL
一、因可歸責於本公司之錯誤原因，致本公司依第三十條第二項約定為退還或給
付所生之相關匯款費用。
二、因可歸責於本公司之錯誤原因，要保人或受益人依第三十條第二項約定為補
繳或返還所生之相關匯款費用。
三、因本公司提供之匯款帳戶錯誤而使要保人或受益人匯款無法完成時所生之相
關匯款費用。
要保人或受益人若選擇以本公司指定銀行之外匯存款戶交付相關款項且匯款銀
行及收款銀行為同一銀行時，或以本公司指定銀行之外匯存款戶受領相關款項
時，其所有匯款相關費用均由本公司負擔，不適用前項約定。
本公司指定銀行之相關訊息可至本公司網站（網址：http://www.nanshanlife.
com.tw）查詢。
第二十八條 保險單借款及契約效力的停止
於本契約「保障期間」內，要保人得向本公司申請保險單借款，其可借金額上限
為借款當日保單價值準備金之一定百分比，其比率請詳附表四，未償還之借款本
息，超過其保單價值準備金時，本契約效力即行停止。但本公司應於效力停止日
之三十日前以書面通知要保人。
本公司未依前項規定為通知時，於本公司以書面通知要保人返還借款本息之日起
三十日內要保人未返還者，保險契約之效力自該三十日之次日起停止。
第二十九條 不分紅保單
本保險為不分紅保單，不參加紅利分配，並無紅利給付項目。
第三十條 投保年齡的計算及錯誤的處理
要保人在申請投保時，應將被保險人出生年月日在要保書填明。被保險人的投保
年齡，以足歲計算，但未滿一歲的零數超過六個月者，加算一歲。
被保險人的投保年齡發生錯誤時，就「基本保額」對應部分依下列規定辦理；就
「增值回饋分享金」對應部分，本公司重新計算依第十四條約定應給付之金額：
一、真實投保年齡較本公司保險費率表所載最高年齡為大者，本契約無效，其已
繳保險費無息退還要保人。
二、因投保年齡的錯誤，而致溢繳保險費者