In [2]:
from pathlib import Path
import json

reference_folder = Path("../source/競賽資料集/reference_text")
question_json_path = Path("../source/競賽資料集/dataset/preliminary/questions_example.json")

groundtruth_json_path = Path("../source/競賽資料集/dataset/preliminary/ground_truths_example.json")

with open(question_json_path, "r") as f:
    questions = json.load(f)["questions"]

with open(groundtruth_json_path, "r") as f:
    groundtruths = json.load(f)["ground_truths"]

answer_json_paths = [
    path for path in sorted(Path("../output/v6").glob("answer_v*.json"))
]

answer_jsons = {}
for answer_json_path in answer_json_paths:
    with open(answer_json_path, "r") as f:
        answer_jsons[answer_json_path.stem] = json.load(f)["answers"]

len(answer_jsons)


7

In [3]:
errors = {}
for answer_json_path, answers in answer_jsons.items():
    errors[answer_json_path] = []
    for question, answer, groundtruth in zip(questions, answers, groundtruths):
        assert question["qid"] == answer["qid"] == groundtruth["qid"]
        if answer["retrieve"] != groundtruth["retrieve"]:
            errors[answer_json_path].append(f'{question["category"]}_{question["qid"]}')

for answer_json_path, error_qids in errors.items():
    print(f'{answer_json_path}: {len(error_qids)}')

# 計算所有 error_qids 的交集
error_sets = [set(error_qids) for error_qids in errors.values()]
intersection_errors = set.intersection(*error_sets) if error_sets else set()
union_errors = set.union(*error_sets) if error_sets else set()

print(f'intersection error QIDs: {intersection_errors}')
print(f'union error QIDs: {union_errors}')
print(f'union error QIDs count: {len(union_errors)}')


answer_v6_clean_shallow_fusion_ckip_bge-large-zh-v1.5_128_64_alpha_0.01_beta_0.24: 14
answer_v6_clean_shallow_fusion_ckip_bge-large-zh-v1.5_256_128_alpha_0.02_beta_0.23: 15
answer_v6_clean_shallow_fusion_ckip_bge-large-zh-v1.5_500_128_alpha_0.03_beta_0.19: 15
answer_v6_clean_shallow_fusion_ckip_bge-m3_128_64_alpha_0.04_beta_0.24: 13
answer_v6_clean_shallow_fusion_ckip_bge-m3_256_128_alpha_0.03_beta_0.25: 11
answer_v6_clean_shallow_fusion_ckip_bge-m3_4096_3072_alpha_0.06_beta_0.19: 13
answer_v6_clean_shallow_fusion_ckip_bge-m3_512_128_alpha_0.11_beta_0.02: 15
intersection error QIDs: {'finance_99', 'faq_109', 'faq_135', 'finance_86', 'finance_59', 'insurance_50'}
union error QIDs: {'insurance_4', 'faq_135', 'insurance_11', 'finance_79', 'insurance_19', 'faq_109', 'finance_94', 'insurance_35', 'finance_86', 'insurance_3', 'finance_75', 'insurance_2', 'finance_93', 'insurance_29', 'insurance_6', 'finance_72', 'finance_51', 'finance_98', 'finance_82', 'insurance_40', 'finance_61', 'finance

In [4]:
# 找出兩兩比較差異最大的 error_qids
error_sets = {path: set(qids) for path, qids in errors.items()}
max_diff = 0
max_pair = (None, None)
max_diff_set = set()

for path1, set1 in error_sets.items():
    for path2, set2 in error_sets.items():
        if path1 != path2:
            diff_set = set1.symmetric_difference(set2)
            if len(diff_set) > max_diff:
                max_diff = len(diff_set)
                max_pair = (path1, path2)
                max_diff_set = diff_set

print(f'Max difference between {max_pair[0]} and {max_pair[1]}: {max_diff}')
print(f'Difference QIDs: {max_diff_set}')

Max difference between answer_v6_clean_shallow_fusion_ckip_bge-large-zh-v1.5_256_128_alpha_0.02_beta_0.23 and answer_v6_clean_shallow_fusion_ckip_bge-m3_512_128_alpha_0.11_beta_0.02: 14
Difference QIDs: {'insurance_2', 'insurance_6', 'insurance_4', 'insurance_11', 'finance_51', 'insurance_19', 'insurance_40', 'finance_53', 'insurance_35', 'finance_94', 'finance_62', 'finance_73', 'finance_66', 'finance_75'}


In [5]:
print(len(set.union(set(errors['answer_v2_bm25_ckip']), set(errors['answer_v3_clean_embedding_bge-m3_4096_3072']))))
print(len(set.intersection(set(errors['answer_v2_bm25_ckip']), set(errors['answer_v3_clean_embedding_bge-m3_4096_3072']))))


KeyError: 'answer_v2_bm25_ckip'

In [12]:
for path1, set1 in error_sets.items():
    for path2, set2 in error_sets.items():
        if path1 != path2:
            if not ('answer_v2' in path1 and 'answer_v5' in path2):
                continue
            print(f'compare {path1} and {path2}')

            set1_unique_errors = set1 - set2
            print(f'{path1} unique errors: {len(set1_unique_errors)}')
            print(f'Unique QIDs in {path1}: {set1_unique_errors}')
            set2_unique_errors = set2 - set1
            print(f'{path2} unique errors: {len(set2_unique_errors)}')
            print(f'Unique QIDs in {path2}: {set2_unique_errors}')

            # 計算共同的錯誤
            common_errors = set1 & set2
            common_errors = set.intersection(set1, set2)
            print(f'Common errors: {len(common_errors)}')
            print(f'Common QIDs: {common_errors}')

            diff_set = set1.symmetric_difference(set2)

            print(f'difference between {path1} and {path2}: {len(diff_set)}')
            print(f'Difference QIDs: {diff_set}')

            print('========================================')
            print()


compare answer_v2_bm25_ckip and answer_v5_clean_shallow_fusion_ckip_bge-large-zh-v1.5_500_128_alpha_0.03
answer_v2_bm25_ckip unique errors: 16
Unique QIDs in answer_v2_bm25_ckip: {97, 66, 3, 68, 6, 40, 10, 12, 76, 144, 124, 82, 24, 89, 58, 92}
answer_v5_clean_shallow_fusion_ckip_bge-large-zh-v1.5_500_128_alpha_0.03 unique errors: 9
Unique QIDs in answer_v5_clean_shallow_fusion_ckip_bge-large-zh-v1.5_500_128_alpha_0.03: {2, 67, 69, 37, 75, 77, 79, 53, 93}
Common errors: 16
Common QIDs: {98, 35, 99, 135, 72, 73, 109, 50, 19, 51, 61, 86, 94, 59, 29, 62}
difference between answer_v2_bm25_ckip and answer_v5_clean_shallow_fusion_ckip_bge-large-zh-v1.5_500_128_alpha_0.03: 25
Difference QIDs: {2, 3, 6, 10, 12, 144, 24, 37, 40, 53, 58, 66, 67, 68, 69, 75, 76, 77, 79, 82, 89, 92, 93, 97, 124}

compare answer_v2_bm25_ckip and answer_v5_clean_shallow_fusion_ckip_bge-large-zh-v1.5_500_128_alpha_0.05
answer_v2_bm25_ckip unique errors: 17
Unique QIDs in answer_v2_bm25_ckip: {97, 66, 3, 68, 6, 40, 10,

In [6]:
f = open('answer_v6_clean_shallow_fusion_ckip_bge-m3_256_128_alpha_0.03_beta_0.25.txt', 'w')

answers = answer_jsons['answer_v6_clean_shallow_fusion_ckip_bge-m3_256_128_alpha_0.03_beta_0.25']

correct_count = 0
for question, answer, groundtruth in zip(questions, answers, groundtruths):
    assert question["qid"] == answer["qid"] == groundtruth["qid"]
    if answer["retrieve"] != groundtruth["retrieve"]:
        print('========================================')
        f.write('========================================\n')
        print(f'qid {question["qid"]} is wrong')
        f.write(f'qid {question["qid"]} is wrong\n')
        print(f'category: {question["category"]}, ground truth: {groundtruth["retrieve"]}, model output: {answer["retrieve"]}')
        f.write(f'category: {question["category"]}, ground truth: {groundtruth["retrieve"]}, model output: {answer["retrieve"]}\n')
        category = question["category"]
        print(f'---------- query ----------')
        f.write(f'---------- query ----------\n')
        print(question["query"])
        f.write(f'{question["query"]}\n')
        groundtruth_reference = reference_folder / f'{category}/{groundtruth["retrieve"]}.txt'
        print(f'---------- ground truth reference {category}/{groundtruth["retrieve"]} ----------')
        f.write(f'---------- ground truth reference {category}/{groundtruth["retrieve"]} ----------\n')
        with open(groundtruth_reference, "r") as fp:
            groundtruth_reference_text = fp.read()
            print(groundtruth_reference_text)
            f.write(groundtruth_reference_text)
        answer_reference = reference_folder / f'{category}/{answer["retrieve"]}.txt'
        print(f'---------- answer reference {category}/{answer["retrieve"]} ----------')
        f.write(f'---------- answer reference {category}/{answer["retrieve"]} ----------\n')
        with open(answer_reference, "r") as fp:
            answer_reference_text = fp.read()
            print(answer_reference_text)
            f.write(answer_reference_text)
    else:
        correct_count += 1

f.close()

print(f'correct count: {correct_count}')


qid 2 is wrong
category: insurance, ground truth: 428, model output: 258
---------- query ----------
本公司應在效力停止日前多少天以書面通知要保人？
---------- ground truth reference insurance/428 ----------
**page 0**
南山人壽威美鑽美元利率變動型終身壽險（定期給付型）_SYUL
一、因可歸責於本公司之錯誤原因，致本公司依第三十條第二項約定為退還或給
付所生之相關匯款費用。
二、因可歸責於本公司之錯誤原因，要保人或受益人依第三十條第二項約定為補
繳或返還所生之相關匯款費用。
三、因本公司提供之匯款帳戶錯誤而使要保人或受益人匯款無法完成時所生之相
關匯款費用。
要保人或受益人若選擇以本公司指定銀行之外匯存款戶交付相關款項且匯款銀
行及收款銀行為同一銀行時，或以本公司指定銀行之外匯存款戶受領相關款項
時，其所有匯款相關費用均由本公司負擔，不適用前項約定。
本公司指定銀行之相關訊息可至本公司網站（網址：http://www.nanshanlife.
com.tw）查詢。
第二十八條 保險單借款及契約效力的停止
於本契約「保障期間」內，要保人得向本公司申請保險單借款，其可借金額上限
為借款當日保單價值準備金之一定百分比，其比率請詳附表四，未償還之借款本
息，超過其保單價值準備金時，本契約效力即行停止。但本公司應於效力停止日
之三十日前以書面通知要保人。
本公司未依前項規定為通知時，於本公司以書面通知要保人返還借款本息之日起
三十日內要保人未返還者，保險契約之效力自該三十日之次日起停止。
第二十九條 不分紅保單
本保險為不分紅保單，不參加紅利分配，並無紅利給付項目。
第三十條 投保年齡的計算及錯誤的處理
要保人在申請投保時，應將被保險人出生年月日在要保書填明。被保險人的投保
年齡，以足歲計算，但未滿一歲的零數超過六個月者，加算一歲。
被保險人的投保年齡發生錯誤時，就「基本保額」對應部分依下列規定辦理；就
「增值回饋分享金」對應部分，本公司重新計算依第十四條約定應給付之金額：
一、真實投保年齡較本公司保險費率表所載最高年齡為大者，本契約無效，其已
繳保險費無息退還要保人。
二、因投保年齡的錯誤，而致溢繳保險費者

In [17]:
131/150

0.8733333333333333