In [3]:
import json

def parse_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line at the end of the file)
            # are not records; json.loads would raise on them.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

# Load every record of the public LongDocURL split.
path = "data/LongDocURL_public.jsonl"
data = parse_jsonl(path)

# Index the records by "<question_id>.json" so they can be looked up by the
# file name of the corresponding per-question result file.
ori_dict = {record["question_id"] + ".json": record for record in data}


In [2]:
# Peek at the first record's question_id (the stem of the keys built for ori_dict).
data[0]['question_id']

'free_gpt4o_4026369_60_70_12'

In [3]:
import os

# os.listdir returns entries in arbitrary order and includes anything that
# happens to live in the folder (checkpoint dirs, OS metadata files, ...).
# Keep only the per-question "<question_id>.json" files and sort them so the
# cells below process a deterministic, parseable list.
results_list = sorted(
    entry
    for entry in os.listdir("retrieval_results/copali_results_num_workers16/")
    if entry.endswith(".json")
)

In [6]:
total_entries = len(data)
N = [5, 10, 20, 30]
# One running sum per cutoff in N.
total_recall_at_n = [0] * len(N)

for result_file in results_list:
    with open(f"retrieval_results/copali_results_num_workers16/{result_file}", 'r') as file:
        results = json.load(file)
    recall_at_n = results['recall_at_n']
    # The per-file recall values are keyed by the cutoff as a string.
    total_recall_at_n = [
        running + recall_at_n[str(cutoff)]
        for running, cutoff in zip(total_recall_at_n, N)
    ]

# Average over the number of result files and round to 4 decimals.
file_count = len(results_list)
total_recall_at_n = [round(total / file_count, 4) for total in total_recall_at_n]

In [7]:
# Mean recall at the cutoffs listed in N, averaged over all result files and rounded to 4 decimals.
total_recall_at_n

[0.7426, 0.8263, 0.8953, 0.9323]

In [6]:
import os

source_dir = "retrieval_results/copali_results_num_workers16/"
target_dir = "retrieval_results/copali_results_num_workers16_add/"

# open(..., 'w') does not create missing directories, so make sure the
# output folder exists before writing into it.
os.makedirs(target_dir, exist_ok=True)

# Only "<question_id>.json" files are retrieval results; anything else in the
# folder cannot be parsed or looked up in ori_dict. Sort for a stable order.
results_list = sorted(
    entry for entry in os.listdir(source_dir) if entry.endswith(".json")
)

# Add task_tag, question_type and the evidence annotations to every result file.
for result in results_list:
    with open(f"{source_dir}{result}", 'r') as file:
        result_one = json.load(file)
    # ori_dict is keyed by "<question_id>.json", i.e. by the result file name.
    annotation = ori_dict[result]
    for field in ("question_type", "task_tag", "evidence_sources", "evidence_pages"):
        result_one[field] = annotation[field]
    with open(f"{target_dir}{result}", 'w') as file:
        json.dump(result_one, file, indent=4)

In [16]:
def calculate_accuracy_fine_grained(sample_files, score_dict, k='5',
                                    results_dir="retrieval_results/copali_results_num_workers16_add/"):
    """Add every sample's recall@k to the matching buckets of ``score_dict``.

    Each file in ``sample_files`` is loaded as JSON from ``results_dir`` and its
    ``recall_at_n[k]`` value is added to:

    * ``Main_Task[task_tag]``
    * ``Element_Type[source]`` for each evidence source that is one of
      Text / Layout / Figure / Table
    * ``Evidence_Pages["Single_Page" | "Multi_Page"]`` depending on the number
      of evidence pages (samples without evidence pages are not counted here)
    * ``Num_of_Element_Types["Cross_Element"]`` when the sample lists more than
      one evidence source
    * ``Fine_Grained[task_tag]...``: for Understanding / Reasoning the
      per-element bucket under Single_Page / Multi_Page, for Locating the
      bucket under ``Cross_Element`` selected by ``question_type``

    Args:
        sample_files: File names (not paths) of the per-question result files.
        score_dict: Nested dict of running sums; it is updated in place.
        k: Recall cutoff, given as the string key used in ``recall_at_n``.
        results_dir: Directory that contains ``sample_files``.

    Returns:
        The same ``score_dict`` object that was passed in.

    Raises:
        KeyError: If a sample lacks a required field or refers to a bucket
            that does not exist in ``score_dict``.
    """
    element_types = ("Text", "Layout", "Figure", "Table")
    # question_type -> bucket name below Fine_Grained/Locating/Cross_Element.
    locating_buckets = {
        "topic2title": "Cross_Title",
        "summary2title": "Para_Title",
        "summary2tab": "Cross_Table",
        "extract_fig2tab": "Figure_Table",
    }

    samples = []
    for sample_file in sample_files:
        with open(os.path.join(results_dir, sample_file), 'r') as file:
            samples.append(json.load(file))

    for sample in samples:
        recall = sample["recall_at_n"][k]
        task_tag = sample["task_tag"]
        sources = sample["evidence_sources"]
        elements = [source for source in sources if source in element_types]

        # Decide the page bucket once per sample. A sample without evidence
        # pages has no bucket; previously the Fine_Grained step then reused
        # the bucket of an earlier sample (or failed on the first sample).
        page_count = len(sample["evidence_pages"])
        if page_count > 1:
            page_key = "Multi_Page"
        elif page_count == 1:
            page_key = "Single_Page"
        else:
            page_key = None

        # Main_Task
        score_dict["Main_Task"][task_tag] += recall

        # Element_Type
        for element in elements:
            score_dict["Element_Type"][element] += recall

        # Evidence_Pages
        if page_key is not None:
            score_dict["Evidence_Pages"][page_key] += recall

        # Num_of_Element_Types
        if len(sources) > 1:
            score_dict["Num_of_Element_Types"]["Cross_Element"] += recall

        # Fine_Grained (nested dicts are mutated in place, so nothing has to
        # be written back into score_dict afterwards)
        fine_grained = score_dict["Fine_Grained"][task_tag]
        if task_tag in ("Understanding", "Reasoning"):
            if page_key is not None:
                page_bucket = fine_grained[page_key]
                for element in elements:
                    page_bucket[element] += recall
        elif task_tag == "Locating":
            cross_element = fine_grained["Cross_Element"]
            bucket_name = locating_buckets.get(sample["question_type"])
            # Other question types are not tracked at this level.
            if bucket_name is not None:
                cross_element[bucket_name] += recall

    return score_dict

In [17]:
# Load the score template and the per-bucket sample counts.
# The parsed JSON gets a descriptive name instead of `_`: in IPython `_` is the
# "last output" variable, and assigning to it at top level shadows that feature
# for the rest of the session.
with open("evaluation_results/scores_sample_fine_grained.json", "r", encoding="utf-8") as rf:
    fine_grained_template = json.load(rf)
score_dict = fine_grained_template["scores"]
sample_cnt_dict = fine_grained_template["sample_cnt"]

# Recall cutoff; a string because it is used as a key of each sample's recall_at_n dict.
k = '10'
sample_files = os.listdir("retrieval_results/copali_results_num_workers16_add/")
score_dict = calculate_accuracy_fine_grained(sample_files, score_dict, k)

def generalize_score_dict(score_dict, sample_cnt_dict):
    """Divide every leaf of ``score_dict`` by the matching sample count, in place.

    ``sample_cnt_dict`` must mirror the nesting of ``score_dict``: nested dicts
    are walked recursively and every non-dict value is replaced by
    ``value / sample_cnt_dict[key]``. Nothing is returned.
    """
    # Snapshot the keys first; only values of existing keys are replaced.
    for name in list(score_dict):
        entry = score_dict[name]
        if not isinstance(entry, dict):
            score_dict[name] = entry / sample_cnt_dict[name]
            continue
        # Nested level: normalise the sub-dict against the matching counts.
        generalize_score_dict(entry, sample_cnt_dict[name])

# Turn the accumulated sums into per-bucket averages (score_dict is updated in place).
generalize_score_dict(score_dict, sample_cnt_dict)

print("--------------------------------------------------------------")
print(score_dict)

# NOTE(review): the output name says "colqwen2" while the inputs are read from
# a "copali_results..." folder; confirm the file name matches the model evaluated.
output_path = f"evaluation_results/scores_fine_grained_colqwen2_recall{k}.json"
report = {"scores": score_dict, "sample_cnt": sample_cnt_dict}
with open(output_path, "w", encoding="utf-8") as wf:
    json.dump(report, wf, indent=4)

--------------------------------------------------------------
{'Main_Task': {'Understanding': 0.8339786300613992, 'Reasoning': 0.8179922018499307, 'Locating': 0.8173495153457069}, 'Element_Type': {'Text': 0.8135364239592546, 'Layout': 0.7625264478809751, 'Figure': 0.8648138631951583, 'Table': 0.876292775345588}, 'Evidence_Pages': {'Single_Page': 0.939615736505032, 'Multi_Page': 0.7270376686565112}, 'Num_of_Element_Types': {'Cross_Element': 0.830697398835791}, 'Fine_Grained': {'Understanding': {'Single_Page': {'Text': 0.9459459459459459, 'Layout': 0.945054945054945, 'Figure': 0.9893617021276596, 'Table': 0.9467680608365019}, 'Multi_Page': {'Text': 0.7428777455301159, 'Layout': 0.6665635420337884, 'Figure': 0.7597062579821201, 'Table': 0.6834368530020704}}, 'Reasoning': {'Single_Page': {'Text': 0.925, 'Layout': 1.0, 'Figure': 1.0, 'Table': 0.9489795918367347}, 'Multi_Page': {'Text': 0.751511996102789, 'Layout': 0.7825315126050421, 'Figure': 0.73703081232493, 'Table': 0.6749628390932739}

In [18]:
import numpy as np

# Score and sample count ("num") of every fine-grained bucket.
# The Understanding / Reasoning scores match the values printed by the previous
# cell; the "num" values presumably come from sample_cnt_dict (TODO confirm).
# NOTE(review): this rebinds `data`, which earlier held the parsed JSONL
# records; cells that use `data` as that list (e.g. `len(data)`) give different
# results if they are re-run after this cell.
data = {
    "Understanding": {
        "Single_Page": {
            "Text": {"score": 0.9459459459459459, "num": 259},
            "Layout": {"score": 0.945054945054945, "num": 91},
            "Figure": {"score": 0.9893617021276596, "num": 94},
            "Table": {"score": 0.9467680608365019, "num": 263}
        },
        "Multi_Page": {
            "Text": {"score": 0.7428777455301159, "num": 443},
            "Layout": {"score": 0.6665635420337884, "num": 172},
            "Figure": {"score": 0.7597062579821201, "num": 174},
            "Table": {"score": 0.6834368530020704, "num": 115}
        }
    },
    "Reasoning": {
        "Single_Page": {
            "Text": {"score": 0.925, "num": 40},
            "Layout": {"score": 1.0, "num": 12},
            "Figure": {"score": 1.0, "num": 28},
            "Table": {"score": 0.9489795918367347, "num": 98}
        },
        "Multi_Page": {
            "Text": {"score": 0.751511996102789, "num": 115},
            "Layout": {"score": 0.7825315126050421, "num": 40},
            "Figure": {"score": 0.73703081232493, "num": 85},
            "Table": {"score": 0.6749628390932739, "num": 69}
        }
    },
    "Locating": {
        "Cross_Element": {
            "Cross_Title": {"score": 0.6708362947168919, "num": 201},
            "Cross_Table": {"score": 0.8260582010582012, "num": 126},
            "Para_Title": {"score": 0.8112152160134125, "num": 137},
            "Figure_Table": {"score": 0.9437229437229437, "num": 231}
        }
    }
}

# Helper that computes a count-weighted average.
def calculate_weighted_average(elements):
    """Return the mean of the entries' ``score`` values weighted by ``num``.

    ``elements`` maps a label to a dict with a ``"score"`` and a ``"num"`` key.
    Raises ZeroDivisionError when the counts add up to zero (including when
    ``elements`` is empty).
    """
    entries = list(elements.values())
    weighted_sum = sum(entry["score"] * entry["num"] for entry in entries)
    sample_total = sum(entry["num"] for entry in entries)
    return weighted_sum / sample_total

# Compute the overall ("all") value of every task type / page type pair by
# weighting the per-element scores with their sample counts.
results = {
    task_name: {
        page_name: calculate_weighted_average(element_stats)
        for page_name, element_stats in page_groups.items()
    }
    for task_name, page_groups in data.items()
}

# Print the results as percentages with one decimal place.
for task_type, task_data in results.items():
    for page_type, score in task_data.items():
        percent = score * 100
        print(f"{task_type} ({page_type}) all: {percent:.1f}%")


Understanding (Single_Page) all: 95.2%
Understanding (Multi_Page) all: 72.4%
Reasoning (Single_Page) all: 95.5%
Reasoning (Multi_Page) all: 73.4%
Locating (Cross_Element) all: 81.7%


In [21]:
# Print the Understanding samples with multi-page evidence that include a
# Layout evidence source and whose recall@5 is below 0.65.

for sample_file in sample_files:
    with open(f"retrieval_results/copali_results_num_workers16_add/{sample_file}", 'r') as file:
        sample = json.load(file)
    # The guards run in the same order as a single chained `and` condition.
    if sample["task_tag"] != "Understanding":
        continue
    if not (sample["recall_at_n"]["5"] < 0.65):
        continue
    if len(sample["evidence_pages"]) <= 1:
        continue
    if "Layout" not in sample["evidence_sources"]:
        continue
    print(sample_file)
    print(sample)
    print("--------------------------------------------------------------")

free_gpt4o_4129981_47_64_9.json
{'question': "Which chapters contain the term 'assessment' in their titles?", 'top_10_images': ['4129981_1.png', '4129981_4.png', '4129981_51.png', '4129981_12.png', '4129981_47.png', '4129981_56.png', '4129981_21.png', '4129981_26.png', '4129981_37.png', '4129981_63.png'], 'recall_at_n': {'5': 0.5, '10': 0.5, '20': 0.5, '30': 1.0}, 'real_positive_image': ['4129981_51', '4129981_52'], 'question_type': 'extract', 'task_tag': 'Understanding', 'evidence_sources': ['Layout'], 'evidence_pages': [52, 53]}
--------------------------------------------------------------
free_gpt4o_4034595_14.json
{'question': "What is the main purpose of the document titled 'Consultation and Engagement With Tribes and Alaska Native Organizations'?", 'top_10_images': ['4034595_11.png', '4034595_48.png', '4034595_33.png', '4034595_8.png', '4034595_15.png', '4034595_9.png', '4034595_20.png', '4034595_10.png', '4034595_47.png', '4034595_21.png'], 'recall_at_n': {'5': 0.5, '10': 1.0, 

In [20]:
# Show `sample`, which is whatever the sample-filtering loop loaded last (a leftover loop variable).
# NOTE(review): this cell only works after that loop has run — confirm the cell order on a fresh kernel.
sample

{'question': 'The length of strain measuring sensor is obviously shorter than that of temperature measuring sensor, yes or no?',
 'top_10_images': ['4131533_6.png',
  '4131533_37.png',
  '4131533_26.png',
  '4131533_8.png',
  '4131533_16.png',
  '4131533_19.png',
  '4131533_67.png',
  '4131533_23.png',
  '4131533_66.png',
  '4131533_29.png'],
 'recall_at_n': {'5': 1.0, '10': 1.0, '20': 1.0, '30': 1.0},
 'real_positive_image': ['4131533_8'],
 'question_type': 'extract',
 'task_tag': 'Understanding',
 'evidence_sources': ['Text'],
 'evidence_pages': [9]}