In [1]:
def extract_ocr_text(ocr_page):
    text_boxes = []

    for text_region in ocr_page['information']:
        text_boxes +=  text_region['text_list']

    THRESHOLD = 20

    # 1. 计算每个文本框的 Y 坐标平均值
    for box in text_boxes:
        y_coords = [box["position"][1], box["position"][3], box["position"][5], box["position"][7]]
        box["y_avg"] = np.mean(y_coords)  # 计算 Y 坐标的平均值
        box["x_min"] = min(box["position"][0], box["position"][6])  # 计算左上角 X 坐标

    # 2. 按 Y 坐标排序
    text_boxes = sorted(text_boxes, key=lambda box: box["y_avg"])

    # 3. 根据 Y 坐标将文本框分组
    lines = []
    current_line = [text_boxes[0]]

    for box in text_boxes[1:]:
        # 判断是否属于同一行，基于 Y 坐标平均值
        if abs(box["y_avg"] - current_line[-1]["y_avg"]) < THRESHOLD:
            current_line.append(box)
        else:
            # 保存当前行，开启新的一行
            lines.append(current_line)
            current_line = [box]
    lines.append(current_line)  # 添加最后一行

    # 4. 对每一行的文本框按 X 坐标排序，并拼接文本内容
    result = []
    for line in lines:
        sorted_line = sorted(line, key=lambda box: box["x_min"])
        line_text = ''.join([' '.join(box["content"]) for box in sorted_line])
        result.append(line_text)

    return '\n'.join(result)

In [11]:
import pdfplumber
from pathlib import Path
from tqdm import tqdm_notebook
import json
import numpy as np
import string

pdf_folder = Path('/mnt/disk1/goodweather/tbrain_2024/source/競賽資料集/reference/insurance')
pdfs = sorted(pdf_folder.glob('*.pdf'))
ocr_folder = Path('/mnt/disk1/goodweather/tbrain_2024/AdvancedLiterateMachinery/Applications/DocXChain/output/insurance')
ocrs = sorted(ocr_folder.glob('*.json'))

refrence_text_folder = Path('/mnt/disk1/goodweather/tbrain_2024/source/競賽資料集/reference_text') / pdf_folder.name
refrence_text_folder.mkdir(exist_ok=True, parents=True)

for pdf_file, ocr_file in tqdm_notebook(list(zip(pdfs, ocrs))):
    assert pdf_file.stem == ocr_file.stem
    # print(pdf_file.name)
    # if pdf_file.stem != '709':
    #     continue
    text_file = refrence_text_folder / f'{pdf_file.stem}.txt'
    texts = []
    with pdfplumber.open(pdf_file) as pdf, open(ocr_file, 'r') as f:
        ocr_data = json.load(f)
        
        for i, (page, ocr_page) in enumerate(zip(pdf.pages, ocr_data)):
            texts.append(f'**page {i}**')
            select_text = None

            pdf_page_text = page.extract_text(x_tolerance=3, x_tolerance_ratio=None, y_tolerance=3, layout=False, x_density=7.25, y_density=13)
            ocr_page_text = extract_ocr_text(ocr_page)

            pdf_page_text_cleaned = pdf_page_text.translate(str.maketrans('', '', string.punctuation + ' 　\n'))
            ocr_page_text_cleaned = ocr_page_text.translate(str.maketrans('', '', string.punctuation + ' 　\n'))

            # print(pdf_page_text_cleaned)
            # print(ocr_page_text_cleaned)

            assert len(ocr_page_text_cleaned) > 0

            if len(pdf_page_text_cleaned) == 0:
                select_text = ocr_page_text
            elif len(ocr_page_text_cleaned) > 10 and len(pdf_page_text_cleaned) > 10:
                select_text = pdf_page_text
            elif len(pdf_page_text_cleaned) > 10:
                select_text = pdf_page_text
            else:
                print(pdf_file.stem, i)
                print(pdf_page_text_cleaned)
                print(ocr_page_text_cleaned)
            texts.append(select_text)
            texts.append('\n')
    
    with open(text_file, 'w') as f:
        f.write('\n'.join(texts))
            

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for pdf_file, ocr_file in tqdm_notebook(list(zip(pdfs, ocrs))):


  0%|          | 0/643 [00:00<?, ?it/s]

In [22]:
for pid, contents in pid_map_content.items():
    for i, content in enumerate(contents):
        print(content)
        break
    break


{'question': '什麼是跨境手機掃碼支付?', 'answers': ['允許大陸消費者可以用手機支付寶App在台灣實體商店購買商品或服務']}


In [23]:
with open('/mnt/disk1/goodweather/tbrain_2024/source/競賽資料集/reference/faq/pid_map_content.json', 'r') as f:
    pid_map_content = json.load(f)

refrence_text_folder = Path('/mnt/disk1/goodweather/tbrain_2024/source/競賽資料集/reference_text') / 'faq'
refrence_text_folder.mkdir(exist_ok=True, parents=True)

for pid, contents in pid_map_content.items():
    text_file = refrence_text_folder / f'{pid}.txt'
    texts = []
    for i, content in enumerate(contents):
        question = content['question']
        answers = content['answers']
        texts.append(f'**question {i+1}**')
        texts.append(question)
        texts.append(f'**answer {i+1}**')
        texts.append('\n'.join(answers))
        texts.append('\n')
    with open(text_file, 'w') as f:
        f.write('\n'.join(texts))