In [None]:
import os
import json
from tqdm import tqdm

from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

# Directory holding the source PDFs to convert.
base_path = './data/附件3/'

# One reader instance is shared by every file in the loop.
reader1 = FileBasedDataReader("")

# Convert only regular files with a .pdf extension: sub-directories and stray
# files (e.g. .DS_Store) would otherwise be handed to the PDF parser.
# Sorting makes the processing order reproducible, since os.listdir returns
# entries in arbitrary order.
pdf_file_names = sorted(
    entry
    for entry in os.listdir(base_path)
    if entry.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(base_path, entry))
)

for file_name in tqdm(pdf_file_names, desc="transforming pdf"):
    pdf_file_name = os.path.join(base_path, file_name)
    # splitext strips only the final extension, so "report.v2.pdf" becomes
    # "report.v2" instead of colliding with every other "report.*.pdf".
    name_without_suff = os.path.splitext(file_name)[0]

    # Per-document output directories for extracted images and for the
    # Markdown / JSON results.
    local_image_dir = f"./data/transform/{name_without_suff}/image/"
    local_md_dir = f"./data/transform/{name_without_suff}/md/"
    os.makedirs(local_image_dir, exist_ok=True)
    os.makedirs(local_md_dir, exist_ok=True)

    image_writer = FileBasedDataWriter(local_image_dir)
    md_writer = FileBasedDataWriter(local_md_dir)

    # Load the PDF bytes and wrap them in a dataset.
    pdf_bytes = reader1.read(pdf_file_name)
    ds = PymuDocDataset(pdf_bytes)

    # Run the analysis in OCR mode when the dataset classifies itself as
    # needing OCR, otherwise in text mode.
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = ds.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    # Optional debug renderings can be produced here if needed, e.g.
    # infer_result.draw_model(...), pipe_result.draw_layout(...),
    # pipe_result.draw_span(...), each targeting a PDF under local_md_dir.

    # These two values are not used below; they are kept as module-level
    # names because later code outside this section may read them.
    model_inference_result = infer_result.get_infer_res()
    md_content = pipe_result.get_markdown(local_image_dir)

    # Write the Markdown and the content-list JSON through md_writer.
    # NOTE(review): local_image_dir is passed as the image directory; whether
    # the resulting image links resolve relative to local_md_dir should be
    # confirmed against the magic_pdf documentation.
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", local_image_dir)
    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", local_image_dir)