### description mining multi-agent 시스템 개발을 위한 예제

#### papers\argyrodite 폴더에 있는 PDF 파일들을 pymupdf와 pymupdf4llm으로 파싱하고, 결과물을 html로 비교하기

In [1]:
import fitz  # PyMuPDF
import pymupdf4llm
import os
import html
import traceback

# Directory scanned for input PDFs, and directory that receives the reports.
input_dir = 'papers/argyrodite'
output_dir = 'comparison_results'

# Page skeleton rendered with str.format(). Literal CSS braces are doubled
# ({{ }}) so they survive formatting; the three substitution fields
# (filename, content_raw, content_llm) must use SINGLE braces, otherwise
# format() emits them verbatim and the report contains no extracted text.
html_template_start = """
<!DOCTYPE html>
<html>
<head>
<style>
    body {{ font-family: Arial, sans-serif; margin: 0; padding: 20px; background: #f4f4f4; }}
    .container {{ display: flex; gap: 20px; height: 95vh; }}
    .column {{ 
        flex: 1; 
        background: white; 
        padding: 20px; 
        border-radius: 8px; 
        box-shadow: 0 2px 5px rgba(0,0,0,0.1); 
        overflow-y: scroll; 
        white-space: pre-wrap;
    }}
    h2 {{ position: sticky; top: 0; background: white; padding: 10px 0; border-bottom: 2px solid #ddd; margin-top: 0; }}
    .raw-text {{ font-family: "Courier New", monospace; font-size: 12px; color: #333; }}
    .markdown-text {{ font-family: "Segoe UI", sans-serif; font-size: 14px; color: #000; }}
</style>
</head>
<body>
<h1>Comparison: {filename}</h1>
<div class="container">
    <div class="column raw-text">
        <h2>PyMuPDF (Raw Text)</h2>
        {content_raw}
    </div>
    <div class="column markdown-text">
        <h2>PyMuPDF4LLM (Markdown)</h2>
        {content_llm}
    </div>
</div>
</body>
</html>
"""


def is_pdf_filename(filename: str) -> bool:
    """Return True when *filename* has a ``.pdf`` extension (any letter case)."""
    return os.path.splitext(filename)[1].lower() == ".pdf"


def comparison_output_name(filename: str) -> str:
    """Return the report file name for *filename*: ``<stem>_compare.html``.

    Only the final extension is stripped, so a ``.pdf`` that happens to occur
    earlier in the name is left untouched.
    """
    return os.path.splitext(filename)[0] + "_compare.html"


def extract_raw_text(pdf_path: str) -> str:
    """Return the ``page.get_text()`` output of every page, concatenated.

    The document is closed even when text extraction raises.
    """
    doc = fitz.open(pdf_path)
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        doc.close()


def build_comparison_html(filename: str, raw_text: str, llm_text: str) -> str:
    """Render the side-by-side comparison page as an HTML string.

    Args:
        filename: Name shown in the page heading.
        raw_text: Text extracted with PyMuPDF.
        llm_text: Markdown produced by pymupdf4llm.

    All three values are HTML-escaped before insertion, so markup-like
    characters in the PDF text or file name are displayed, not interpreted.
    """
    return html_template_start.format(
        filename=html.escape(filename),
        content_raw=html.escape(raw_text),
        content_llm=html.escape(llm_text),
    )


def main() -> None:
    """Write one ``*_compare.html`` report per PDF found in ``input_dir``.

    A failure on one file is reported with its traceback and does not stop
    the remaining files from being processed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing (and log) order is reproducible across runs.
    for filename in sorted(os.listdir(input_dir)):
        if not is_pdf_filename(filename):
            continue

        pdf_path = os.path.join(input_dir, filename)

        try:
            raw_text = extract_raw_text(pdf_path)
            llm_text = pymupdf4llm.to_markdown(
                pdf_path,
                show_progress=False
            )

            final_html = build_comparison_html(filename, raw_text, llm_text)
            save_path = os.path.join(output_dir, comparison_output_name(filename))

            with open(save_path, "w", encoding="utf-8") as f:
                f.write(final_html)

            print(f"생성 완료: {save_path}")

        except Exception as e:
            # Per-file boundary: report the failure and move on to the next PDF.
            print(f"에러 발생 ({filename}): {e}")
            print("Full traceback:")
            traceback.print_exc()
            print("-" * 50)

    print("모든 작업이 완료되었습니다.")


# True both in a notebook kernel and when the file is run as a script.
if __name__ == "__main__":
    main()


Consider using the pymupdf_layout package for a greatly improved page layout analysis.
생성 완료: comparison_results\10.1002pssa.201001117_compare.html
생성 완료: comparison_results\10.1002zaac.201000121_compare.html
생성 완료: comparison_results\10.1016j.jssc.2016.12.001_compare.html
생성 완료: comparison_results\10.1016j.ssi.2012.06.008_compare.html
생성 완료: comparison_results\10.1021acs.chemmater.1c00157_compare.html
생성 완료: comparison_results\10.1021acs.inorgchem.8b02443_compare.html
생성 완료: comparison_results\10.1021acsomega.8b00377_compare.html
생성 완료: comparison_results\10.1021cm300792t_compare.html
생성 완료: comparison_results\10.1021jacs.7b06327_compare.html
생성 완료: comparison_results\10.1021jacs.8b10282_compare.html
생성 완료: comparison_results\10.1039c7ta08581h_compare.html
모든 작업이 완료되었습니다.
