In [1]:
import os
import pathlib
import json

In [2]:
%%time
# Input document and the root folder that receives every extractor's output.
pdf_path = "../data/testPDFs/Attention is all you need.pdf"
output_dir = "../data/testPDFOutput"

# PDF file name without its directory or extension; used as the stem of the
# output file names built in the cells below.
pdf_name = os.path.basename(pdf_path)
base_filename, _pdf_ext = os.path.splitext(pdf_name)

# Sub-folders derived from the output root:
#   images_dir       -> passed to PyMuPDF4LLM as the place to write extracted figures
#   paper_output_dir -> receives Marker's Markdown file and its images
images_dir = os.path.join(output_dir, "images")
paper_output_dir = os.path.join(output_dir, f"{base_filename}_marker")

# Create the folders up front; exist_ok=True makes re-running this cell harmless.
for folder in (output_dir, images_dir, paper_output_dir):
    os.makedirs(folder, exist_ok=True)

CPU times: total: 0 ns
Wall time: 2.31 s


# PyMuPDF4LLM

In [1]:
# Import order matters here: the PyMuPDF-Layout documentation shows
# pymupdf.layout being imported BEFORE pymupdf4llm so that the layout-enhanced
# mode is active when pymupdf4llm is loaded. Do not let an import sorter swap
# these two lines.
import pymupdf.layout
import pymupdf4llm

In [None]:
%%time
print(f"Processing {pdf_path} with Layout-Enhanced PyMuPDF4LLM...")

# --- Convert to Markdown ---
# Options handed to pymupdf4llm.to_markdown: extracted figures are written as
# PNG files into images_dir.
# NOTE(review): the "layout-enhanced" behaviour presumably relies on
# pymupdf.layout having been imported in the previous cell — confirm.
markdown_options = {
    "write_images": True,
    "image_path": images_dir,
    "image_format": "png",
}
md_text = pymupdf4llm.to_markdown(pdf_path, **markdown_options)

# --- Save Markdown File ---
# Written in binary mode so the encoded text lands on disk without any
# newline translation.
output_file = os.path.join(output_dir, f"{base_filename}_layout.md")
with open(output_file, "wb") as md_file:
    md_file.write(md_text.encode())

print(f"Saved Markdown to: {output_file}")
print(f"Saved extracted images to: {images_dir}")

# Standard PyMuPDF

In [None]:
import fitz  # Standard PyMuPDF

In [None]:
%%time
print(f"Processing {pdf_path} with standard fitz...")

full_text = []

# --- Extract Text Page by Page ---
# Open the document in a context manager so it is closed as soon as extraction
# finishes (or fails); the original cell left the document open for the life
# of the kernel.
with fitz.open(pdf_path) as doc:
    for page_num, page in enumerate(doc):
        # Page separator line (1-based page number).
        full_text.append(f"--- Page {page_num + 1} ---")

        # "blocks" extraction returns one tuple per block:
        # (x0, y0, x1, y1, "text", block_no, block_type)
        blocks = page.get_text("blocks")

        # Index 4 of each tuple holds the block's text content.
        full_text.extend(block[4] for block in blocks)

# --- Save Text File ---
output_file = os.path.join(output_dir, f"{base_filename}_fitz_raw.txt")
with open(output_file, "w", encoding="utf-8") as f:
    f.write("\n".join(full_text))

print(f"Saved raw text to: {output_file}")

# Unstructured

In [None]:
from unstructured.partition.pdf import partition_pdf

In [None]:
%%time
# --- Partition the PDF ---
# The "hi_res" strategy with table-structure inference is requested; the
# notebook author chose it for two-column academic layouts and notes that it
# is the slower option.
partition_options = {
    "filename": pdf_path,
    "strategy": "hi_res",
    "infer_table_structure": True,
}
elements = partition_pdf(**partition_options)

# --- Save Option A: Human Readable Text ---
# Drop page headers/footers, then join the remaining elements with blank lines.
skipped_categories = ["Header", "Footer"]
body_elements = [el for el in elements if el.category not in skipped_categories]
text_content = "\n\n".join(str(el) for el in body_elements)

text_output_path = os.path.join(output_dir, f"{base_filename}_unstructured.txt")
with open(text_output_path, "w", encoding="utf-8") as text_file:
    text_file.write(text_content)

# --- Save Option B: Structured JSON (Best for programmatic use) ---
# Every element (headers and footers included) is converted to a dictionary
# and dumped as one JSON array.
elements_json = []
for el in elements:
    elements_json.append(el.to_dict())

json_output_path = os.path.join(output_dir, f"{base_filename}_unstructured.json")
with open(json_output_path, "w", encoding="utf-8") as json_file:
    json.dump(elements_json, json_file, indent=2)

print(f"Saved Unstructured output to: {output_dir}")

# Marker

In [None]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

In [None]:
%%time
print("Loading Marker models (this may take time on first run)...")

# --- Load Models & Convert ---
# Build the model artifacts first, then hand them to the converter.
model_artifacts = create_model_dict()
converter = PdfConverter(artifact_dict=model_artifacts)

print(f"Rendering {pdf_path}...")
rendered = converter(pdf_path)

# text_from_rendered yields three values; only the text and the images
# mapping are used here, the middle value is discarded.
text, _, images = text_from_rendered(rendered)

# --- Save Markdown ---
md_output_path = os.path.join(paper_output_dir, f"{base_filename}.md")
with open(md_output_path, "w", encoding="utf-8") as md_file:
    md_file.write(text)

# --- Save Images ---
# `images` maps a file name to an image object exposing .save(); each one is
# written next to the Markdown file so both end up in the same folder.
for filename in images:
    image_path = os.path.join(paper_output_dir, filename)
    images[filename].save(image_path)

print(f"Saved Marker output to: {paper_output_dir}")