In [1]:
import os
from dotenv import load_dotenv

llama_api_key = os.getenv("LLAMA_CLOUD_API_KEY")

if llama_api_key:
    print(f"LlamaCloud API key loaded: {llama_api_key[:10]}...")
else:
    print("Warning: LLAMA_CLOUD_API_KEY not found in environment variables")
    print("Please set it before running parsing examples")

LlamaCloud API key loaded: llx-10Js2i...


In [2]:
from llama_parse import LlamaParse

import json
import asyncio
from pathlib import Path
from typing import List, Dict, Any

import nest_asyncio
nest_asyncio.apply()

print("Imports successful!")
from importlib.metadata import version

print(version("llama-parse"))

Imports successful!
0.4.4


In [3]:
SAMPLE_DIR = Path("./sample_documents")
OUTPUT_DIR = Path("./llamaparse_output")

OUTPUT_DIR.mkdir(exist_ok=True)

if SAMPLE_DIR.exists():
    print("Available sample documents:")
    for file in SAMPLE_DIR.iterdir():
        if file.is_file():
            print(f"   - {file.name} {file.stat().st_size/ 1024:.1f} KB")
else:
    print(f"Sample directory not found: {SAMPLE_DIR}")
    print("Creting sample directory...")
    SAMPLE_DIR.mkdir(exist_ok=True)

Available sample documents:
   - attention_paper.pdf 2163.3 KB
   - docling_paper.pdf 5436.1 KB
   - sample.docx 11.9 KB
   - sample.html 3.9 KB
   - sample.md 3.1 KB
   - sample.xlsx 14.9 KB
   - sample1.docx 36.1 KB
   - sample1.pptx 28.5 KB
   - sample1.xlsx 4.9 KB
   - sample_text_image.png 10.9 KB
   - scansmpl.pdf 21.0 KB


In [4]:
import urllib.request

sample_pdf_url = "https://arxiv.org/pdf/1706.03762.pdf"
sample_pdf_path = SAMPLE_DIR / "attention_paper.pdf"

if not sample_pdf_path.exists():
    try:
        urllib.request.urlretrieve(sample_pdf_url, sample_pdf_path)
        print(f"Downloaded: {sample_pdf_path}")
        print(f"File size: {sample_pdf_path.stat().st_size/1024:.1f} KB")
    except Exception as e:
        print(f"Download failed: {e}")
        print("You can manually download a PDF for testing")
else:
    print(f"Sample PDF already exists: {sample_pdf_path}")

Sample PDF already exists: sample_documents\attention_paper.pdf


In [5]:
parse_basic = LlamaParse(
    result_type="markdown"
)
print("LlamaParse Configuration Parameters:")
print("="*50)

# Key parameters categorized
config_categories = {
    "Authentication": ["api_key", "base_url"],
    "Output Control": ["result_type", "language", "split_by_page"],
    "Mode Selection": ["fast_mode", "premium_mode", "auto_mode", "preset"],
    "OCR & Images": ["disable_ocr", "high_res_ocr", "extract_charts", "take_screenshot"],
    "Layout": ["extract_layout", "do_not_unroll_columns"],
    "Structured Output": ["structured_output", "structured_output_json_schema"],
    "Prompts": ["parsing_instruction", "system_prompt", "formatting_instruction"],
    "Performance": ["num_workers", "job_timeout_in_seconds", "verbose"],
}

for category, params in config_categories.items():
    print(f"\n{category}:")
    for param in params:
        print(f"  - {param}")

LlamaParse Configuration Parameters:

Authentication:
  - api_key
  - base_url

Output Control:
  - result_type
  - language
  - split_by_page

Mode Selection:
  - fast_mode
  - premium_mode
  - auto_mode
  - preset

OCR & Images:
  - disable_ocr
  - high_res_ocr
  - extract_charts
  - take_screenshot

Layout:
  - extract_layout
  - do_not_unroll_columns

Structured Output:
  - structured_output
  - structured_output_json_schema

Prompts:
  - parsing_instruction
  - system_prompt
  - formatting_instruction

Performance:
  - num_workers
  - job_timeout_in_seconds
  - verbose


In [6]:
parser_markdown = LlamaParse(
    result_type="markdown",
    verbose=True
)

parser_text = LlamaParse(
    result_type="text",
    verbose=True
)

print("Markdown Parser Configuration:")
print(f"  Result Type: {parser_markdown.result_type}")

print("\nText Parser Configuration:")
print(f"  Result Type: {parser_text.result_type}")

Markdown Parser Configuration:
  Result Type: ResultType.MD

Text Parser Configuration:
  Result Type: ResultType.TXT


In [8]:
job_result_methods = {
    "get_markdown_documents(split_by_page)": "Get LlamaIndex Documents with markdown content",
    "get_text_documents(split_by_page)": "Get LlamaIndex Documents with text content",
    "get_image_documents()": "Get extracted images as documents",
    "aget_image_documents()": "Async version with download capabilities",
    "get_json()": "Get structured JSON output",
    ".pages": "Direct access to page-by-page data",
    ".text": "Full document as plain text",
    ".md": "Full document as markdown",
}

print("JobResult Access Methods:")
print("="*60)
for method, description in job_result_methods.items():
    print(f"\n{method}")
    print(f"  → {description}")

JobResult Access Methods:

get_markdown_documents(split_by_page)
  → Get LlamaIndex Documents with markdown content

get_text_documents(split_by_page)
  → Get LlamaIndex Documents with text content

get_image_documents()
  → Get extracted images as documents

aget_image_documents()
  → Async version with download capabilities

get_json()
  → Get structured JSON output

.pages
  → Direct access to page-by-page data

.text
  → Full document as plain text

.md
  → Full document as markdown


In [9]:
def sync_parse_example(file_path: str):
    parser = LlamaParse(result_type="markdown")
    documents = parser.load_data(file_path)
    return documents

async def async_parse_example(file_path: str):
    parser = LlamaParse(result_type="markdown")
    documents = await parser.aload_data(file_path)
    return documents

print("Sync and Async patterns defined.")
print("\nSync: Use load_data(), parse() for simple scripts")
print("Async: Use aload_data(), aparse() for concurrent processing")

Sync and Async patterns defined.

Sync: Use load_data(), parse() for simple scripts
Async: Use aload_data(), aparse() for concurrent processing


In [10]:
parser = LlamaParse(
    result_type="markdown",
    verbose=True,
)
pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

if Path(pdf_path).exists():
    print(f"Parsing: {pdf_path}")
    documents = parser.load_data(pdf_path)

    print(f"\nParsing complete!")
    print(f"Number of documents returned: {len(documents)}")

    if documents:
        doc = documents[0]
        print(f"Document type: {type(doc).__name__}")
        print(f"Content length: {len(doc.text)} charactors")
        print(f"\nFirst 1000 charactors:")
        print("="*60)
        print(doc.text[:1000])
else:
    print(f"File not found: {pdf_path}")
    print("Please run the download cell first or provide your own PDF.")

Parsing: sample_documents\attention_paper.pdf
Started parsing the file under job_id 1bcf6a2f-2a2c-463b-8bdd-b750a1e2b4d8

Parsing complete!
Number of documents returned: 1
Document type: Document
Content length: 44337 charactors

First 1000 charactors:
arXiv:1706.03762v7 [cs.CL] 2 Aug 2023

# Attention Is All You Need

Ashish Vaswani∗  Noam Shazeer∗        Niki Parmar∗  Jakob Uszkoreit∗

Google Brain         Google Brain     Google Research    Google Research

avaswani@google.com    noam@google.com    nikip@google.com    usz@google.com

Llion Jones∗     Aidan N. Gomez∗ †         Łukasz Kaiser∗

Google Research    University of Toronto          Google Brain

llion@google.com    aidan@cs.toronto.edu    lukaszkaiser@google.com

Illia Polosukhin∗ ‡

illia.polosukhin@gmail.com

# Abstract

The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and dec

In [11]:
parser_text = LlamaParse(
    result_type="text",  # Plain text output (no formatting)
    verbose=True,
)

pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

if Path(pdf_path).exists():
    print("Parsing with TEXT output...")
    documents_text = parser_text.load_data(pdf_path)
    
    if documents_text:
        print(f"\nText output preview (first 800 chars):")
        print("=" * 50)
        print(documents_text[0].text[:800])
else:
    print(f"File not found: {pdf_path}")

Parsing with TEXT output...
Started parsing the file under job_id c42fd231-9e08-4984-a764-05fa8a7a7b15

Text output preview (first 800 chars):
    arXiv:1706.03762v7 [cs.CL] 2 Aug 2023

  Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
    scholarly works.

    Attention Is All You Need

    Ashish Vaswani∗  Noam Shazeer∗        Niki Parmar∗  Jakob Uszkoreit∗
    Google Brain         Google Brain     Google Research    Google Research
avaswani@google.com    noam@google.com    nikip@google.com    usz@google.com

    Llion Jones∗     Aidan N. Gomez∗ †         Łukasz Kaiser∗
Google Research    University of Toronto          Google Brain
llion@google.com    aidan@cs.toronto.edu    lukaszkaiser@google.com

                     Illia Polosukhin∗ ‡
                     illia.polosukhin@gmail.com

                                 


In [13]:
# parser = LlamaParse(
#     result_type="markdown",
#     verbose=True,
# )

# pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

# if Path(pdf_path).exists():
#     print("Using parse() method for detailed results...")

#     job_results = parser.parse([pdf_path])

#     if job_results:
#         result = job_results[0]
#         print(f"\nJobResult type: {type(result).__name__}")
#         print(f"Number of pages: {len(result.pages)}")

#         print(f"\nPage 1 content (first 500 chars):")
#         print("=" * 50)
#         if result.pages:
#             page1 = result.pages[0]
#             print(page1.md[:500] if page1.md else page1.text[:500])
#     else:
#         print(f"File not found: {pdf_path}")


In [18]:
parser_text = LlamaParse(
    result_type="markdown", split_by_page=True
)

pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

if Path(pdf_path).exists():
    document = parser_text.load_data(pdf_path)
    
    print(f"Total documents (pages): {len(documents)}")
    print("="*50)

    for i, doc in enumerate(document[:5]):
        print(f"\nDocuments {i+1} (Page {i+1}):")
        print(f"   Length: {len(doc.text)} chars")
        print(f"   Metadata: {doc.metadata}")
        preview = doc.text[:150].replace('\n', ' ')
        print(f"   Preview: {preview}...")
else:
    print(f"File not found: {pdf_path}")

Started parsing the file under job_id 2667f2f6-4c35-4590-a216-87747a5de911
Total documents (pages): 1

Documents 1 (Page 1):
   Length: 44337 chars
   Metadata: {}
   Preview: arXiv:1706.03762v7 [cs.CL] 2 Aug 2023  # Attention Is All You Need  Ashish Vaswani∗  Noam Shazeer∗        Niki Parmar∗  Jakob Uszkoreit∗  Google Brain...


In [17]:
parser = LlamaParse(
    result_type="markdown",
    split_by_page=False,      # Single document output
    page_separator="\n\n---PAGE BREAK---\n\n",  # Custom separator between pages
    page_prefix="[Page Start]",   # Prefix for each page
    page_suffix="[Page End]",     # Suffix for each page
    verbose=False,
)

pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

if Path(pdf_path).exists():
    documents = parser.load_data(pdf_path)
    
    if documents:
        # Find page separators in output
        content = documents[0].text
        
        print("Custom page formatting applied:")
        print("=" * 50)
        
        # Show a section with page break
        if "---PAGE BREAK---" in content:
            # Find first page break
            break_pos = content.find("---PAGE BREAK---")
            start = max(0, break_pos - 200)
            end = min(len(content), break_pos + 250)
            print(content[start:end])
        else:
            print("Page separators may not be visible in output")
            print(f"\nFirst 500 chars:")
            print(content[:500])
else:
    print(f"File not found: {pdf_path}")

Custom page formatting applied:
Page separators may not be visible in output

First 500 chars:
arXiv:1706.03762v7 [cs.CL] 2 Aug 2023

# Attention Is All You Need

Ashish Vaswani∗  Noam Shazeer∗        Niki Parmar∗  Jakob Uszkoreit∗

Google Brain         Google Brain     Google Research    Google Research

avaswani@google.com    noam@google.com    nikip@google.com    usz@google.com

Llion Jones∗     Aidan N. Gomez∗ †         Łukasz Kaiser∗

Google Research    University of Toronto          Google Brain

llion@google.com    aidan@cs.toronto.edu    lukaszkaiser@google.com

Illia Polosukhin∗ 


In [19]:
parser_fast = LlamaParse(
    preset="fast",  # Quick extraction without OCR
    result_type="text",
    verbose=True,
)

pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

if Path(pdf_path).exists():
    print("Parsing with FAST preset...")
    print("(Best for: digital PDFs with selectable text)")
    print("-" * 50)
    
    import time
    start = time.time()
    documents = parser_fast.load_data(pdf_path)
    elapsed = time.time() - start
    
    print(f"\nParsing time: {elapsed:.2f} seconds")
    print(f"Content length: {len(documents[0].text)} chars")
    print(f"\nPreview:")
    print(documents[0].text[:500])
else:
    print(f"File not found: {pdf_path}")

Parsing with FAST preset...
(Best for: digital PDFs with selectable text)
--------------------------------------------------
Started parsing the file under job_id 5674f2aa-6eca-43a8-995f-f6166f852c3e

Parsing time: 10.67 seconds
Content length: 46157 chars

Preview:
    arXiv:1706.03762v7 [cs.CL] 2 Aug 2023

  Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
    scholarly works.

    Attention Is All You Need

    Ashish Vaswani∗  Noam Shazeer∗        Niki Parmar∗  Jakob Uszkoreit∗
    Google Brain         Google Brain     Google Research    Google Research
avaswani@google.com    noam@google.com    nikip@google.com    usz@google.com

    Llion Jones∗ 


In [20]:

# Premium preset - best quality parsing

parser_premium = LlamaParse(
    preset="premium",  # Best quality, uses advanced models
    result_type="markdown",
    verbose=True,
)

pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

if Path(pdf_path).exists():
    print("Parsing with PREMIUM preset...")
    print("(Best for: complex layouts, scanned documents)")
    print("-" * 50)
    
    import time
    start = time.time()
    documents = parser_premium.load_data(pdf_path)
    elapsed = time.time() - start
    
    print(f"\nParsing time: {elapsed:.2f} seconds")
    print(f"Content length: {len(documents[0].text)} chars")
    print(f"\nPreview:")
    print(documents[0].text[:500])
else:
    print(f"File not found: {pdf_path}")

Parsing with PREMIUM preset...
(Best for: complex layouts, scanned documents)
--------------------------------------------------
Started parsing the file under job_id 151c810c-84da-4e6f-8231-3ce26e952f60

Parsing time: 9.06 seconds
Content length: 44337 chars

Preview:
arXiv:1706.03762v7 [cs.CL] 2 Aug 2023

# Attention Is All You Need

Ashish Vaswani∗  Noam Shazeer∗        Niki Parmar∗  Jakob Uszkoreit∗

Google Brain         Google Brain     Google Research    Google Research

avaswani@google.com    noam@google.com    nikip@google.com    usz@google.com

Llion Jones∗     Aidan N. Gomez∗ †         Łukasz Kaiser∗

Google Research    University of Toronto          Google Brain

llion@google.com    aidan@cs.toronto.edu    lukaszkaiser@google.com

Illia Polosukhin∗ 


In [21]:
# Scientific preset - optimized for research papers

parser_scientific = LlamaParse(
    preset="scientific",  # Handles LaTeX, equations, citations
    result_type="markdown",
    verbose=True,
)

pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

if Path(pdf_path).exists():
    print("Parsing with SCIENTIFIC preset...")
    print("(Best for: academic papers, equations, citations)")
    print("-" * 50)
    
    import time
    start = time.time()
    documents = parser_scientific.load_data(pdf_path)
    elapsed = time.time() - start
    
    print(f"\nParsing time: {elapsed:.2f} seconds")
    print(f"Content length: {len(documents[0].text)} chars")
    print(f"\nPreview (look for equations and formatting):")
    print(documents[0].text[:800])
else:
    print(f"File not found: {pdf_path}")

Parsing with SCIENTIFIC preset...
(Best for: academic papers, equations, citations)
--------------------------------------------------
Started parsing the file under job_id 059b0c2b-b85c-41cb-8788-15e87539ea3f

Parsing time: 9.35 seconds
Content length: 44337 chars

Preview (look for equations and formatting):
arXiv:1706.03762v7 [cs.CL] 2 Aug 2023

# Attention Is All You Need

Ashish Vaswani∗  Noam Shazeer∗        Niki Parmar∗  Jakob Uszkoreit∗

Google Brain         Google Brain     Google Research    Google Research

avaswani@google.com    noam@google.com    nikip@google.com    usz@google.com

Llion Jones∗     Aidan N. Gomez∗ †         Łukasz Kaiser∗

Google Research    University of Toronto          Google Brain

llion@google.com    aidan@cs.toronto.edu    lukaszkaiser@google.com

Illia Polosukhin∗ ‡

illia.polosukhin@gmail.com

# Abstract

The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. 

In [22]:
parser_invoice = LlamaParse(
    preset="invoice",  # Handles invoices, receipts
    result_type="markdown",
    verbose=True,
)

# Note: For best results, use this with actual invoice documents
# This example shows the configuration

print("Invoice Preset Configuration:")
print("=" * 50)
print("Best for: Invoices, receipts, financial documents")
print("Features:")
print("  - Optimized for tabular data (line items)")
print("  - Extracts dates, amounts, totals")
print("  - Handles various invoice formats")
print("\nUsage:")
print('  parser = LlamaParse(preset="invoice")')
print('  docs = parser.load_data("invoice.pdf")')


Invoice Preset Configuration:
Best for: Invoices, receipts, financial documents
Features:
  - Optimized for tabular data (line items)
  - Extracts dates, amounts, totals
  - Handles various invoice formats

Usage:
  parser = LlamaParse(preset="invoice")
  docs = parser.load_data("invoice.pdf")


In [23]:
# Slides preset - optimized for presentations

parser_slides = LlamaParse(
    preset="slides",  # Optimized for PowerPoint/presentations
    result_type="markdown",
    verbose=True,
)

# Note: Use with PPTX files for best results

print("Slides Preset Configuration:")
print("=" * 50)
print("Best for: PowerPoint, Google Slides, Keynote")
print("Features:")
print("  - Preserves slide structure")
print("  - Extracts speaker notes")
print("  - Handles diagrams and charts")
print("  - Maintains bullet point hierarchy")
print("\nUsage:")
print('  parser = LlamaParse(preset="slides")')
print('  docs = parser.load_data("presentation.pptx")')

Slides Preset Configuration:
Best for: PowerPoint, Google Slides, Keynote
Features:
  - Preserves slide structure
  - Extracts speaker notes
  - Handles diagrams and charts
  - Maintains bullet point hierarchy

Usage:
  parser = LlamaParse(preset="slides")
  docs = parser.load_data("presentation.pptx")


In [None]:
import time
presets_to_compare = ["fast","balanced", "premium"]
result = {}

pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

if Path(pdf_path).exists():
    print("Comparing present on the same documents...")
    print("=" * 60)

    for preset in presets_to_compare:
        print(f"\nParsing with '{preset}' preset...")

        parser = LlamaParse(
            preset=preset,
            result_type="text",
            verbose=False,
        )
        start = time.time()
        docs = parser.load_data(pdf_path)
        elapsed = time.time() - start
        print(docs)
        result[preset] = {
            "time":elapsed,
            "length":len(docs[0].text),
            "preview":docs[0].text[:200]
        }
        print(f"   Time: {elapsed:.2f}s, Content:{len(docs[0].text)}")
    print("="*60)
    print("\nSummary of preset comparison:")
    print("=" * 60)
    print(f"{'Preset':<12} {'Time (s)':<12} {'Content Length':<15}")
    print('-'*40)
    for preset, data in result.items():
        print(f"{preset:<12} {data['time']:<12.2f} {data['length']:<15}")
else:
    print(f"File not found: {pdf_path}")



Comparing present on the same documents...

Parsing with 'fast' preset...
[Document(id_='c36bca62-67e6-4d62-901b-04f7ccb996ab', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='    arXiv:1706.03762v7 [cs.CL] 2 Aug 2023\n\n  Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\n    scholarly works.\n\n    Attention Is All You Need\n\n    Ashish Vaswani∗  Noam Shazeer∗        Niki Parmar∗  Jakob Uszkoreit∗\n    Google Brain         Google Brain     Google Research    Google Research\navaswani@google.com    noam@google.com    nikip@google.com    usz@google.com\n\n    Llion Jones∗     Aidan N. Gomez∗ †         Łukasz Kaiser∗\nGoogle Research    University of Toronto          Google Brain\nllion@google.com    aidan@cs.toronto.edu    lukaszkaiser@google.com\n\n                     Illia Polosukhin∗ ‡\n                    

In [32]:
# Fast mode - skips OCR for faster processing

parser_fast = LlamaParse(
    fast_mode=True,  # Skip OCR, extract text only
    result_type="text",
    verbose=True,
)

pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

if Path(pdf_path).exists():
    print("FAST MODE Parsing")
    print("=" * 50)
    print("Characteristics:")
    print("  - No OCR processing")
    print("  - Extracts only embedded/selectable text")
    print("  - Fastest processing time")
    print("  - Best for: Digital PDFs with selectable text")
    print("-" * 50)
    
    import time
    start = time.time()
    documents = parser_fast.load_data(pdf_path)
    elapsed = time.time() - start
    
    print(f"\nTime: {elapsed:.2f}s")
    print(f"Content: {len(documents[0].text)} chars")
else:
    print(f"File not found: {pdf_path}")

FAST MODE Parsing
Characteristics:
  - No OCR processing
  - Extracts only embedded/selectable text
  - Fastest processing time
  - Best for: Digital PDFs with selectable text
--------------------------------------------------
Started parsing the file under job_id 6fb54617-954d-45bf-819d-ec23f0c90a8b

Time: 14.15s
Content: 46157 chars


In [33]:
parser_premium = LlamaParse(
    premium_mode=True,  # Use best available parser
    result_type="text",
    verbose=True,
)

pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

if Path(pdf_path).exists():
    print("PREMIUM MODE Parsing")
    print("=" * 50)
    print("Characteristics:")
    print("  - Uses advanced AI models")
    print("  - Better table extraction")
    print("  - Improved layout understanding")
    print("  - Best for: Complex documents, financial reports")
    print("-" * 50)
    
    import time
    start = time.time()
    documents = parser_premium.load_data(pdf_path)
    elapsed = time.time() - start
    
    print(f"\nTime: {elapsed:.2f}s")
    print(f"Content: {len(documents[0].text)} chars")
else:
    print(f"File not found: {pdf_path}")

PREMIUM MODE Parsing
Characteristics:
  - Uses advanced AI models
  - Better table extraction
  - Improved layout understanding
  - Best for: Complex documents, financial reports
--------------------------------------------------
Started parsing the file under job_id ceb9d559-68c6-4aff-a9a7-8355db0bc296

Time: 8.99s
Content: 46157 chars


In [34]:
parser_auto = LlamaParse(
    auto_mode=True,  # Enable automatic mode selection
    
    # Trigger conditions for upgrading to premium parsing
    auto_mode_trigger_on_image_in_page=True,   # Upgrade pages with images
    auto_mode_trigger_on_table_in_page=True,   # Upgrade pages with tables
    
    result_type="text",
    verbose=True,
)

pdf_path = str(SAMPLE_DIR / "attention_paper.pdf")

if Path(pdf_path).exists():
    print("AUTO MODE with Triggers")
    print("=" * 50)
    print("Configuration:")
    print("  - auto_mode: True")
    print("  - Trigger on images: True")
    print("  - Trigger on tables: True")
    print("\nBehavior:")
    print("  - Simple text pages → Fast parsing")
    print("  - Pages with images/tables → Premium parsing")
    print("-" * 50)
    
    import time
    start = time.time()
    documents = parser_auto.load_data(pdf_path)
    elapsed = time.time() - start
    
    print(f"\nTime: {elapsed:.2f}s")
    print(f"Content: {len(documents[0].text)} chars")
else:
    print(f"File not found: {pdf_path}")

AUTO MODE with Triggers
Configuration:
  - auto_mode: True
  - Trigger on images: True
  - Trigger on tables: True

Behavior:
  - Simple text pages → Fast parsing
  - Pages with images/tables → Premium parsing
--------------------------------------------------
Started parsing the file under job_id 8f56cef8-7380-4b1f-aca0-5e568b96897c

Time: 9.00s
Content: 46157 chars
