In [5]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser

# Options handed to marker's ConfigParser. Only real marker settings belong
# here; add further keys as needed.
config = {
    "output_format": "json",
}
config_parser = ConfigParser(config)

# Every component is derived from the same parser so that the processors,
# renderer and (optional) LLM service all agree with `config`.
converter = PdfConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
    processor_list=config_parser.get_processors(),
    renderer=config_parser.get_renderer(),
    llm_service=config_parser.get_llm_service()
)


Loaded layout model s3://layout/2025_02_18 on device cuda with dtype torch.float16
Loaded texify model s3://texify/2025_02_18 on device cuda with dtype torch.float16
Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded table recognition model s3://table_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://inline_math_detection/2025_02_18 on device cuda with dtype torch.float16


In [8]:
# Run the converter on one sample contract; `rendered` holds the result
# inspected in the following cells.
sample_contract_pdf = r"/home/eyhyd/contracts_v3/Sample Agreements/Celebrity Contract_3.pdf"
rendered = converter(sample_contract_pdf)

Recognizing layout: 100%|██████████| 3/3 [00:02<00:00,  1.29it/s]
Running OCR Error Detection: 100%|██████████| 2/2 [00:00<00:00, 83.55it/s]
Detecting bboxes: 100%|██████████| 5/5 [00:01<00:00,  3.72it/s]
Recognizing Text: 100%|██████████| 9/9 [00:07<00:00,  1.22it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  6.43it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  3.36it/s]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  6.66it/s]


In [11]:
from pprint import pprint
# Pretty-print the complete converter result for inspection.
# NOTE(review): this dumps every block of the document (ids, html, polygons,
# nested children) into the cell output; consider printing a short summary
# such as the block types per page instead.
pprint(rendered)

JSONOutput(children=[JSONBlockOutput(id='/page/0/Page/109', block_type='Page', html="<content-ref src='/page/0/SectionHeader/0'></content-ref><content-ref src='/page/0/Text/1'></content-ref><content-ref src='/page/0/Text/2'></content-ref><content-ref src='/page/0/ListItem/3'></content-ref><content-ref src='/page/0/Text/4'></content-ref><content-ref src='/page/0/ListItem/5'></content-ref><content-ref src='/page/0/Text/6'></content-ref><content-ref src='/page/0/Text/7'></content-ref><content-ref src='/page/0/Text/8'></content-ref><content-ref src='/page/0/Text/9'></content-ref><content-ref src='/page/0/Text/10'></content-ref><content-ref src='/page/0/Text/11'></content-ref><content-ref src='/page/0/Text/12'></content-ref><content-ref src='/page/0/Text/13'></content-ref><content-ref src='/page/0/PageFooter/14'></content-ref><content-ref src='/page/0/Text/15'></content-ref>", polygon=[[0.0, 0.0], [1242.0, 0.0], [1242.0, 1904.0], [0.0, 1904.0]], bbox=[0.0, 0.0, 1242.0, 1904.0], children=[JS

Loaded layout model s3://layout/2025_02_18 on device cuda with dtype torch.float16
Loaded texify model s3://texify/2025_02_18 on device cuda with dtype torch.float16
Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded table recognition model s3://table_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://inline_math_detection/2025_02_18 on device cuda with dtype torch.float16
Converting PDF: /home/eyhyd/contracts_v3/Sample Agreements/Celebrity Contract_3.pdf


Recognizing layout: 100%|██████████| 3/3 [00:01<00:00,  2.67it/s]
Running OCR Error Detection: 100%|██████████| 2/2 [00:00<00:00, 265.73it/s]
Detecting bboxes: 100%|██████████| 5/5 [00:01<00:00,  3.95it/s]
Recognizing Text: 100%|██████████| 9/9 [00:07<00:00,  1.18it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  8.51it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  3.28it/s]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  7.01it/s]


Extracted 2 sections
Results saved to sections_content.json and sections_content.txt


"<content-ref src='/page/0/SectionHeader/0'></content-ref><content-ref src='/page/0/Text/1'></content-ref><content-ref src='/page/0/Text/2'></content-ref><content-ref src='/page/0/ListItem/3'></content-ref><content-ref src='/page/0/Text/4'></content-ref><content-ref src='/page/0/ListItem/5'></content-ref><content-ref src='/page/0/Text/6'></content-ref><content-ref src='/page/0/Text/7'></content-ref><content-ref src='/page/0/Text/8'></content-ref><content-ref src='/page/0/Text/9'></content-ref><content-ref src='/page/0/Text/10'></content-ref><content-ref src='/page/0/Text/11'></content-ref><content-ref src='/page/0/Text/12'></content-ref><content-ref src='/page/0/Text/13'></content-ref><content-ref src='/page/0/PageFooter/14'></content-ref><content-ref src='/page/0/Text/15'></content-ref>"

In [37]:
import gc
import os
from pathlib import Path
import torch
from typing import Dict, Any, List
import logging
from backend.backend.Doc_Processor.processors.base_processor import BaseProcessor
from tqdm.auto import tqdm

# Import marker libraries
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Report once, when this cell runs, whether torch can see a CUDA device.
cuda_available = torch.cuda.is_available()
print("Torch CUDA available: ", cuda_available)

# Logger used by PDFProcessor below.
logger = logging.getLogger(name=__name__)


class PDFProcessor(BaseProcessor):
    """Extract text from PDF files with marker's ``PdfConverter``.

    The converter is created once in ``__init__`` and reused by every
    ``process`` call.

    Config keys read by this class:
        language: required (checked by ``_validate_config``).
        chunk_size: stored on the instance; defaults to 10.
        save_processed_files: whether ``process`` also writes a ``.txt`` copy
            of the extracted text; defaults to True.
        save_processed_files_dir: directory for that copy; defaults to
            ``"processed_files"``.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """Store settings and build the marker converter.

        Args:
            config: Processor settings (see the class docstring). ``None`` is
                treated as an empty dict.
        """
        # The signature allows None, so normalise it before calling .get();
        # otherwise a missing config fails with an AttributeError on None.
        if config is None:
            config = {}
        super().__init__(config)

        # max_workers and chunk_size are not read anywhere else in this class.
        self.max_workers = min(32, (os.cpu_count() or 1) + 4)
        self.chunk_size = config.get("chunk_size", 10)
        self.save_processed_files = config.get("save_processed_files", True)
        self.save_processed_files_dir = config.get("save_processed_files_dir", "processed_files")

        # Initialize marker PDF converter (creates the model dict once per instance).
        self.converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

    def _validate_config(self) -> None:
        """Check that every required key is present in ``self.config``.

        Raises:
            ValueError: if a required key is missing.
        """
        # NOTE(review): self.config is presumably set by BaseProcessor.__init__ - confirm.
        required_keys = ["language"]
        missing = [key for key in required_keys if key not in self.config]
        if missing:
            raise ValueError(f"Missing required config keys: {missing}")

    def process(self, file_path: Path) -> Dict[str, Any]:
        """Convert one PDF and return its text chunks, metadata and images.

        Args:
            file_path: Location of the PDF. A plain string is accepted and
                converted to ``Path``.

        Returns:
            A dict with ``"content"`` (list of chunk dicts), ``"metadata"``
            and ``"images"``. If the conversion fails, ``"content"`` is empty,
            ``"images"`` is an empty dict and an ``"error"`` message is added;
            the exception itself is not re-raised.
        """
        # .stem / .name / .absolute() below need a Path, not a str.
        file_path = Path(file_path)
        try:
            print("Processing PDF file:", file_path)
            logger.info(f"Processing PDF file: {file_path}")

            # Use marker to convert PDF; the middle return value is not needed here.
            rendered = self.converter(str(file_path))
            text, _, images = text_from_rendered(rendered)

            # NOTE(review): splitting on blank lines yields paragraph-like
            # chunks, not real PDF pages, so "page" numbers and the "pages"
            # count in the metadata reflect chunks. Confirm whether true page
            # boundaries are required (marker offers output pagination).
            text_pages = text.split("\n\n")

            pages_content = []
            for page_num, page_text in enumerate(text_pages):
                try:
                    pages_content.append(
                        self._create_page_content(page_text.strip(), "marker", page_num)
                    )
                except Exception as e:
                    logger.error(f"Page {page_num} failed: {str(e)}")
                    pages_content.append(self._create_error_page(page_num, str(e)))

            # Save content if configured
            if self.save_processed_files:
                self._save_content(
                    {"content": pages_content},
                    self.save_processed_files_dir,
                    file_path.stem,
                )

            return {
                "content": pages_content,
                "metadata": self._get_metadata(file_path, len(text_pages)),
                "images": images,
            }
        except Exception as e:
            # logger.exception keeps the traceback; the returned dict only carries str(e).
            logger.exception(f"PDF processing failed: {str(e)}")
            return {
                "content": [],
                "metadata": self._get_metadata(file_path, 0),
                # Same keys as the success path so callers can read "images" unconditionally.
                "images": {},
                "error": str(e),
            }
        finally:
            gc.collect()

    def _save_content(self, content: Dict[str, Any], output_dir: str, file_name: str) -> None:
        """Write the text of every chunk in ``content["content"]`` to ``<output_dir>/<file_name>.txt``.

        The directory is created if needed; chunks are separated by a blank line.
        """
        output_dir_path = Path(output_dir)
        output_dir_path.mkdir(parents=True, exist_ok=True)

        text_file = output_dir_path / f"{file_name}.txt"
        logger.info(f"Saving text to: {text_file}")

        # Explicit UTF-8: the platform default encoding may not be able to
        # represent every character in the extracted text.
        with open(text_file, "w", encoding="utf-8") as f:
            for page in content["content"]:
                f.write(page["text"])
                f.write("\n\n")

        print(f"Text saved to: {text_file}")

    def _create_page_content(self, text: str, source: str, page_num: int) -> Dict[str, Any]:
        """Build the dict for one successfully extracted chunk."""
        return {
            "text": text,
            "source": source,
            "page": page_num,
        }

    def _create_error_page(self, page_num: int, error: str) -> Dict[str, Any]:
        """Build the placeholder dict for a chunk that failed; its text is empty."""
        return {"text": "", "error": error, "page": page_num}

    def _get_metadata(self, file_path: Path, page_count: int) -> Dict[str, Any]:
        """Describe the source file: chunk count, format, file name and absolute path."""
        return {
            "pages": page_count,
            "format": "PDF",
            "filename": file_path.name,
            "file_path": str(file_path.absolute()),
        }

Torch CUDA available:  True


In [38]:
# Create the processor with the one required setting ("language"), then run it
# on the sample contract. `output` is inspected in the next cell.
processor = PDFProcessor(config=dict(language="en"))
pdf_path = Path(r"/home/eyhyd/contracts_v3/Sample Agreements/Celebrity Contract_3.pdf")
output = processor.process(pdf_path)

Loaded layout model s3://layout/2025_02_18 on device cuda with dtype torch.float16
Loaded texify model s3://texify/2025_02_18 on device cuda with dtype torch.float16
Loaded recognition model s3://text_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded table recognition model s3://table_recognition/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_18 on device cuda with dtype torch.float16
Loaded detection model s3://inline_math_detection/2025_02_18 on device cuda with dtype torch.float16


INFO:__main__:Processing PDF file: /home/eyhyd/contracts_v3/Sample Agreements/Celebrity Contract_3.pdf


Processing PDF file: /home/eyhyd/contracts_v3/Sample Agreements/Celebrity Contract_3.pdf


Recognizing layout: 100%|██████████| 3/3 [00:00<00:00,  3.95it/s]
Running OCR Error Detection: 100%|██████████| 2/2 [00:00<00:00, 276.89it/s]
Detecting bboxes: 100%|██████████| 5/5 [00:01<00:00,  3.94it/s]
Recognizing Text: 100%|██████████| 9/9 [00:07<00:00,  1.19it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  8.54it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  6.96it/s]
INFO:__main__:Saving text to: processed_files/Celebrity Contract_3.txt


Text saved to: processed_files/Celebrity Contract_3.txt


In [36]:
# Display the dict returned by processor.process().
# NOTE(review): re-run this cell after the PDFProcessor cells. The saved output
# comes from an earlier execution (count 36, before the class cell 37 and the
# run cell 38); its "expected 2" unpack error does not match the current
# three-value unpack in process().
output

{'content': [],
 'metadata': {'pages': 0,
  'format': 'PDF',
  'filename': 'Celebrity Contract_3.pdf',
  'file_path': '/home/eyhyd/contracts_v3/Sample Agreements/Celebrity Contract_3.pdf'},
 'error': 'too many values to unpack (expected 2)'}