In [1]:
import io
import pypdfium2

from PIL import Image
from surya.detection import batch_text_detection
from surya.layout import batch_layout_detection

from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
from surya.model.layout.model import load_model as load_layout_model
from surya.model.layout.processor import load_processor as load_layout_processor
from surya.settings import Settings

from pathlib import Path

# PDF to analyse, relative to the notebook's working directory.
input_file = "inputs/EngineeringatHKUACenturyofExcellence.pdf"
# Stem of the input file. It prefixes every record id and the pickle files
# written under results/, so it is derived from `input_file` rather than typed
# a second time: switching to another PDF can no longer leave the two out of sync.
file_name = Path(input_file).stem

# image = Image.open(IMAGE_PATH)
# model = load_layout_model()
# processor = load_layout_processor()
# det_model = load_det_model()
# det_processor = load_det_processor()
#
# # layout_predictions is a list of dicts, one per image
# line_predictions = batch_text_detection([image], det_model, det_processor)
# layout_predictions = batch_layout_detection([image], model, processor, line_predictions)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the layout model/processor and the text-detection model/processor once,
# as notebook-level globals. The layout pair is passed to
# batch_layout_detection further down; the detection pair is only referenced
# by commented-out text-detection calls in the cells visible here.
layout_model = load_layout_model()
layout_processor = load_layout_processor()
det_model = load_det_model()
det_processor = load_det_processor()


# Default surya settings. `settings.IMAGE_DPI_HIGHRES` supplies the default
# `dpi` of get_page_image, so this cell must run before that function is defined.
settings = Settings()
# Bare expression: shows the settings repr as the cell output.
settings

Loaded layout model datalab-to/surya_layout on device cuda with dtype torch.float16
Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16


Settings(TORCH_DEVICE=None, IMAGE_DPI=96, IMAGE_DPI_HIGHRES=192, IN_STREAMLIT=False, ENABLE_EFFICIENT_ATTENTION=True, ENABLE_CUDNN_ATTENTION=False, FLATTEN_PDF=True, DATA_DIR='data', RESULT_DIR='results', BASE_DIR='/home/cxiang/miniconda3/envs/layout/lib/python3.10/site-packages', FONT_DIR='/home/cxiang/miniconda3/envs/layout/lib/python3.10/site-packages/static/fonts', DETECTOR_BATCH_SIZE=None, DETECTOR_MODEL_CHECKPOINT='vikp/surya_det3', DETECTOR_BENCH_DATASET_NAME='vikp/doclaynet_bench', DETECTOR_IMAGE_CHUNK_HEIGHT=1400, DETECTOR_TEXT_THRESHOLD=0.6, DETECTOR_BLANK_THRESHOLD=0.35, DETECTOR_POSTPROCESSING_CPU_WORKERS=8, DETECTOR_MIN_PARALLEL_THRESH=3, COMPILE_DETECTOR=False, RECOGNITION_MODEL_CHECKPOINT='vikp/surya_rec2', RECOGNITION_MAX_TOKENS=175, RECOGNITION_BATCH_SIZE=None, RECOGNITION_IMAGE_SIZE={'height': 256, 'width': 896}, RECOGNITION_RENDER_FONTS={'all': '/home/cxiang/miniconda3/envs/layout/lib/python3.10/site-packages/static/fonts/GoNotoCurrent-Regular.ttf', 'zh': '/home/cxia

In [3]:
def get_page_image(pdf_file, dpi=settings.IMAGE_DPI_HIGHRES):
    """Render every page of a PDF to an RGB PIL image.

    Args:
        pdf_file: Input accepted by ``pypdfium2.PdfDocument`` (this notebook
            passes a file path string).
        dpi: Target resolution. PDF page units are 1/72 inch, so the render
            scale is ``dpi / 72``. The default is the value of
            ``settings.IMAGE_DPI_HIGHRES`` at the time this cell is executed.

    Returns:
        list: One RGB image per page, in page order.
    """
    doc = pypdfium2.PdfDocument(pdf_file)
    try:
        page_images = []
        for page_index in range(len(doc)):
            renderer = doc.render(
                pypdfium2.PdfBitmap.to_pil,
                page_indices=[page_index],
                scale=dpi / 72,
            )
            # Exactly one page index was requested, so take the single result.
            rendered = list(renderer)[0]
            page_images.append(rendered.convert("RGB"))
        return page_images
    finally:
        # Release the document handle even when rendering a page raises;
        # previously an exception skipped close() and leaked the handle.
        doc.close()

In [4]:
# Render every page of the input PDF to an RGB image; later cells read
# `png_images` by page index.
png_images = get_page_image(input_file)
# Number of rendered pages, shown as the cell output.
len(png_images)

65

In [5]:
# Reuse the `png_images` rendered in the previous cell instead of rasterising
# the whole PDF a second time.
# Layout detection is called on the page images only; no text-line
# predictions are passed in.
preds = batch_layout_detection(png_images, layout_model, layout_processor)
# Preview the first two page results.
preds[:2]

Recognizing layout: 100%|██████████| 7/7 [00:05<00:00,  1.17it/s]


[LayoutResult(bboxes=[LayoutBox(polygon=[[1308.224609375, 141.5771484375], [1486.9013671875, 141.5771484375], [1486.9013671875, 331.640625], [1308.224609375, 331.640625]], confidence=0.99951171875, label='Picture', position=0, top_k={'Picture': 0.99951171875, 'SectionHeader': 9.47713851928711e-05, 'Text': 8.118152618408203e-05, 'PageHeader': 3.272294998168945e-05, 'Handwriting': 1.8537044525146484e-05}, bbox=[1308.224609375, 141.5771484375, 1486.9013671875, 331.640625]), LayoutBox(polygon=[[195.767578125, 460.83984375], [1477.5791015625, 460.83984375], [1477.5791015625, 837.890625], [195.767578125, 837.890625]], confidence=0.9990234375, label='SectionHeader', position=1, top_k={'SectionHeader': 0.9990234375, 'Picture': 0.0006933212280273438, 'Text': 0.00018799304962158203, 'Handwriting': 7.271766662597656e-06, 'Figure': 3.993511199951172e-06}, bbox=[195.767578125, 460.83984375, 1477.5791015625, 837.890625]), LayoutBox(polygon=[[3.495849609375, 909.375], [1594.107421875, 909.375], [1594

- bboxes - the detected layout regions for the page, one `LayoutBox` per region (text blocks, pictures, headers, ...).
- bbox - the axis-aligned rectangle for the region in (x1, y1, x2, y2) format. (x1, y1) is the top left corner, and (x2, y2) is the bottom right corner.
- polygon - the polygon for the region in (x1, y1), (x2, y2), (x3, y3), (x4, y4) format. The points are in clockwise order from the top left.
- position - the reading order of the box.
- label - the label for the bbox. One of Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Figure, Section-header, Table, Form, Table-of-contents, Handwriting, Text, Text-inline-math. Note that the output above spells the labels in CamelCase (e.g. `SectionHeader`, `PageHeader`).
- top_k - the top-k other potential labels for the box. A dictionary with labels as keys and confidences as values.
- page - the page number in the file
- image_bbox - the bbox for the image in (x1, y1, x2, y2) format. (x1, y1) is the top left corner, and (x2, y2) is the bottom right corner. All line bboxes will be contained within this bbox.

In [6]:
import os

import numpy as np
import pandas as pd

# Pages and predictions are paired by position below, so a stale or partially
# re-run kernel that leaves them with different lengths must fail loudly.
assert len(preds) == len(png_images), (
    "preds and png_images have different lengths; re-run the cells above"
)

doc_bbs = []     # one record per detected layout box
doc_images = []  # one record per page

for page_idx, (pred, page_image) in enumerate(zip(preds, png_images)):
    image_array = np.array(page_image)
    page_h, page_w = image_array.shape[:2]
    doc_images.append({
        "id": f"{file_name}_{page_idx}",
        "page": page_idx,
        "image": image_array,
    })

    for box_idx, box in enumerate(pred.bboxes):
        x1, y1, x2, y2 = map(int, box.bbox)
        # Clamp the crop window to the page. Predicted boxes may extend past
        # the image edge, and a negative bound would wrap around in NumPy
        # slicing and return the wrong (or an empty) region.
        x1, x2 = max(0, min(x1, page_w)), max(0, min(x2, page_w))
        y1, y2 = max(0, min(y1, page_h)), max(0, min(y2, page_h))

        doc_bbs.append({
            "id": f"{file_name}_{page_idx}_{box_idx}",
            "page": page_idx,
            # Raw model coordinates are kept as-is; only the crop is clamped.
            "xyxy": box.bbox,
            "image_array": image_array[y1:y2, x1:x2],
            "type_str": box.label,
            "type_conf": box.confidence,
        })

# to_pickle does not create parent directories, so make sure it exists.
os.makedirs("results", exist_ok=True)

df_bbs = pd.DataFrame(doc_bbs)
df_bbs.to_pickle(f"results/{file_name}_bbs.pkl")

df_images = pd.DataFrame(doc_images)
df_images.to_pickle(f"results/{file_name}.pkl")