In [1]:
# One-off environment setup: uncomment to install the third-party packages this notebook imports.
# !pip install -q pdf2image ultralytics supervision

In [2]:
# Input PDF for the pipeline; passed to split_pdf() to be rasterized page by page.
PDF_PATH = "../data/dummy_scanned.pdf"

In [3]:
# Folder structure setup
import os
import shutil

# Base directory under which the image tree lives ("" means the current working directory).
BASE_DIR = ""
# Sub-folders (re)created on every run of this cell.
DIRECTORIES = ["img/pages", "img/annotated", "img/figures"]

# Start from a clean slate: remove any img tree left over from a previous run.
# The path is resolved against BASE_DIR so the tree that gets removed is the
# same one that is recreated below.
_img_root = os.path.join(BASE_DIR, "img")
if os.path.exists(_img_root):
    shutil.rmtree(_img_root)

# Create the img directory tree. The loop variable is deliberately not called
# `dir`, which would shadow the builtin for every later cell in the notebook.
for subdir in DIRECTORIES:
    os.makedirs(os.path.join(BASE_DIR, subdir), exist_ok=True)

In [4]:
# handle image size
def resize_img(image, width=1440):
    """Convert an image to RGB and scale it to ``width`` pixels wide, keeping the aspect ratio.

    Parameters
    ----------
    image
        Object exposing ``convert(mode)``, a ``size`` pair of ``(width, height)``
        and ``resize((width, height))``. Presumably a PIL image, as produced by
        pdf2image in this notebook.
    width : int, default 1440
        Target width in pixels. Must be positive.

    Returns
    -------
    The RGB-converted image resized to ``(width, scaled_height)``.

    Raises
    ------
    ValueError
        If ``width`` is not positive.
    """
    if width <= 0:
        raise ValueError(f"width must be a positive number of pixels, got {width!r}")

    # Normalize the colour mode first so every page ends up as 3-channel RGB.
    rgb = image.convert("RGB")
    src_width, src_height = rgb.size

    # Scale factor that maps the source width onto the requested width.
    scale = width / float(src_width)

    # int() truncates, so a very wide but short input could yield a height of 0,
    # which is not a usable target size; clamp to at least one pixel.
    height = max(1, int(float(src_height) * scale))

    return rgb.resize((width, height))

In [5]:
from pdf2image import convert_from_path

# Root folder for generated images; the sub-folder paths below are derived from it.
img_path = "img"
# Folder that receives one PNG per PDF page (written by split_pdf).
page_path = f"{img_path}/pages"


# Function to split PDF into images
def split_pdf(pdf_path, dpi=400, width=1440):
    """Rasterize every page of a PDF and save each one as a PNG under ``page_path``.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to convert.
    dpi : int, default 400
        Rendering resolution handed to ``convert_from_path``.
    width : int, default 1440
        Width in pixels each page is resized to via ``resize_img`` (aspect
        ratio is kept by that helper).

    Returns
    -------
    list of str
        Paths of the saved page images, in page order. Files are named
        ``page_<i>.png`` with ``i`` starting at 0.
    """
    saved_paths = []
    for index, page in enumerate(convert_from_path(pdf_path, dpi=dpi)):
        # Normalize every page to RGB at a common width before saving.
        page = resize_img(image=page, width=width)
        path = f"{page_path}/page_{index}.png"
        page.save(path, "PNG")
        saved_paths.append(path)

    return saved_paths

# Rasterize the input PDF; the trailing expression displays the list of saved page paths.
pages = split_pdf(PDF_PATH)
pages

['img/pages/page_0.png',
 'img/pages/page_1.png',
 'img/pages/page_2.png',
 'img/pages/page_3.png',
 'img/pages/page_4.png',
 'img/pages/page_5.png',
 'img/pages/page_6.png']

In [6]:
import cv2
import numpy as np
from ultralytics import YOLO
import supervision as sv

# Folder for figure images (img/figures is created during folder setup).
# NOTE(review): no code visible in this part of the notebook reads fig_path — confirm where it is used.
fig_path = f"{img_path}/figures"

# detect text, table, and figure using YOLOv11 model
def detect_text(pages, model=None, conf=0.35, iou=0.7):
    """Run the layout-detection model on each page image and save annotated copies.

    Parameters
    ----------
    pages : list of str
        Paths of the page images to process.
    model : optional
        Callable detection model (invoked as ``model(image, conf=..., iou=...)``).
        When omitted, the YOLO weights at ``../model/yolo11_best.pt`` are loaded
        on first use rather than when this function is defined, so defining the
        function never touches the weights file and callers that pass their own
        model do not pay for loading the default one.
    conf : float, default 0.35
        Confidence threshold forwarded to the model.
    iou : float, default 0.7
        IoU threshold forwarded to the model.

    Returns
    -------
    tuple
        ``(detected_images, annotated_images, figures)`` where
        ``detected_images`` holds one ``sv.Detections`` per page,
        ``annotated_images`` holds the paths of the annotated PNGs written to
        ``img/annotated`` (file names are numbered from 1), and ``figures`` is
        a list that this function currently never fills.

    Raises
    ------
    OSError
        If a page image cannot be loaded by ``cv2.imread``.
    """
    detected_images = []
    annotated_images = []
    # NOTE(review): nothing below populates `figures`; it is returned empty so
    # callers can keep unpacking a 3-tuple.
    figures = []

    # Nothing to do: skip loading the model and building annotators.
    if not pages:
        return detected_images, annotated_images, figures

    if model is None:
        model = YOLO("../model/yolo11_best.pt")

    # The annotators do not depend on the page, so build them once.
    box_annotator = sv.BoxAnnotator()
    label_annotator = sv.LabelAnnotator()

    for j, page in enumerate(pages):
        image = cv2.imread(page)
        if image is None:
            # Fail with a clear message instead of an obscure error further down.
            raise OSError(f"cv2.imread could not load page image: {page!r}")

        results = model(image, conf=conf, iou=iou)[0]
        detections = sv.Detections.from_ultralytics(results)
        detected_images.append(detections)

        # save annotated image (drawn on a copy so `image` stays untouched)
        annotated_image = box_annotator.annotate(scene=image.copy(), detections=detections)
        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
        output_image_path = f"img/annotated/page_{j+1}_annotated.png"
        cv2.imwrite(output_image_path, annotated_image)
        annotated_images.append(output_image_path)

    return detected_images, annotated_images, figures

In [7]:
# Run detection on every page image, then display the detections for the first page.
detected_images, annotated_images, figures = detect_text(pages)
detected_images[0]


0: 640x480 2 figures, 4 texts, 1333.2ms
Speed: 150.5ms preprocess, 1333.2ms inference, 175.2ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 4 texts, 311.5ms
Speed: 3.6ms preprocess, 311.5ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 3 figures, 2 tables, 5 texts, 228.7ms
Speed: 3.6ms preprocess, 228.7ms inference, 87.2ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 2 figures, 7 texts, 304.7ms
Speed: 22.4ms preprocess, 304.7ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 480)

0: 640x512 3 tables, 286.1ms
Speed: 3.6ms preprocess, 286.1ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 table, 315.9ms
Speed: 4.8ms preprocess, 315.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 512)

0: 480x640 1 figure, 198.9ms
Speed: 3.9ms preprocess, 198.9ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)


Detections(xyxy=array([[     51.744,      1482.3,      831.68,      1794.4],
       [      51.57,      1196.7,        1379,      1450.4],
       [     859.08,      1464.7,      1369.8,      1885.4],
       [     77.871,      127.42,      1242.6,      299.47],
       [     52.838,      502.84,      830.21,      723.21],
       [     903.59,       603.6,      1375.6,      1014.9]], dtype=float32), mask=None, confidence=array([    0.89725,     0.85007,     0.77208,     0.67358,     0.43092,     0.42072], dtype=float32), class_id=array([2, 2, 0, 2, 2, 0]), tracker_id=None, data={'class_name': array(['text', 'text', 'figure', 'text', 'text', 'figure'], dtype='<U6')}, metadata={})