In [21]:
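# Uncomment to install the dependencies into the kernel's environment
# (pdf2image additionally needs the poppler utilities installed on the system).
# %pip install -q pdf2image ultralytics supervision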

In [22]:
# Path of the PDF that split_pdf() rasterizes into page images.
PDF_PATH = "../data/dummy_scanned.pdf"

In [23]:
# Folder structure setup
import os
import shutil

# Root under which the working image tree lives ("" means the current working
# directory).
BASE_DIR = ""
DIRECTORIES = ["img/pages", "img/annotated", "img/figures"]

# Remove any img directory left over from a previous run so stale files cannot
# mix with the new output. The path goes through BASE_DIR so that cleanup and
# creation always target the same tree.
_img_root = os.path.join(BASE_DIR, "img")
if os.path.exists(_img_root):
    shutil.rmtree(_img_root)

# (Re)create the directory tree. The loop variable is not named `dir` because a
# top-level loop variable stays bound after the loop and would replace the
# builtin dir() for the rest of the kernel session.
for subdir in DIRECTORIES:
    os.makedirs(os.path.join(BASE_DIR, subdir), exist_ok=True)

In [24]:
# handle image size
def resize_img(image, width=1440):
    """Convert an image to RGB and scale it to ``width`` pixels wide.

    The height is scaled by the same factor as the width so the aspect ratio
    is kept (up to integer truncation).

    Args:
        image: Object exposing ``convert(mode)``, ``size`` as ``(width, height)``
            and ``resize((width, height))`` -- e.g. the page images produced by
            ``pdf2image.convert_from_path``.
        width: Target width in pixels.

    Returns:
        The converted and resized image, as returned by ``resize``.

    Raises:
        ZeroDivisionError: If the source image reports a width of 0.
    """
    rgb_image = image.convert('RGB')
    src_width, src_height = rgb_image.size[0], rgb_image.size[1]

    # Scale factor that maps the source width onto the requested width.
    scale = width / float(src_width)

    # Truncate like the original computation, but never go below one pixel:
    # a very wide, very short source would otherwise yield a height of 0,
    # which resize() cannot produce.
    target_height = max(1, int(float(src_height) * float(scale)))

    return rgb_image.resize((width, target_height))

In [25]:
from pdf2image import convert_from_path

# Root of the working image tree; page_path is the directory split_pdf() writes
# the per-page PNG files into.
img_path = "img"
page_path = f"{img_path}/pages"


# Function to split PDF into images
def split_pdf(pdf_path, dpi=400, width=1440, output_dir=None):
    """Rasterize a PDF and save each page as a resized PNG.

    Args:
        pdf_path: Path of the PDF file to convert.
        dpi: Resolution passed to ``convert_from_path`` when rasterizing.
        width: Width in pixels each page is resized to via ``resize_img``.
        output_dir: Directory the PNG files are written to. Defaults to the
            module-level ``page_path``.

    Returns:
        List of the saved file paths, one per page in page order, named
        ``page_<i>.png`` with ``i`` starting at 0.
    """
    if output_dir is None:
        output_dir = page_path

    # Do not depend on the setup cell having created the directory already.
    os.makedirs(output_dir, exist_ok=True)

    pages = []
    for i, page in enumerate(convert_from_path(pdf_path, dpi)):
        page = resize_img(image=page, width=width)
        path = f"{output_dir}/page_{i}.png"
        page.save(path, "PNG")
        pages.append(path)

    return pages

# Rasterize every page of the PDF; `pages` holds the saved PNG paths in page
# order and is displayed as the cell output.
pages = split_pdf(PDF_PATH)
pages

['img/pages/page_0.png',
 'img/pages/page_1.png',
 'img/pages/page_2.png',
 'img/pages/page_3.png',
 'img/pages/page_4.png',
 'img/pages/page_5.png',
 'img/pages/page_6.png']

In [26]:
import cv2
import numpy as np
from ultralytics import YOLO
import supervision as sv

# Figures directory under the image root (created by the folder setup cell).
# NOTE(review): detect_text() in this cell does not reference it.
fig_path = f"{img_path}/figures"

# detect text, table, and figure using YOLOv11 model
_DEFAULT_MODEL_PATH = "../model/new_last.pt"
_default_model = None


def _get_default_model():
    """Load the default YOLO weights on first use and reuse them afterwards."""
    global _default_model
    if _default_model is None:
        _default_model = YOLO(_DEFAULT_MODEL_PATH)
    return _default_model


def detect_text(pages, model=None):
    """Run layout detection on page images and save annotated copies.

    Args:
        pages: Iterable of image file paths, one per page.
        model: Callable detection model (e.g. an ultralytics ``YOLO``
            instance). When omitted, the weights at ``../model/new_last.pt``
            are loaded on the first call and cached, instead of being loaded
            when the function is defined.

    Returns:
        Tuple ``(detected_images, annotated_images, figures)``:
        ``detected_images`` holds one ``sv.Detections`` per page,
        ``annotated_images`` holds the paths of the annotated PNGs
        (``img/annotated/page_<n>_annotated.png`` with ``n`` starting at 1),
        and ``figures`` is kept for interface compatibility -- nothing in this
        function populates it, so it is always an empty list.

    Raises:
        FileNotFoundError: If a page image cannot be read.
        OSError: If an annotated image cannot be written.
    """
    if model is None:
        model = _get_default_model()

    # The annotators do not depend on the page, so build them once.
    box_annotator = sv.BoxAnnotator()
    label_annotator = sv.LabelAnnotator()

    detected_images = []
    annotated_images = []
    figures = []
    for j, page in enumerate(pages):
        image = cv2.imread(page)
        # cv2.imread signals failure by returning None rather than raising.
        if image is None:
            raise FileNotFoundError(f"Could not read page image: {page}")

        results = model(image, conf=0.35, iou=0.7)[0]
        detections = sv.Detections.from_ultralytics(results)
        detected_images.append(detections)

        # save annotated image
        annotated_image = image.copy()
        annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
        output_image_path = f"img/annotated/page_{j+1}_annotated.png"
        # cv2.imwrite reports failure through its return value; without this
        # check a path to a file that was never written would be returned.
        if not cv2.imwrite(output_image_path, annotated_image):
            raise OSError(f"Could not write annotated image: {output_image_path}")
        annotated_images.append(output_image_path)

    return detected_images, annotated_images, figures

In [27]:
# Run detection on every page, then display the detections for the first page.
detected_images, annotated_images, figures = detect_text(pages)
detected_images[0]


0: 640x480 1 footer, 2 graphs, 1 logo, 2 texts, 1 title, 185.4ms
Speed: 5.5ms preprocess, 185.4ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 header, 3 logos, 4 texts, 182.8ms
Speed: 3.7ms preprocess, 182.8ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 footer, 3 graphs, 1 table, 2 texts, 152.5ms
Speed: 2.5ms preprocess, 152.5ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 footer, 2 images, 2 logos, 1 text, 157.9ms
Speed: 3.2ms preprocess, 157.9ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 480)

0: 640x512 1 footer, 3 logos, 3 texts, 1 title, 204.3ms
Speed: 2.9ms preprocess, 204.3ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 footer, 1 table, 199.8ms
Speed: 4.5ms preprocess, 199.8ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 512)

0: 480x640 2 graphs, 2 texts, 2 titles, 164.5ms
Speed: 3.0ms preprocess, 164.5ms infer

Detections(xyxy=array([[     67.727,       328.6,      1239.3,      437.59],
       [     124.57,       132.9,      1207.4,      305.88],
       [     870.02,      1468.8,      1367.4,        1889],
       [     695.63,      1936.5,      731.59,      1978.4],
       [     913.09,      602.69,      1352.6,      1015.7],
       [     1260.4,           0,        1383,      82.137],
       [     54.716,      1265.9,      1382.7,        1795]], dtype=float32), mask=None, confidence=array([    0.77915,     0.74794,     0.47732,      0.4393,      0.4086,     0.38949,     0.36391], dtype=float32), class_id=array([7, 8, 1, 0, 1, 4, 7]), tracker_id=None, data={'class_name': array(['text', 'title', 'graph', 'footer', 'graph', 'logo', 'text'], dtype='<U6')}, metadata={})