In [22]:
import os
import sys
import yaml
import pymupdf as fitz
import pandas as pd
from tqdm import tqdm

from layout.layout_detect_yolo import LayoutDetectionYOLO

## Convert PDF to Images


In [23]:
file_name = "EngineeringatHKUACenturyofExcellence"
input_pdf = f"inputs/{file_name}.pdf"
pdf = fitz.open(input_pdf)

if file_name not in os.listdir("images"):
    os.mkdir(f"images/{file_name}")

image_list = []

for page in tqdm(pdf, desc="Converting PDF to images"):
    path_out = f"images/{file_name}/{page.number}.png"

    if os.path.exists(path_out):
        image_list.append(path_out)
        continue

    pix = page.get_pixmap()
    pix.save(path_out)
    image_list.append(path_out)

Converting PDF to images: 100%|██████████| 65/65 [00:08<00:00,  8.03it/s]


## Layout Prediction


In [24]:
with open('configs/layout_yolo.yaml') as f:
    config = yaml.safe_load(f)

layout_detector = LayoutDetectionYOLO(config["model_config"])
predictions = layout_detector.predict(image_list, f"results/{file_name}")

Predicting layout:   0%|          | 0/65 [00:00<?, ?it/s]

Predicting layout: 100%|██████████| 65/65 [00:03<00:00, 16.64it/s]


In [25]:
predictions

[doclayout_yolo.engine.results.Results object with attributes:
 
 boxes: doclayout_yolo.engine.results.Boxes object
 keypoints: None
 masks: None
 names: {0: 'title', 1: 'plain text', 2: 'abandon', 3: 'figure', 4: 'figure_caption', 5: 'table', 6: 'table_caption', 7: 'table_footnote', 8: 'isolate_formula', 9: 'formula_caption'}
 obb: None
 orig_img: array([[[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],
 
        [[223, 183, 114],
         [224, 183, 113],
         [223, 183, 114],
         ...,
         [172,  98,  60],
         [172,  98,  60],
         [172,  99,  62]],
 
        [[219, 173,  96],
         [220, 172,  92],
         [219, 172,  93],
         ...,
         [160,  75,  31],
         [160,  75,  31],
         [160,  76,  33]],
 
        ...,
 
        [[ 65, 156, 114],
         [ 65, 171, 127],
         [ 67, 138,  95],
         ...,
         [ 75, 154,  93]

## Save Bounding Boxes


In [26]:
doc_bbs = []
doc_images = []

print(predictions[0].names)

for i, prediciton in enumerate(predictions):
    type_map = prediciton.names
    image_array = prediciton.orig_img
    doc_images.append({
        "id": f"{file_name}_{i}",
        "page": i,
        "image": image_array
    })

    for j, box in enumerate(prediciton.boxes):
        x1, y1, x2, y2 = map(int, box.xyxy.tolist()[0])

        doc_bbs.append({
            "id": f"{file_name}_{i}_{j}",
            "page": i,
            "xyxy": box.xyxy.tolist(),
            "image_array": image_array[y1:y2, x1:x2],
            "type": int(box.cls),
            "type_str": type_map[int(box.cls)],
            "type_conf": float(box.conf),
        })

df_bbs = pd.DataFrame(doc_bbs)
df_bbs.to_pickle(f"results/{file_name}_bbs.pkl")

df_images = pd.DataFrame(doc_images)
df_images.to_pickle(f"results/{file_name}.pkl")

{0: 'title', 1: 'plain text', 2: 'abandon', 3: 'figure', 4: 'figure_caption', 5: 'table', 6: 'table_caption', 7: 'table_footnote', 8: 'isolate_formula', 9: 'formula_caption'}


## OCR


In [27]:
from PIL import Image
from surya.ocr import run_ocr
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor

langs = ["en", "zh"] # Replace with your languages - optional but recommended
det_processor, det_model = load_det_processor(), load_det_model()
rec_model, rec_processor = load_rec_model(), load_rec_processor()

text_records = df_bbs[df_bbs["type"].isin([0, 1, 4, 6, 7, 9])]
df_bbs.loc[text_records.index, "image"] = text_records["image_array"].apply(lambda x: Image.fromarray(x))
images_to_process = df_bbs.loc[text_records.index, "image"].tolist()

results = run_ocr(images_to_process, [langs] * len(images_to_process), det_model,det_processor, rec_model, rec_processor)
df_bbs.loc[text_records.index, "raw_results"] = results

def process_results(result):
    if not result:
        return ""
    text = ""
    for line in result.text_lines:
        text += line.text
    return text

df_bbs["ocr_text"] = df_bbs.loc[text_records.index, "raw_results"].apply(process_results)
df_bbs.head(5)

Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 29/29 [01:14<00:00,  2.58s/it]
Recognizing Text: 100%|██████████| 26/26 [01:06<00:00,  2.56s/it]


Unnamed: 0,id,page,xyxy,image_array,type,type_str,type_conf,image,raw_results,ocr_text
0,EngineeringatHKUACenturyofExcellence_0_0,0,"[[71.64120483398438, 211.74551391601562, 558.4...","[[[220, 166, 82], [220, 166, 82], [220, 166, 8...",0,title,0.89534,<PIL.Image.Image image mode=RGB size=487x104 a...,"text_lines=[TextLine(polygon=[[0.0, 4.0], [379...",A CENTURY ofEXCELLENCE
1,EngineeringatHKUACenturyofExcellence_0_1,0,"[[491.33746337890625, 50.51664733886719, 560.2...","[[[163, 81, 38], [163, 81, 38], [163, 81, 38],...",2,abandon,0.870346,,,
2,EngineeringatHKUACenturyofExcellence_0_2,0,"[[71.7824478149414, 169.73484802246094, 382.60...","[[[219, 166, 84], [219, 165, 81], [219, 165, 8...",0,title,0.740614,<PIL.Image.Image image mode=RGB size=311x33 at...,"text_lines=[TextLine(polygon=[[0.0, 5.0], [310...",ENGINEERING AT HKU
3,EngineeringatHKUACenturyofExcellence_0_3,0,"[[0.0, 339.5333251953125, 597.0, 717.269165039...","[[[231, 186, 101], [232, 187, 100], [232, 187,...",3,figure,0.322671,,,
4,EngineeringatHKUACenturyofExcellence_1_0,1,"[[94.47953033447266, 324.7358093261719, 502.51...","[[[238, 238, 238], [238, 238, 238], [238, 238,...",0,title,0.821725,<PIL.Image.Image image mode=RGB size=408x87 at...,"text_lines=[TextLine(polygon=[[0.0, 3.0], [320...",A CENTURY of EXCELLENCE


In [28]:
# # from paddleocr import PaddleOCR
# # ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)

# with tqdm(total=df_bbs.shape[0], desc="OCR") as pbar:
#     for i, row in df_bbs.iterrows():
#         pbar.update(1)

#         if "image_array" not in row or row["type"] in [3] or row["image_array"].shape[0] < 1 or row["image_array"].shape[1] < 1:
#             continue

#         results = ocr.ocr(row["image_array"], cls=True)

#         for line in results:
#             if line:
#                 for word in line:
#                     string += f" {word[1][0]}"

#         pbar.set_postfix({"Text": string[:10]})
#         df_bbs.at[i, "ocr_text"] = string

# df_bbs["ocr_text"].head()

In [29]:
df_bbs[df_bbs["type"] == 4].head()

Unnamed: 0,id,page,xyxy,image_array,type,type_str,type_conf,image,raw_results,ocr_text
53,EngineeringatHKUACenturyofExcellence_7_8,7,"[[713.3155517578125, 790.94580078125, 795.4211...","[[[255, 255, 255], [255, 255, 255], [255, 255,...",4,figure_caption,0.588368,<PIL.Image.Image image mode=RGB size=82x11 at ...,"text_lines=[TextLine(polygon=[[0.0, 2.0], [82....","Main Building, orca 1912."
80,EngineeringatHKUACenturyofExcellence_8_26,8,"[[226.84762573242188, 788.9278564453125, 292.4...","[[[255, 255, 255], [255, 255, 255], [255, 255,...",4,figure_caption,0.535088,<PIL.Image.Image image mode=RGB size=66x10 at ...,"text_lines=[TextLine(polygon=[[0.0, 2.0], [65....",Sir Frederick Lugard
81,EngineeringatHKUACenturyofExcellence_8_27,8,"[[292.6473693847656, 517.6408081054688, 540.46...","[[[255, 255, 255], [255, 255, 255], [255, 255,...",4,figure_caption,0.494239,<PIL.Image.Image image mode=RGB size=248x10 at...,"text_lines=[TextLine(polygon=[[0.0, 2.0], [247...",The lawing of the University of Hong Kong's fo...
90,EngineeringatHKUACenturyofExcellence_9_4,9,"[[75.73997497558594, 388.95330810546875, 339.9...","[[[204, 224, 236], [205, 226, 238], [205, 225,...",4,figure_caption,0.834681,<PIL.Image.Image image mode=RGB size=264x14 at...,"text_lines=[TextLine(polygon=[[0.0, 2.0], [263...",Planned layout of Main Building's Ground Floor...
94,EngineeringatHKUACenturyofExcellence_9_8,9,"[[309.320068359375, 800.50048828125, 467.70407...","[[[207, 228, 239], [204, 226, 238], [202, 222,...",4,figure_caption,0.484693,<PIL.Image.Image image mode=RGB size=158x13 at...,"text_lines=[TextLine(polygon=[[0.0, 1.0], [157...",Strength of Materials Laboratory


In [30]:
df_bbs.to_pickle(f"results/{file_name}_ocr.pkl")