In [None]:
import os
import yaml
import fitz
import pandas as pd

from tqdm import tqdm
from layout.layout_detect_yolo import LayoutDetectionYOLO

## Convert PDF to Images

In [7]:
# Rasterize every page of the input PDF to images/<file_name>/<page>.png and
# collect the resulting paths, in page order, in `image_list`.
file_name = "75YearsofEngineering"
input_pdf = f"inputs/{file_name}.pdf"
image_dir = f"images/{file_name}"
pdf = fitz.open(input_pdf)

# makedirs(exist_ok=True) also creates the parent "images" directory; the
# earlier os.listdir("images") membership test raised FileNotFoundError when
# that directory did not exist yet.
os.makedirs(image_dir, exist_ok=True)

image_list = []

for page in tqdm(pdf, desc="Converting PDF to images"):
    path_out = f"{image_dir}/{page.number}.png"

    # Pages rendered by an earlier run are reused, so re-running is cheap.
    if not os.path.exists(path_out):
        page.get_pixmap().save(path_out)

    image_list.append(path_out)
    

Converting PDF to images: 100%|██████████| 301/301 [00:00<00:00, 10217.43it/s]


## Layout Prediction

In [8]:
# Read the detector settings from YAML, build the YOLO layout detector from
# the "model_config" section, and run it over every page image.
with open('configs/layout_yolo.yaml') as config_file:
    config = yaml.safe_load(config_file)

model_config = config["model_config"]
layout_detector = LayoutDetectionYOLO(model_config)

# The second argument is presumably where the detector writes its results
# -- TODO confirm against LayoutDetectionYOLO.predict.
predictions = layout_detector.predict(image_list, f"results/{file_name}")

Predicting layout: 100%|██████████| 301/301 [00:10<00:00, 27.41it/s]


## Save Bounding Boxes

In [9]:
# Flatten the per-page predictions into two tables:
#   doc_images -- one record per page (id, page index, full page image)
#   doc_bbs    -- one record per detected box (id, page, coords, crop, class, confidence)
# Both are persisted as pickles under results/.
doc_bbs = []
doc_images = []

# Show the class-id -> label map once; guarded so an empty prediction list
# (e.g. a PDF with no pages) does not raise IndexError.
if predictions:
    print(predictions[0].names)

for page_idx, prediction in enumerate(predictions):
    type_map = prediction.names
    image_array = prediction.orig_img
    doc_images.append({
        "id": f"{file_name}_{page_idx}",
        "page": page_idx,
        "image": image_array
    })

    for box_idx, box in enumerate(prediction.boxes):
        xyxy = box.xyxy.tolist()
        # Integer pixel corners of the box, used to crop the page image.
        x1, y1, x2, y2 = map(int, xyxy[0])
        cls_id = int(box.cls)

        doc_bbs.append({
            "id": f"{file_name}_{page_idx}_{box_idx}",
            "page": page_idx,
            "xyxy": xyxy,
            "image_array": image_array[y1:y2, x1:x2],
            "type": cls_id,
            "type_str": type_map[cls_id],
            "type_conf": float(box.conf),
        })

# to_pickle does not create missing directories, so make sure results/ exists.
os.makedirs("results", exist_ok=True)

df_bbs = pd.DataFrame(doc_bbs)
df_bbs.to_pickle(f"results/{file_name}_bbs.pkl")

df_images = pd.DataFrame(doc_images)
df_images.to_pickle(f"results/{file_name}.pkl")

{0: 'title', 1: 'plain text', 2: 'abandon', 3: 'figure', 4: 'figure_caption', 5: 'table', 6: 'table_caption', 7: 'table_footnote', 8: 'isolate_formula', 9: 'formula_caption'}


## OCR

In [10]:
from paddleocr import PaddleOCR

# Layout class ids that are not sent to OCR, per the class map printed by the
# bounding-box cell: 3 = figure, 4 = figure_caption, 6 = table_caption,
# 9 = formula_caption.
OCR_SKIPPED_TYPES = {3, 4, 6, 9}

ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)

# Run OCR on every eligible crop. Results are collected in a list and assigned
# as a column in one step, so "ocr_text" always exists afterwards -- even when
# df_bbs is empty or every row is skipped (previously a KeyError on the print).
ocr_texts = []

with tqdm(total=df_bbs.shape[0], desc="OCR") as pbar:
    for _, row in df_bbs.iterrows():
        pbar.update(1)

        # Skip rows with no crop, a non-text layout class, or a zero-sized crop.
        skip = (
            "image_array" not in row
            or row["type"] in OCR_SKIPPED_TYPES
            or row["image_array"].shape[0] < 1
            or row["image_array"].shape[1] < 1
        )
        if skip:
            # Skipped rows stay missing (NaN), exactly as before.
            ocr_texts.append(float("nan"))
            continue

        results = ocr.ocr(row["image_array"], cls=True)

        # Each recognised word is prefixed with a space, so the stored text
        # keeps its leading space (unchanged from the previous behaviour).
        # `results or []` tolerates a None result; falsy per-image entries are skipped.
        text = "".join(
            f" {word[1][0]}"
            for line in (results or [])
            if line
            for word in line
        )

        pbar.set_postfix({"Text": text[:10]})
        ocr_texts.append(text)

df_bbs["ocr_text"] = pd.Series(ocr_texts, index=df_bbs.index, dtype=object)

print(df_bbs["ocr_text"].head())

df_bbs.to_pickle(f"results/{file_name}_ocr.pkl")
df_bbs.to_excel(f"results/{file_name}_ocr.xlsx")

OCR: 100%|██████████| 1987/1987 [05:27<00:00,  6.06it/s, Text=]         


0                         NaN
1                            
2     75 Years of Engineering
3     75 Years of Engineering
4                         NaN
Name: ocr_text, dtype: object
