In [None]:
import os
import yaml
import fitz
import pandas as pd
from tqdm import tqdm


from layout.layout_detect_yolo import LayoutDetectionYOLO

## Convert PDF to Images

In [None]:
# Render every page of the input PDF to a PNG under images/<file_name>/ and
# collect the PNG paths, in page order, in `image_list`. Pages whose PNG is
# already on disk are not rendered again, so re-running the cell is cheap.
file_name = "75YearsofEngineering"
input_pdf = f"inputs/{file_name}.pdf"
image_dir = f"images/{file_name}"

# makedirs also creates the parent "images" directory; the previous
# os.listdir("images") check raised FileNotFoundError when it was missing.
os.makedirs(image_dir, exist_ok=True)

image_list = []

# The context manager closes the document once all pages have been handled.
with fitz.open(input_pdf) as pdf:
    for page in tqdm(pdf, desc="Converting PDF to images"):
        path_out = f"{image_dir}/{page.number}.png"

        # Only render pages that have not been exported by an earlier run.
        if not os.path.exists(path_out):
            page.get_pixmap().save(path_out)

        image_list.append(path_out)
    

## Layout Prediction

In [None]:
# Read the layout-detection settings from the YAML config file.
with open('configs/layout_yolo.yaml') as config_file:
    config = yaml.safe_load(config_file)

# Build the detector from the "model_config" section and run it on every page
# image; the second argument to predict() is the path results/<file_name>.
model_config = config["model_config"]
layout_detector = LayoutDetectionYOLO(model_config)
predictions = layout_detector.predict(image_list, f"results/{file_name}")

## Save Bounding Boxes

In [None]:
# Flatten the layout predictions into two tables and pickle both:
#   * df_images - one row per page with the full page image
#   * df_bbs    - one row per detected box with its coordinates, class,
#                 confidence and the cropped pixels of that box
doc_bbs = []
doc_images = []

# Show the class-id -> class-name mapping (guarded so an empty prediction
# list does not raise IndexError).
if predictions:
    print(predictions[0].names)

for page_idx, prediction in enumerate(predictions):
    type_map = prediction.names
    image_array = prediction.orig_img
    doc_images.append({
        "id": f"{file_name}_{page_idx}",
        "page": page_idx,
        "image": image_array
    })

    for box_idx, box in enumerate(prediction.boxes):
        x1, y1, x2, y2 = box.xyxy.int().tolist()[0]
        class_id = int(box.cls)
        doc_bbs.append({
            "id": f"{file_name}_{page_idx}_{box_idx}",
            "page": page_idx,
            "xyxy": box.xyxy.tolist(),
            # Image arrays are indexed [row, column] = [y, x], so the crop has
            # to slice y first; slicing x first cut out the wrong region.
            # NOTE(review): assumes orig_img is a (height, width, channels)
            # array, as in Ultralytics Results - confirm for this detector.
            "image_array": image_array[y1:y2, x1:x2],
            "type": class_id,
            "type_str": type_map[class_id],
            "type_conf": float(box.conf),
        })

# Make sure the output directory exists before writing the pickles.
os.makedirs("results", exist_ok=True)

df_bbs = pd.DataFrame(doc_bbs)
df_bbs.to_pickle(f"results/{file_name}_bbs.pkl")

df_images = pd.DataFrame(doc_images)
df_images.to_pickle(f"results/{file_name}.pkl")

## OCR

In [None]:
from paddleocr import PaddleOCR

# Layout class ids whose boxes are not sent to OCR.
# NOTE(review): presumably the non-text classes (figures, tables, ...) -
# confirm against the class-name mapping printed by the layout model.
SKIP_OCR_TYPES = {3, 4, 6, 9}

ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)

# Create the column up front so it exists even when every row is skipped or
# the table is empty; otherwise the df_bbs["ocr_text"] lookup below raises
# KeyError. Rows that are never OCR'd keep a missing value.
df_bbs["ocr_text"] = None

with tqdm(total=df_bbs.shape[0], desc="OCR") as pbar:
    for i, row in df_bbs.iterrows():
        pbar.update(1)

        # Skip rows without a crop, rows of an excluded layout class, and
        # crops that are empty along either of the first two axes.
        if (
            "image_array" not in row
            or row["type"] in SKIP_OCR_TYPES
            or row["image_array"].shape[0] < 1
            or row["image_array"].shape[1] < 1
        ):
            continue

        results = ocr.ocr(row["image_array"], cls=True)

        # Concatenate the recognised text (word[1][0]) of every entry, in
        # order and without a separator; falsy entries (and a None result)
        # contribute nothing.
        text = "".join(
            word[1][0]
            for line in (results or [])
            if line
            for word in line
        )

        pbar.set_postfix({"Text": text[:10]})
        df_bbs.at[i, "ocr_text"] = text

print(df_bbs["ocr_text"].head())

# Make sure the output directory exists before writing the result files.
os.makedirs("results", exist_ok=True)

# NOTE(review): the Excel export still contains the image_array column, which
# is presumably written as a truncated text repr - consider dropping it.
df_bbs.to_pickle(f"results/{file_name}_ocr.pkl")
df_bbs.to_excel(f"results/{file_name}_ocr.xlsx")