Skip to content

Not able to fetch all tables and figures when converting pdf into images #25

@reema93jain

Description

@reema93jain

Hi Team,

I am using layoutparser for detecting tables and images.
When I just try to run code on individual png image file, model detects tables and figures correctly.
However, when I am using below code to convert pdf into images and detecting tables out of each page image, I am either not getting full image/table or sometimes get duplicates tables as well.

Can you please guide on how to refine below code and what I can try to resolve this issue? Thank you!

!pip install layoutparser
!pip install opencv-python numpy matplotlib

install detectron2:

!pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
!pip3 install pdf2image
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!apt-get install poppler-utils

import os
from pdf2image import convert_from_path
import shutil
import cv2

import layoutparser as lp

PubLayNet

model = lp.models.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.81],
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"})

def save_detections(table_blocks, image, image_name, save_dir='/content/'):
for j in range(len(table_blocks)):
x_1, y_1, x_2, y_2 = table_blocks[0].block.x_1, table_blocks[0].block.y_1, table_blocks[0].block.x_2, table_blocks[0].block.y_2
cropped = image[int(y_1):int(y_2), int(x_1):int(x_2)]
cv2_imshow(cropped)
file_name = image_name+'_'+str(j)+'.jpg'
status = cv2.imwrite(save_dir+file_name, cropped)
if status:
print("Saved ", file_name)

def inference(images_dir):
table_blocks_list = []
# Getting images from the directory
for file in os.listdir(images_dir):
if file.endswith(".jpg"):
# Extract the image name (excluding the extension)
image_name = file[:-4]
# # Reading the image using OpenCV
image = cv2.imread(images_dir+'/'+file)
# OpenCV reads images in BGR format, convert to RGB
image = image[..., ::-1]
# Running Inference
layout = model.detect(image)

        # Extracting Tables
        table_blocks = lp.Layout([b for b in layout if b.type=="Table"])
        figure_blocks = lp.Layout([b for b in layout if b.type=='Figure'])

        table_blocks = lp.Layout([b for b in table_blocks \
               if not any(b.is_in(b_fig) for b_fig in figure_blocks)])
        h, w = image.shape[:2]

        left_interval = lp.Interval(0, w/2*1.05, axis='x').put_on_canvas(image)

        left_blocks = table_blocks.filter_by(left_interval, center=True)
        left_blocks.sort(key = lambda b:b.coordinates[1])

        right_blocks = [b for b in table_blocks if b not in left_blocks]
        right_blocks.sort(key = lambda b:b.coordinates[1])

        # And finally combine the two list and add the index
        # according to the order
        table_blocks = lp.Layout([b.set(id = idx) for idx, b in enumerate(left_blocks + right_blocks)])

        save_detections(table_blocks, image, image_name)

        table_blocks_list.append(table_blocks)
return table_blocks_list

def pdf_inference(pdfName):
# Converting each page to an image
# Get the current working directory
path = os.getcwd()
# Construct the full path to the PDF file
PDF_file = path + "/" + pdfName
# Create a directory to store converted images
if os.path.exists(path+'/pdf_images'):
shutil.rmtree(path+'/pdf_images')
os.mkdir(path+'/pdf_images')

# Convert each page of the PDF to an image
pages = convert_from_path(PDF_file, dpi=100, grayscale=True)
image_counter = 1

# Iterate over the pages
for page in pages:
    filename = "page_"+str(image_counter)+".jpg"

    # st.write(filename)
    filepath = path+"/pdf_images/" + filename
    
    # Save the page as a JPEG image in the 'pdf_images' directory
    page.save(f'{filepath}', 'JPEG')
    image_counter = image_counter + 1

#filelimit = image_counter-1

# Running inference on the images
table_blocks_list = inference(path+'/pdf_images')

#return table_blocks_list

test = pdf_inference('abc-Datasheet.pdf')

Thanks
Reema Jain

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions