Not able to fetch all tables and figures when converting pdf into images

Hi Team,

I am using layoutparser for detecting tables and images. 
When I just try to run code on individual png image file, model detects tables and figures correctly. 
However, when I am using below code to convert pdf into images and detecting tables out of each page image, I am either not getting full image/table or sometimes get duplicates tables as well.

Can you please guide on how to refine below code and what I can try to resolve this issue? Thank you!


!pip install layoutparser
!pip install opencv-python numpy matplotlib
# install detectron2:
!pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
!pip3 install pdf2image
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!apt-get install poppler-utils

import os
from pdf2image import convert_from_path
import shutil
import cv2

import layoutparser as lp
# PubLayNet
model = lp.models.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
                                 extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.81],
                                 label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"})

def save_detections(table_blocks, image, image_name, save_dir='/content/'):
    for j in range(len(table_blocks)):
        x_1, y_1, x_2, y_2 = table_blocks[0].block.x_1, table_blocks[0].block.y_1, table_blocks[0].block.x_2, table_blocks[0].block.y_2
        cropped = image[int(y_1):int(y_2), int(x_1):int(x_2)]        
        cv2_imshow(cropped)
        file_name = image_name+'_'+str(j)+'.jpg'
        status = cv2.imwrite(save_dir+file_name, cropped)
        if status:
            print("Saved ", file_name)


def inference(images_dir):
    table_blocks_list = []
    # Getting images from the directory
    for file in os.listdir(images_dir):
        if file.endswith(".jpg"):
            # Extract the image name (excluding the extension)
            image_name = file[:-4]
            # # Reading the image using OpenCV
            image = cv2.imread(images_dir+'/'+file)
            # OpenCV reads images in BGR format, convert to RGB
            image = image[..., ::-1]
            # Running Inference
            layout = model.detect(image)

            # Extracting Tables
            table_blocks = lp.Layout([b for b in layout if b.type=="Table"])
            figure_blocks = lp.Layout([b for b in layout if b.type=='Figure'])

            table_blocks = lp.Layout([b for b in table_blocks \
                   if not any(b.is_in(b_fig) for b_fig in figure_blocks)])
            h, w = image.shape[:2]

            left_interval = lp.Interval(0, w/2*1.05, axis='x').put_on_canvas(image)

            left_blocks = table_blocks.filter_by(left_interval, center=True)
            left_blocks.sort(key = lambda b:b.coordinates[1])

            right_blocks = [b for b in table_blocks if b not in left_blocks]
            right_blocks.sort(key = lambda b:b.coordinates[1])

            # And finally combine the two list and add the index
            # according to the order
            table_blocks = lp.Layout([b.set(id = idx) for idx, b in enumerate(left_blocks + right_blocks)])

            save_detections(table_blocks, image, image_name)

            table_blocks_list.append(table_blocks)
    return table_blocks_list


def pdf_inference(pdfName):
    # Converting each page to an image
    # Get the current working directory
    path = os.getcwd()
    # Construct the full path to the PDF file
    PDF_file = path + "/" + pdfName
    # Create a directory to store converted images
    if os.path.exists(path+'/pdf_images'):
        shutil.rmtree(path+'/pdf_images')
    os.mkdir(path+'/pdf_images')

    # Convert each page of the PDF to an image
    pages = convert_from_path(PDF_file, dpi=100, grayscale=True)
    image_counter = 1
    
    # Iterate over the pages
    for page in pages:
        filename = "page_"+str(image_counter)+".jpg"

        # st.write(filename)
        filepath = path+"/pdf_images/" + filename
        
        # Save the page as a JPEG image in the 'pdf_images' directory
        page.save(f'{filepath}', 'JPEG')
        image_counter = image_counter + 1

    #filelimit = image_counter-1

    # Running inference on the images
    table_blocks_list = inference(path+'/pdf_images')

    #return table_blocks_list

test = pdf_inference('abc-Datasheet.pdf')

Thanks
Reema Jain

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Not able to fetch all tables and figures when converting pdf into images #25

install detectron2:

PubLayNet

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Not able to fetch all tables and figures when converting pdf into images #25

Description

install detectron2:

PubLayNet

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions