# Installations

Installation Function to Page wise Image Conversion

In [None]:
# %pip install pymupdf

Installation Function to Cropping Tables

In [None]:
# %%capture
# %pip install "unstructured[all-docs]"

installations for Function to Recognise and format tables as csv

In [None]:
# %pip install paddlepaddle paddleocr tensorflow pandas

# Imports

In [4]:
import os
import pandas as pd
import fitz
import cv2
from unstructured.partition.image import partition_image
import numpy as np
import tensorflow as tf
from paddleocr import PaddleOCR


# Define Directories

In [None]:
input_dir = "PDFs"
output_dir = "RESULTS" 
deb_output_dir = "DEBUG_IMGs"

# Unstructured Tables Extraction (OCR Based)

Function to Page wise Image Conversion

In [None]:
def extract_pdf_pages(input_dir, output_base_folder="INPUTS\PDF IMAGEs\Pages", dpi=300):


    os.makedirs(output_base_folder, exist_ok=True)
    pdf_files = [f for f in os.listdir(input_dir) if f.lower().endswith('.pdf')]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(input_dir, pdf_file)
        pdf_name = os.path.splitext(pdf_file)[0]
        pdf_output_folder = os.path.join(output_base_folder, f"{pdf_name}-Pages")
        os.makedirs(pdf_output_folder, exist_ok=True)

        pdf_document = fitz.open(pdf_path)
        page_count = pdf_document.page_count
        print(f"Processing '{pdf_file}' ({page_count} pages)...")

        for page_number in range(page_count):
            page = pdf_document[page_number]
            pix = page.get_pixmap(dpi=dpi, colorspace=fitz.csRGB)  # Render at specified DPI in RGB
            image_path = os.path.join(pdf_output_folder, f'page_{page_number + 1}.png')
            pix.save(image_path)
            print(f'Saved: {image_path}')

        pdf_document.close()
        print(f"Finished processing '{pdf_file}'! Pages saved in '{pdf_output_folder}'\n")

    print("\n All PDFs processed successfully! Extracted pages are stored in '{output_base_folder}'.")

extract_pdf_pages(input_dir)

Function to Cropping Tables

In [None]:
# from unstructured.partition.image import partition_image

def extract_and_crop_tables(input_base_folder, output_base_folder="INPUTS\PDF IMAGEs\Cropped Tables", top_left_padding=5, bottom_right_padding=7):

    os.makedirs(output_base_folder, exist_ok=True)

    subfolders = [f for f in os.listdir(input_base_folder) if os.path.isdir(os.path.join(input_base_folder, f))]

    for subfolder in subfolders:
        input_folder = os.path.join(input_base_folder, subfolder)
        output_folder = os.path.join(output_base_folder, f"{subfolder}-Cropped")

        # Ensure the output folder exists for each subfolder
        os.makedirs(output_folder, exist_ok=True)

        # Function to crop an image with padding and save it
        def crop_with_padding(image_path, coordinates, output_folder, filename):
            """Crop the specified coordinates from the image, apply padding, and save it."""
            image = cv2.imread(image_path)
            if image is None:
                print(f"Error loading image: {image_path}")
                return

            x_min = int(min(pt[0] for pt in coordinates))
            y_min = int(min(pt[1] for pt in coordinates))
            x_max = int(max(pt[0] for pt in coordinates))
            y_max = int(max(pt[1] for pt in coordinates))

            # Apply top-left and bottom-right padding
            x_min_padded = max(0, x_min - top_left_padding)
            y_min_padded = max(0, y_min - top_left_padding)
            x_max_padded = min(image.shape[1], x_max + bottom_right_padding * 2)
            y_max_padded = min(image.shape[0], y_max + bottom_right_padding)

            # Crop the image
            cropped_image = image[y_min_padded:y_max_padded, x_min_padded:x_max_padded]

            # Save cropped image
            output_path = os.path.join(output_folder, filename)
            cv2.imwrite(output_path, cropped_image)
            print(f"Cropped image saved to: {output_path}")

        # Dictionary to count tables per image
        table_counter = {}

        # Get all image files in the input folder
        image_files = [f for f in os.listdir(input_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

        # Process each image file
        for image_file in image_files:
            image_path = os.path.join(input_folder, image_file)
            image_name = os.path.splitext(image_file)[0]  # Extract image name without extension

            # Perform table detection using Unstructured's partition_image
            elements = partition_image(filename=image_path, infer_table_structure=True, strategy='hi_res')

            # Convert elements to JSON structure
            element_dict = [el.to_dict() for el in elements]

            # Extract and crop tables
            for item in element_dict:
                if isinstance(item, dict) and item.get("type") == "Table":
                    try:
                        coordinates = item["metadata"]["coordinates"]["points"]

                        # Count tables per image
                        table_counter[image_name] = table_counter.get(image_name, 0) + 1
                        table_number = table_counter[image_name]

                        # Set output filename for cropped table image
                        filename = f"{image_name}_Table_{table_number}.png"

                        # Crop and save the table
                        crop_with_padding(image_path, coordinates, output_folder, filename)

                    except KeyError as e:
                        print(f"Missing key {e} in item: {item}")

        print(f"\n Finished processing folder: {subfolder}! Cropped tables saved in: {output_folder}")

    print("\n All image folders processed successfully! Cropped tables are stored in:", output_base_folder)

input_directory = "INPUTS\PDF IMAGEs\Pages"
output_directory = "INPUTS\PDF IMAGEs\Cropped Tables"

extract_and_crop_tables(input_directory, output_directory)

Function to Recognise and format tables as csv

In [8]:
def extract_tables_from_images(image_folder, output_folder="Extracted Tables"):

    ocr = PaddleOCR(lang='en')

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Function to compute intersection of two bounding boxes
    def intersection(box_1, box_2):
        return [box_2[0], box_1[1], box_2[2], box_1[3]]

    # Function to compute IoU (Intersection over Union)
    def iou(box_1, box_2):
        x_1 = max(box_1[0], box_2[0])
        y_1 = max(box_1[1], box_2[1])
        x_2 = min(box_1[2], box_2[2])
        y_2 = min(box_1[3], box_2[3])

        inter = abs(max((x_2 - x_1, 0)) * max((y_2 - y_1), 0))
        if inter == 0:
            return 0

        box_1_area = abs((box_1[2] - box_1[0]) * (box_1[3] - box_1[1]))
        box_2_area = abs((box_2[2] - box_2[0]) * (box_2[3] - box_2[1]))

        return inter / float(box_1_area + box_2_area - inter)

    # Get all image files from the folder
    image_files = [f for f in os.listdir(image_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

    # Process each image
    for image_file in image_files:
        image_path = os.path.join(image_folder, image_file)
        image_cv = cv2.imread(image_path)

        # Ensure the image was loaded correctly
        if image_cv is None:
            print(f"Error loading image: {image_path}")
            continue

        image_height, image_width = image_cv.shape[:2]

        # Perform OCR
        output = ocr.ocr(image_path)[0]

        # Extract bounding boxes, detected text, and confidence scores
        boxes = [line[0] for line in output]
        texts = [line[1][0] for line in output]
        probabilities = [line[1][1] for line in output]

        # Copy image for processing
        im = image_cv.copy()

        horiz_boxes = []
        vert_boxes = []

        # Generate horizontal and vertical bounding boxes
        for box in boxes:
            x_h, x_v = 0, int(box[0][0])
            y_h, y_v = int(box[0][1]), 0
            width_h, width_v = image_width, int(box[2][0] - box[0][0])
            height_h, height_v = int(box[2][1] - box[0][1]), image_height

            horiz_boxes.append([x_h, y_h, x_h + width_h, y_h + height_h])
            vert_boxes.append([x_v, y_v, x_v + width_v, y_v + height_v])

            cv2.rectangle(im, (x_h, y_h), (x_h + width_h, y_h + height_h), (0, 0, 255), 1)
            cv2.rectangle(im, (x_v, y_v), (x_v + width_v, y_v + height_v), (0, 255, 0), 1)

        # Apply Non-Maximum Suppression (NMS) for horizontal boxes
        horiz_out = tf.image.non_max_suppression(
            horiz_boxes, probabilities, max_output_size=1000, iou_threshold=0.1, score_threshold=float('-inf')
        )
        horiz_lines = np.sort(np.array(horiz_out))

        im_nms = image_cv.copy()

        for val in horiz_lines:
            cv2.rectangle(im_nms, (int(horiz_boxes[val][0]), int(horiz_boxes[val][1])),
                          (int(horiz_boxes[val][2]), int(horiz_boxes[val][3])), (0, 0, 255), 1)

        # Apply Non-Maximum Suppression (NMS) for vertical boxes
        vert_out = tf.image.non_max_suppression(
            vert_boxes, probabilities, max_output_size=1000, iou_threshold=0.1, score_threshold=float('-inf')
        )
        vert_lines = np.sort(np.array(vert_out))

        for val in vert_lines:
            cv2.rectangle(im_nms, (int(vert_boxes[val][0]), int(vert_boxes[val][1])),
                          (int(vert_boxes[val][2]), int(vert_boxes[val][3])), (255, 0, 0), 1)

        # Create an empty table structure
        out_array = [["" for _ in range(len(vert_lines))] for _ in range(len(horiz_lines))]

        # Sort bounding boxes based on vertical position
        unordered_boxes = [vert_boxes[i][0] for i in vert_lines]
        ordered_boxes = np.argsort(unordered_boxes)

        # Fill the table using intersection and IoU logic
        for i in range(len(horiz_lines)):
            for j in range(len(vert_lines)):
                resultant = intersection(horiz_boxes[horiz_lines[i]], vert_boxes[vert_lines[ordered_boxes[j]]])

                for b in range(len(boxes)):
                    the_box = [boxes[b][0][0], boxes[b][0][1], boxes[b][2][0], boxes[b][2][1]]
                    if iou(resultant, the_box) > 0.1:
                        out_array[i][j] = texts[b]

        # Convert to a structured array
        out_array = np.array(out_array)

        # Save extracted text and structure as a CSV file with the image filename
        csv_filename = f"{os.path.splitext(image_file)[0]}.csv"
        csv_output_path = os.path.join(output_folder, csv_filename)
        pd.DataFrame(out_array).to_csv(csv_output_path, index=False, header=False)

        print(f"Processing completed for {image_file}. Results saved in {output_folder}")

    print("\n All images processed successfully! Extracted tables saved in:", output_folder)

input_directory = "INPUTS\PDF IMAGEs\Cropped Tables"
output_directory = "PDF1 Tables 1"

extract_tables_from_images(input_directory, output_directory)

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to C:\Users\Hemant.Singhsidar/.paddleocr/whl\det\en\en_PP-OCRv3_det_infer\en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:12<00:00, 317.55it/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to C:\Users\Hemant.Singhsidar/.paddleocr/whl\rec\en\en_PP-OCRv4_rec_infer\en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:16<00:00, 590.51it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to C:\Users\Hemant.Singhsidar/.paddleocr/whl\cls\ch_ppocr_mobile_v2.0_cls_infer\ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:11<00:00, 184.58it/s]

[2025/02/11 19:28:02] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\Hemant.Singhsidar/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\Hemant.Singhsidar/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 





 All images processed successfully! Extracted tables saved in: PDF1 Tables 1
