In [None]:
# %pip install -r recog_requirments.txt

In [None]:
import os
import cv2
import numpy as np
import tensorflow as tf
import pandas as pd
from paddleocr import PaddleOCR

# One shared OCR engine, configured for English and reused for every image.
ocr = PaddleOCR(lang="en")

# Directory that is scanned for cropped table images.
image_folder = "PDF1 Cropped Tables"
# Directory that receives one CSV file per processed image.
output_folder = "PDF1 Tables"

# Make sure the destination exists; an already-present directory is fine.
os.makedirs(output_folder, exist_ok=True)

def intersection(box_1, box_2):
    """Return the cell where a row band and a column band cross.

    Both boxes are indexable as ``[x1, y1, x2, y2]``. The vertical extent
    (``y1``, ``y2``) is taken from ``box_1`` and the horizontal extent
    (``x1``, ``x2``) from ``box_2``.

    Returns:
        A new list ``[x1, y1, x2, y2]`` describing the crossing region.
    """
    left, right = box_2[0], box_2[2]
    top, bottom = box_1[1], box_1[3]
    return [left, top, right, bottom]

def iou(box_1, box_2):
    """Compute the Intersection over Union (IoU) of two boxes.

    Both boxes are indexable as ``[x1, y1, x2, y2]``.

    Returns:
        The integer ``0`` when the boxes do not overlap (touching edges
        count as no overlap); otherwise the overlap area divided by the
        area of the union, as a float.
    """
    # Width and height of the overlapping region, clamped at zero.
    overlap_w = max(min(box_1[2], box_2[2]) - max(box_1[0], box_2[0]), 0)
    overlap_h = max(min(box_1[3], box_2[3]) - max(box_1[1], box_2[1]), 0)

    inter = abs(overlap_w * overlap_h)
    if inter == 0:
        return 0

    area_1 = abs((box_1[2] - box_1[0]) * (box_1[3] - box_1[1]))
    area_2 = abs((box_2[2] - box_2[0]) * (box_2[3] - box_2[1]))

    # Union = sum of both areas minus the part that was counted twice.
    union = area_1 + area_2 - inter
    return inter / float(union)

# Collect the table images to process. Sorting makes the processing order
# (and therefore the log output) deterministic, since os.listdir returns
# entries in arbitrary order.
image_files = sorted(
    f for f in os.listdir(image_folder)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)

# Number of images for which no CSV could be produced.
skipped_count = 0

# Process each image: OCR -> row/column bands -> NMS -> cell grid -> CSV
for image_file in image_files:
    image_path = os.path.join(image_folder, image_file)
    image_cv = cv2.imread(image_path)

    # cv2.imread signals an unreadable file by returning None, not by raising
    if image_cv is None:
        print(f"Error loading image: {image_path}")
        skipped_count += 1
        continue

    image_height, image_width = image_cv.shape[:2]

    # Perform OCR. The per-image entry can be None (or empty) when no text
    # is detected; without this guard the comprehensions below would raise
    # and an empty box list would be handed to non-max suppression.
    ocr_result = ocr.ocr(image_path)
    output = ocr_result[0] if ocr_result else None
    if not output:
        print(f"No text detected in image: {image_path}")
        skipped_count += 1
        continue

    # Each OCR line is indexed as line[0] -> corner points and
    # line[1] -> (text, confidence). Corner 0 is used as the top-left point
    # and corner 2 as the bottom-right point throughout.
    boxes = [line[0] for line in output]
    texts = [line[1][0] for line in output]
    probabilities = [line[1][1] for line in output]

    # Axis-aligned [x1, y1, x2, y2] rectangle of every detected text box,
    # built once here instead of inside the cell-filling loops.
    text_rects = [[box[0][0], box[0][1], box[2][0], box[2][1]] for box in boxes]

    # Debug overlay of all candidate bands. It is only drawn in memory;
    # nothing below writes it to disk.
    im = image_cv.copy()

    horiz_boxes = []
    vert_boxes = []

    # For every text box build a full-width row band (same y-range) and a
    # full-height column band (same x-range).
    for box in boxes:
        x_left, y_top = int(box[0][0]), int(box[0][1])
        box_width = int(box[2][0] - box[0][0])
        box_height = int(box[2][1] - box[0][1])

        horiz_box = [0, y_top, image_width, y_top + box_height]
        vert_box = [x_left, 0, x_left + box_width, image_height]
        horiz_boxes.append(horiz_box)
        vert_boxes.append(vert_box)

        cv2.rectangle(im, (horiz_box[0], horiz_box[1]), (horiz_box[2], horiz_box[3]), (0, 0, 255), 1)
        cv2.rectangle(im, (vert_box[0], vert_box[1]), (vert_box[2], vert_box[3]), (0, 255, 0), 1)

    # Apply Non-Maximum Suppression (NMS) so that overlapping row bands
    # collapse into one row each.
    # NOTE: TensorFlow documents boxes as [y1, x1, y2, x2]; passing
    # [x1, y1, x2, y2] for every box is equivalent here because IoU does not
    # change when both axes are swapped consistently.
    horiz_out = tf.image.non_max_suppression(
        horiz_boxes, probabilities, max_output_size=1000, iou_threshold=0.1, score_threshold=float('-inf')
    )
    horiz_lines = np.sort(np.array(horiz_out))

    # Debug overlay of the bands that survived NMS (in memory only).
    im_nms = image_cv.copy()

    for val in horiz_lines:
        cv2.rectangle(im_nms, (int(horiz_boxes[val][0]), int(horiz_boxes[val][1])),
                      (int(horiz_boxes[val][2]), int(horiz_boxes[val][3])), (0, 0, 255), 1)

    # Same suppression for the column bands.
    vert_out = tf.image.non_max_suppression(
        vert_boxes, probabilities, max_output_size=1000, iou_threshold=0.1, score_threshold=float('-inf')
    )
    vert_lines = np.sort(np.array(vert_out))

    for val in vert_lines:
        cv2.rectangle(im_nms, (int(vert_boxes[val][0]), int(vert_boxes[val][1])),
                      (int(vert_boxes[val][2]), int(vert_boxes[val][3])), (255, 0, 0), 1)

    # Order rows top-to-bottom by their top edge rather than relying on the
    # OCR detection order. The sort is stable, so bands with the same top
    # edge keep their index order.
    row_ids = sorted(horiz_lines, key=lambda idx: horiz_boxes[idx][1])

    # Order columns left-to-right by their left edge.
    unordered_boxes = [vert_boxes[i][0] for i in vert_lines]
    ordered_boxes = np.argsort(unordered_boxes)
    col_ids = [vert_lines[k] for k in ordered_boxes]

    # Create an empty table structure: one row per kept row band and one
    # column per kept column band.
    out_array = [["" for _ in col_ids] for _ in row_ids]

    # Fill each cell with the text whose box overlaps the row/column crossing.
    # If several text boxes match the same cell, the last match wins.
    for i, row_id in enumerate(row_ids):
        for j, col_id in enumerate(col_ids):
            resultant = intersection(horiz_boxes[row_id], vert_boxes[col_id])

            for the_box, text in zip(text_rects, texts):
                if iou(resultant, the_box) > 0.1:
                    out_array[i][j] = text

    # Convert to a structured array
    out_array = np.array(out_array)

    # Save the table as a header-less CSV named after the image file
    csv_filename = f"{os.path.splitext(image_file)[0]}.csv"
    csv_output_path = os.path.join(output_folder, csv_filename)
    pd.DataFrame(out_array).to_csv(csv_output_path, index=False, header=False)

    print(f"Processing completed for {image_file}. Results saved in {output_folder}")

# Only claim full success when every image actually produced a CSV.
if skipped_count:
    print(f"\nFinished: {len(image_files) - skipped_count} image(s) processed, {skipped_count} skipped.")
else:
    print("\nAll images processed successfully!")