In [1]:
# %pip install -r crop_requirments.txt

In [2]:
# poppler_path = r"C:/Users/Hemant.Singhsidar/Downloads/Release-24.08.0-0/poppler-24.08.0/Library/bin"

In [None]:
import os
import json
import cv2
from unstructured.partition.image import partition_image

# Input and output directories, given relative to the current working directory.
# `input_folder` is listed later in this cell to find the .png/.jpg/.jpeg files to process;
# `output_folder` is the directory the cropped table images are written to.
input_folder = "Pdf1 Pages"
output_folder = "PDF1 Cropped Tables"

# Create the output folder up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(output_folder, exist_ok=True)

# Function to crop an image with padding and save it
def crop_with_padding(image_path, coordinates, output_folder, filename, top_left_padding, bottom_right_padding):
    """Crop the padded bounding box of ``coordinates`` from an image and save it.

    The axis-aligned bounding box of the points is grown by
    ``top_left_padding`` pixels to the left and top, by
    ``bottom_right_padding`` pixels at the bottom and by twice
    ``bottom_right_padding`` pixels to the right, then clamped to the image
    before the crop is taken.

    Args:
        image_path: Path of the image, read with ``cv2.imread``.
        coordinates: Iterable of ``(x, y)`` points; values are truncated to int.
        output_folder: Directory that receives the crop (created if missing).
        filename: File name of the crop inside ``output_folder``.
        top_left_padding: Pixels added to the left of and above the box.
        bottom_right_padding: Pixels added below the box; the right edge
            receives twice this amount.

    Returns:
        The path of the saved crop, or ``None`` when nothing was written
        (unreadable image, no points, empty crop region, or failed write).
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error loading image: {image_path}")
        return None

    points = list(coordinates)
    if not points:
        # min()/max() would raise ValueError on an empty sequence.
        print(f"No coordinates to crop for: {image_path}")
        return None

    xs = [pt[0] for pt in points]
    ys = [pt[1] for pt in points]
    height, width = image.shape[:2]

    # Clamp every edge to [0, dimension]. Clamping the upper edges at 0 matters:
    # a negative slice end would be interpreted as "from the end" and silently
    # crop the wrong region.
    # NOTE(review): the right edge gets double the bottom/right padding; kept
    # exactly as before -- confirm this asymmetry is intended.
    left = max(0, min(width, int(min(xs)) - top_left_padding))
    top = max(0, min(height, int(min(ys)) - top_left_padding))
    right = max(0, min(width, int(max(xs)) + bottom_right_padding * 2))
    bottom = max(0, min(height, int(max(ys)) + bottom_right_padding))

    if right <= left or bottom <= top:
        # An empty array cannot be written by cv2.imwrite.
        print(f"Empty crop region for {filename} in {image_path}; skipping.")
        return None

    cropped_image = image[top:bottom, left:right]

    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, filename)
    # cv2.imwrite reports failure through its return value, not an exception.
    if not cv2.imwrite(output_path, cropped_image):
        print(f"Failed to write cropped image: {output_path}")
        return None

    print(f"Cropped image saved to: {output_path}")
    return output_path

# Padding (in pixels) passed to crop_with_padding for every detected table.
TABLE_TOP_LEFT_PADDING = 5
TABLE_BOTTOM_RIGHT_PADDING = 7

# Number of tables found so far, keyed by image name (file name without extension)
table_counter = {}

# Collect the image files in the input folder. os.listdir returns entries in
# arbitrary order, so sort them to make the processing order reproducible.
image_files = sorted(
    f for f in os.listdir(input_folder)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)

# Process each image file
for image_file in image_files:
    image_path = os.path.join(input_folder, image_file)
    image_name = os.path.splitext(image_file)[0]

    # Detect layout elements (including tables) with Unstructured's partition_image
    elements = partition_image(filename=image_path, infer_table_structure=True, strategy='hi_res')

    # Convert elements to plain dictionaries
    element_dict = [el.to_dict() for el in elements]

    # Crop every element reported as a table
    for item in element_dict:
        if not (isinstance(item, dict) and item.get("type") == "Table"):
            continue

        # Only the metadata lookup can raise KeyError, so keep the try narrow.
        try:
            coordinates = item["metadata"]["coordinates"]["points"]
        except KeyError as e:
            print(f"Missing key {e} in item: {item}")
            continue

        # Tables are numbered per image, starting at 1
        table_counter[image_name] = table_counter.get(image_name, 0) + 1
        table_number = table_counter[image_name]

        # Output filename for the cropped table image
        filename = f"{image_name}_Table_{table_number}.png"

        # Crop and save the table
        crop_with_padding(
            image_path,
            coordinates,
            output_folder,
            filename,
            top_left_padding=TABLE_TOP_LEFT_PADDING,
            bottom_right_padding=TABLE_BOTTOM_RIGHT_PADDING,
        )

# Report the folder that was actually written to, not a hard-coded name.
print(f"\nAll images processed successfully! Cropped tables saved in '{output_folder}'.")


  from .autonotebook import tqdm as notebook_tqdm


Cropped image saved to: PDF1 Cropped Tables\page_1_Table_1.png
Cropped image saved to: PDF1 Cropped Tables\page_2_Table_1.png
Cropped image saved to: PDF1 Cropped Tables\page_2_Table_2.png
Cropped image saved to: PDF1 Cropped Tables\page_2_Table_3.png
Cropped image saved to: PDF1 Cropped Tables\page_2_Table_4.png
Cropped image saved to: PDF1 Cropped Tables\page_3_Table_1.png
Cropped image saved to: PDF1 Cropped Tables\page_4_Table_1.png
Cropped image saved to: PDF1 Cropped Tables\page_4_Table_2.png
Cropped image saved to: PDF1 Cropped Tables\page_5_Table_1.png
Cropped image saved to: PDF1 Cropped Tables\page_5_Table_2.png
Cropped image saved to: PDF1 Cropped Tables\page_6_Table_1.png
Cropped image saved to: PDF1 Cropped Tables\page_6_Table_2.png
Cropped image saved to: PDF1 Cropped Tables\page_7_Table_1.png
Cropped image saved to: PDF1 Cropped Tables\page_7_Table_2.png
Cropped image saved to: PDF1 Cropped Tables\page_8_Table_1.png

All images processed successfully! Cropped tables save

In [None]:
import os
import json
import cv2
from unstructured.partition.image import partition_image

# Input and output directories, given relative to the current working directory.
# `input_folder` is listed later in this cell to find the .png/.jpg/.jpeg files to process;
# `output_folder` receives the cropped table images and `text_output_folder`
# receives one text file per image.
input_folder = "Pdf1 Pages"
output_folder = "PDF1 Cropped Tables"
text_output_folder = "PDF1 Extracted Text"

# Create both output folders up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(output_folder, exist_ok=True)
os.makedirs(text_output_folder, exist_ok=True)

# Function to crop an image with padding and save it
def crop_with_padding(image_path, coordinates, output_folder, filename, top_left_padding, bottom_right_padding):
    """Crop the padded bounding box of ``coordinates`` from an image and save it.

    The axis-aligned bounding box of the points is grown by
    ``top_left_padding`` pixels to the left and top, by
    ``bottom_right_padding`` pixels at the bottom and by twice
    ``bottom_right_padding`` pixels to the right, then clamped to the image
    before the crop is taken.

    Args:
        image_path: Path of the image, read with ``cv2.imread``.
        coordinates: Iterable of ``(x, y)`` points; values are truncated to int.
        output_folder: Directory that receives the crop (created if missing).
        filename: File name of the crop inside ``output_folder``.
        top_left_padding: Pixels added to the left of and above the box.
        bottom_right_padding: Pixels added below the box; the right edge
            receives twice this amount.

    Returns:
        The path of the saved crop, or ``None`` when nothing was written
        (unreadable image, no points, empty crop region, or failed write).
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error loading image: {image_path}")
        return None

    points = list(coordinates)
    if not points:
        # min()/max() would raise ValueError on an empty sequence.
        print(f"No coordinates to crop for: {image_path}")
        return None

    xs = [pt[0] for pt in points]
    ys = [pt[1] for pt in points]
    height, width = image.shape[:2]

    # Clamp every edge to [0, dimension]. Clamping the upper edges at 0 matters:
    # a negative slice end would be interpreted as "from the end" and silently
    # crop the wrong region.
    # NOTE(review): the right edge gets double the bottom/right padding; kept
    # exactly as before -- confirm this asymmetry is intended.
    left = max(0, min(width, int(min(xs)) - top_left_padding))
    top = max(0, min(height, int(min(ys)) - top_left_padding))
    right = max(0, min(width, int(max(xs)) + bottom_right_padding * 2))
    bottom = max(0, min(height, int(max(ys)) + bottom_right_padding))

    if right <= left or bottom <= top:
        # An empty array cannot be written by cv2.imwrite.
        print(f"Empty crop region for {filename} in {image_path}; skipping.")
        return None

    cropped_image = image[top:bottom, left:right]

    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, filename)
    # cv2.imwrite reports failure through its return value, not an exception.
    if not cv2.imwrite(output_path, cropped_image):
        print(f"Failed to write cropped image: {output_path}")
        return None

    print(f"Cropped image saved to: {output_path}")
    return output_path

# Padding (in pixels) passed to crop_with_padding for every detected table.
TABLE_TOP_LEFT_PADDING = 5
TABLE_BOTTOM_RIGHT_PADDING = 7

# Number of tables found so far, keyed by image name (file name without extension)
table_counter = {}

# Text files already written during this run. The first write to a path
# truncates it so re-running the cell does not duplicate text; later writes to
# the same path (two images sharing a name but not an extension) append.
text_files_started = set()

# Collect the image files in the input folder. os.listdir returns entries in
# arbitrary order, so sort them to make the processing order reproducible.
image_files = sorted(
    f for f in os.listdir(input_folder)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)

# Process each image file
for image_file in image_files:
    image_path = os.path.join(input_folder, image_file)
    image_name = os.path.splitext(image_file)[0]

    # Detect layout elements (including tables) with Unstructured's partition_image
    elements = partition_image(filename=image_path, infer_table_structure=True, strategy='hi_res')

    # Convert elements to plain dictionaries
    element_dict = [el.to_dict() for el in elements]

    # Text of all non-table elements of this image, in element order
    text_lines = []

    # Crop tables; collect the text of everything else
    for item in element_dict:
        if not isinstance(item, dict):
            continue

        if item.get("type") != "Table":
            text_content = item.get("text", "")
            if text_content:
                text_lines.append(text_content)
            continue

        # Only the metadata lookup can raise KeyError, so keep the try narrow.
        try:
            coordinates = item["metadata"]["coordinates"]["points"]
        except KeyError as e:
            print(f"Missing key {e} in item: {item}")
            continue

        # Tables are numbered per image, starting at 1
        table_counter[image_name] = table_counter.get(image_name, 0) + 1
        table_number = table_counter[image_name]

        # Output filename for the cropped table image
        filename = f"{image_name}_Table_{table_number}.png"

        # Crop and save the table
        crop_with_padding(
            image_path,
            coordinates,
            output_folder,
            filename,
            top_left_padding=TABLE_TOP_LEFT_PADDING,
            bottom_right_padding=TABLE_BOTTOM_RIGHT_PADDING,
        )

    # Write the collected text once per image instead of reopening the file
    # (and printing a message) for every element.
    if text_lines:
        text_filename = f"{image_name}_text.txt"
        text_output_path = os.path.join(text_output_folder, text_filename)
        mode = "a" if text_output_path in text_files_started else "w"
        text_files_started.add(text_output_path)
        with open(text_output_path, mode, encoding="utf-8") as text_file:
            text_file.writelines(line + "\n" for line in text_lines)
        print(f"Text extracted and saved to: {text_output_path}")

print(
    f"\nAll images processed successfully! Cropped tables saved in '{output_folder}' "
    f"and text saved in '{text_output_folder}'."
)