# Installations

Install the library used for page-wise PDF-to-image conversion

In [None]:
# %pip install pymupdf

Install the library used for detecting and cropping tables

In [None]:
# %%capture
# %pip install "unstructured[all-docs]"

Install the libraries used to recognise tables and format them as CSV

In [None]:
# %pip install paddlepaddle paddleocr tensorflow pandas

# Imports

In [None]:
import os
import logging
import warnings
import pandas as pd
import fitz
import cv2
from unstructured.partition.image import partition_image
import numpy as np
import tensorflow as tf
from paddleocr import PaddleOCR


  from .autonotebook import tqdm as notebook_tqdm


# Configure Warnings and Logging

In [None]:
# Ask TensorFlow's native logging to hide INFO and WARNING messages.
# NOTE(review): tensorflow is already imported by the imports cell above; this
# variable is normally read when TensorFlow loads, so it may need to be set
# before that import to take effect - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Only let ERROR (and above) through from the "ppocr" logger.
logging.getLogger("ppocr").setLevel(logging.ERROR)
# Silence every Python warning for the rest of the session; this is global and
# also hides deprecation notices from unrelated libraries.
warnings.filterwarnings('ignore')

# Directory Configuration

In [None]:
def setup_directories(base_dir):
    """Create the pipeline's output folders under ``base_dir``.

    Args:
        base_dir: Root folder that will hold all extracted artefacts.

    Returns:
        dict mapping the keys "text", "tables", "csv" and "images" (in that
        order) to the path of the corresponding sub-folder. Every folder is
        created if it does not exist yet; existing folders are left alone.
    """
    layout = (
        ("text", "Extracted Text"),
        ("tables", "Extracted Tables"),
        ("csv", "Extracted Tables CSV"),
        ("images", "Extracted Images"),
    )

    directories = {}
    for key, folder_name in layout:
        target = os.path.join(base_dir, folder_name)
        os.makedirs(target, exist_ok=True)
        directories[key] = target

    return directories

# Helper Functions

In [None]:
def intersection(box_1, box_2):
    """Build the box where a row box and a column box cross.

    The result takes its horizontal extent (indices 0 and 2) from ``box_2``
    and its vertical extent (indices 1 and 3) from ``box_1``. No overlap test
    is performed, so this is only a geometric intersection when ``box_1``
    spans the full width and ``box_2`` the full height.

    Returns:
        list ``[box_2[0], box_1[1], box_2[2], box_1[3]]``.
    """
    x_start, x_end = box_2[0], box_2[2]
    y_start, y_end = box_1[1], box_1[3]
    return [x_start, y_start, x_end, y_end]

def iou(box_1, box_2):
    """Return the Intersection-over-Union of two ``[x1, y1, x2, y2]`` boxes.

    Returns:
        ``0`` when the boxes do not overlap (or only touch), otherwise the
        overlap area divided by the area of the union, as a float.
    """
    # Corners of the overlapping rectangle (if any).
    left = max(box_1[0], box_2[0])
    top = max(box_1[1], box_2[1])
    right = min(box_1[2], box_2[2])
    bottom = min(box_1[3], box_2[3])

    # Negative extents mean "no overlap" and are clamped to zero.
    overlap_w = max(right - left, 0)
    overlap_h = max(bottom - top, 0)
    inter = abs(overlap_w * overlap_h)
    if inter == 0:
        return 0

    area_1 = abs((box_1[2] - box_1[0]) * (box_1[3] - box_1[1]))
    area_2 = abs((box_2[2] - box_2[0]) * (box_2[3] - box_2[1]))
    union = area_1 + area_2 - inter

    return inter / float(union)

# Core Processing Functions

In [None]:
def process_table(element, page_name, pdf_name, image_cv, directories, ocr, padding):
    """Crop one detected table out of a page image, save it and run OCR on it.

    Args:
        element: Element dictionary; the table outline is read from
            ``element["metadata"]["coordinates"]["points"]`` as (x, y) pairs.
        page_name: Prefix used for the saved crop's filename.
        pdf_name: Used to name the per-document sub-folders.
        image_cv: Page image indexed ``[row, column]``; ``shape[0]`` and
            ``shape[1]`` are used to clamp the crop to the image.
        directories: Mapping providing the "tables" and "csv" root folders.
        ocr: OCR engine, passed through to ``process_table_ocr``.
        padding: Mapping with 'left', 'top', 'right' and 'bottom' margins
            (in pixels) added around the detected outline.

    Returns:
        The value returned by ``process_table_ocr``, or ``None`` when the
        element lacks coordinates, the outline has no points, or the clamped
        crop region is empty.
    """
    try:
        coordinates = element["metadata"]["coordinates"]["points"]
        table_folder = os.path.join(directories["tables"], f"{pdf_name}-Tables")
        csv_folder = os.path.join(directories["csv"], f"{pdf_name}-csv")

        os.makedirs(table_folder, exist_ok=True)
        os.makedirs(csv_folder, exist_ok=True)

        xs = [pt[0] for pt in coordinates]
        ys = [pt[1] for pt in coordinates]
        if not xs:
            # min()/max() would raise ValueError on an empty outline.
            print(f"Skipping table without coordinates on {page_name}")
            return None

        # Extract table coordinates and apply padding, clamped to the image
        x_min = max(0, int(min(xs)) - padding['left'])
        y_min = max(0, int(min(ys)) - padding['top'])
        x_max = min(image_cv.shape[1], int(max(xs)) + padding['right'])
        y_max = min(image_cv.shape[0], int(max(ys)) + padding['bottom'])

        # A zero-area crop cannot be written as an image; skip it instead of
        # letting the image writer fail with an error this function does not catch.
        if x_max <= x_min or y_max <= y_min:
            print(f"Skipping empty table region on {page_name}")
            return None

        # Process table image and save; the running number is derived from
        # how many files the table folder already contains.
        cropped_table = image_cv[y_min:y_max, x_min:x_max]
        table_filename = f"{page_name}_Table_{len(os.listdir(table_folder)) + 1}.png"
        table_path = os.path.join(table_folder, table_filename)
        cv2.imwrite(table_path, cropped_table)

        # OCR and structure extraction
        return process_table_ocr(table_path, table_filename, csv_folder, ocr)

    except (KeyError, IndexError) as e:
        print(f"Error processing table: {e}")
        return None

def process_table_ocr(table_path, table_filename, csv_folder, ocr):
    """Run OCR on a cropped table image and save the recognised grid as CSV.

    Args:
        table_path: Path of the cropped table image handed to ``ocr.ocr``.
        table_filename: Name of that image; its stem names the CSV file.
        csv_folder: Folder the CSV file is written to.
        ocr: Object exposing ``ocr(path)``. The first item of its result is
            iterated as lines of the form ``[box, (text, probability)]``.

    Returns:
        The structure produced by ``extract_table_structure`` (also written
        to CSV when truthy), or ``None`` when OCR produced nothing.

    Note:
        ``extract_table_structure`` is not defined in the visible part of
        this notebook; it must be provided elsewhere - TODO confirm.
    """
    ocr_result = ocr.ocr(table_path)
    # Guard the container before indexing: a None or empty result would
    # otherwise raise TypeError / IndexError instead of being reported.
    output = ocr_result[0] if ocr_result else None
    if not output:
        print(f"No OCR output for table: {table_filename}")
        return None

    # Extract text and structure
    boxes = [line[0] for line in output]
    texts = [line[1][0] for line in output]
    probabilities = [line[1][1] for line in output]

    # Process and save structured data
    structured_data = extract_table_structure(boxes, texts, probabilities)
    if structured_data:
        csv_filename = f"{os.path.splitext(table_filename)[0]}.csv"
        csv_path = os.path.join(csv_folder, csv_filename)
        pd.DataFrame(structured_data).to_csv(csv_path, index=False, header=False)
        print(f"Saved CSV: {csv_filename}")

    return structured_data

# Main Pipeline Function

In [None]:
def process_pdf_documents(input_dir, output_base_dir="TEST RESULT7", dpi=300,
                         pad_left=5, pad_top=5, pad_right=14, pad_bottom=7):
    """
    Run the extraction pipeline over every PDF found directly in ``input_dir``.

    Args:
        input_dir: Folder that is listed for files ending in ".pdf"
            (case-insensitive).
        output_base_dir: Root folder handed to ``setup_directories``.
        dpi: Forwarded unchanged to ``process_single_pdf``.
        pad_left, pad_top, pad_right, pad_bottom: Margins bundled into the
            ``padding`` mapping that is forwarded to ``process_single_pdf``.

    Note:
        ``process_single_pdf`` is not defined in the visible part of this
        notebook; it must be provided elsewhere - TODO confirm.
    """
    # Output folders first, then the OCR engine, then the padding bundle.
    directories = setup_directories(output_base_dir)
    ocr = PaddleOCR(lang='en')
    padding = dict(left=pad_left, top=pad_top, right=pad_right, bottom=pad_bottom)

    # Collect the PDFs in directory-listing order.
    pdf_files = []
    for entry in os.listdir(input_dir):
        if entry.lower().endswith('.pdf'):
            pdf_files.append(entry)
    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        process_single_pdf(pdf_file, input_dir, directories, ocr, padding, dpi)

    # Summary of where each kind of artefact was written.
    print("\nProcessing complete!")
    for dir_type in directories:
        print(f"{dir_type.capitalize()} extracted to: {directories[dir_type]}")

# Example Usage

In [None]:
# Example entry point: the guard keeps the run from triggering when this code
# is imported as a module rather than executed directly.
if __name__ == "__main__":
    # Folder that is scanned for PDFs, and the root folder for all outputs.
    input_directory = "TEST INPUT PDF"
    output_directory = "TEST_RESULT_1"
    
    # Use default padding
    process_pdf_documents(input_directory, output_directory)
    
    # Or use custom padding
    # process_pdf_documents(input_directory, output_directory,
    #                      pad_left=10, pad_top=8, pad_right=16, pad_bottom=12)

In [1]:
# 1. INSTALLATIONS
#----------------

# 1.1 PDF Processing Libraries
# %pip install pymupdf

# 1.2 Table Detection Libraries
# %pip install "unstructured[all-docs]"

# 1.3 OCR and Image Processing Libraries
# %pip install paddlepaddle paddleocr tensorflow

# 2. IMPORTS
#-----------

# 2.1 Basic Libraries
import os
import logging
import warnings

# 2.2 Data Processing Libraries
import pandas as pd
import numpy as np

# 2.3 PDF and Image Processing
import fitz
import cv2
from unstructured.partition.image import partition_image

# 2.4 Deep Learning Libraries
import tensorflow as tf
from paddleocr import PaddleOCR

# 2.5 Configure Warnings and Logging
# NOTE(review): tensorflow is imported a few lines above; this variable is
# normally read when TensorFlow loads, so it may need to be set before that
# import to take effect - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow warnings
logging.getLogger("ppocr").setLevel(logging.ERROR)  # Suppress PaddleOCR debug messages
# Silences every Python warning for the rest of the session (global effect).
warnings.filterwarnings('ignore')

# 3. DIRECTORY CONFIGURATION
#--------------------------
# input_dir is scanned for PDFs; output_dir is the root of all generated folders.
input_dir = "TEST INPUT PDF"
output_dir = "TEST_RESULT_2"

# 4. PROCESSING PARAMETERS
#------------------------
# 4.1 Default Parameters
# NOTE(review): DEFAULT_DPI and DEFAULT_PADDING are not referenced anywhere in
# the visible part of this notebook; the processing function carries its own
# identical defaults in its signature.
DEFAULT_DPI = 300
DEFAULT_PADDING = {
    'left': 5,
    'top': 5,
    'right': 14,
    'bottom': 7
}

# 4.2 Custom Parameters (if needed)
# Only used by the commented-out "custom settings" call in the execution section.
pad_left = 5
pad_top = 5
pad_right = 14
pad_bottom = 7

# 5. HELPER FUNCTIONS
#-------------------
def intersection(box_1, box_2):
    """Combine a row box and a column box into the cell where they cross.

    Indices 0 and 2 (the x-extent) come from ``box_2``; indices 1 and 3 (the
    y-extent) come from ``box_1``. Nothing checks that the boxes overlap.

    Returns:
        list ``[box_2[0], box_1[1], box_2[2], box_1[3]]``.
    """
    cell = [None, None, None, None]
    cell[0], cell[2] = box_2[0], box_2[2]
    cell[1], cell[3] = box_1[1], box_1[3]
    return cell

def iou(box_1, box_2):
    """Compute Intersection over Union for two ``[x1, y1, x2, y2]`` boxes.

    Returns:
        ``0`` if the boxes share no area, else overlap area / union area
        as a float.
    """
    # Extent of the shared region along each axis, clamped at zero.
    shared_w = max(min(box_1[2], box_2[2]) - max(box_1[0], box_2[0]), 0)
    shared_h = max(min(box_1[3], box_2[3]) - max(box_1[1], box_2[1]), 0)

    inter = abs(shared_w * shared_h)
    if inter == 0:
        return 0

    def _area(box):
        # Absolute value keeps the area positive for any corner ordering.
        return abs((box[2] - box[0]) * (box[3] - box[1]))

    return inter / float(_area(box_1) + _area(box_2) - inter)

# 6. MAIN PROCESSING FUNCTION
#---------------------------
def process_pdf_documents_update(input_dir, output_base_dir="TEST RESULT7", dpi=300,
                               pad_left=5, pad_top=5, pad_right=14, pad_bottom=7):
    """
    Process PDFs to extract text, tables, embedded images, and create CSV files in a single pass.

    For every file ending in ".pdf" (case-insensitive) directly inside
    ``input_dir``, each page is:
      1. scanned for embedded images, which are written to disk;
      2. rendered to an image at ``dpi`` and handed to ``partition_image``;
      3. split into elements - "Table" elements are cropped, OCR'd and saved
         as CSV, every other element with text is appended to the page's
         text file.

    Args:
        input_dir: Folder containing the PDF files.
        output_base_dir: Root folder for the "Extracted Text", "Extracted
            Tables", "Extracted Tables CSV" and "Extracted Images" folders.
        dpi: Resolution used when rendering a page to an image.
        pad_left, pad_top, pad_right, pad_bottom: Margins (pixels) added
            around each detected table before cropping.

    Returns:
        None. Results are written to disk and progress is printed.

    Notes:
        * Text files are opened in append mode, so running the pipeline twice
          against the same output folder appends the text a second time.
        * Table numbering is derived from the number of files already in the
          PDF's table folder, so it keeps counting across pages.
    """
    # 6.1 Initialize Output Directories
    text_output_folder = os.path.join(output_base_dir, "Extracted Text")
    tables_output_folder = os.path.join(output_base_dir, "Extracted Tables")
    tables_csv_folder = os.path.join(output_base_dir, "Extracted Tables CSV")
    images_output_folder = os.path.join(output_base_dir, "Extracted Images")

    for folder in [text_output_folder, tables_output_folder, tables_csv_folder, images_output_folder]:
        os.makedirs(folder, exist_ok=True)

    # 6.2 Define Image Extraction Function
    def extract_embedded_images(pdf_document, page, pdf_name, page_number):
        """Write every image embedded in ``page`` to the PDF's image folder."""
        images_folder = os.path.join(images_output_folder, f"{pdf_name}-images")
        os.makedirs(images_folder, exist_ok=True)

        images = page.get_images(full=True)
        print(f"  Page {page_number + 1} has {len(images)} embedded image(s).")

        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            # The bytes are written unchanged, so name the file after the
            # format reported for them instead of always claiming ".png".
            image_ext = base_image.get("ext") or "png"
            image_filename = f"Page_{page_number + 1}_Image_{img_index + 1}.{image_ext}"
            image_path = os.path.join(images_folder, image_filename)

            with open(image_path, "wb") as img_file:
                img_file.write(image_bytes)
            print(f"    Saved embedded image: {image_path}")

    # 6.3 Define Element Processing Function
    def process_element(element, page_name, pdf_name, image_cv):
        """Handle one partitioned element: tables become CSV, the rest becomes text."""
        if element.get("type") == "Table":
            try:
                coordinates = element["metadata"]["coordinates"]["points"]
                table_folder = os.path.join(tables_output_folder, f"{pdf_name}-Tables")
                csv_folder = os.path.join(tables_csv_folder, f"{pdf_name}-csv")

                os.makedirs(table_folder, exist_ok=True)
                os.makedirs(csv_folder, exist_ok=True)

                # Extract and pad coordinates
                x_min = int(min(pt[0] for pt in coordinates))
                y_min = int(min(pt[1] for pt in coordinates))
                x_max = int(max(pt[0] for pt in coordinates))
                y_max = int(max(pt[1] for pt in coordinates))

                x_min = max(0, x_min - pad_left)
                y_min = max(0, y_min - pad_top)
                x_max = min(image_cv.shape[1], x_max + pad_right)
                y_max = min(image_cv.shape[0], y_max + pad_bottom)

                # A zero-area crop cannot be saved as an image; skip it rather
                # than fail with an error the handler below does not catch.
                if x_max <= x_min or y_max <= y_min:
                    print(f"Skipping empty table region on {page_name}")
                    return

                # Process table image
                cropped_table = image_cv[y_min:y_max, x_min:x_max]
                table_filename = f"{page_name}_Table_{len(os.listdir(table_folder)) + 1}.png"
                table_path = os.path.join(table_folder, table_filename)
                cv2.imwrite(table_path, cropped_table)
                print(f"Cropped table saved to: {table_path}")

                # OCR Processing
                ocr_result = ocr.ocr(table_path)
                # Guard the container before indexing so a None/empty result
                # is reported instead of raising.
                output = ocr_result[0] if ocr_result else None
                if not output:
                    print(f"No OCR output for table: {table_filename}")
                    return

                # Extract text and structure
                boxes = [line[0] for line in output]
                texts = [line[1][0] for line in output]
                probabilities = [line[1][1] for line in output]

                # Generate table structure: stretch every text box into a
                # full-width "row" box and a full-height "column" box.
                image_height, image_width = cropped_table.shape[:2]
                horiz_boxes = []
                vert_boxes = []

                for box in boxes:
                    x_h, x_v = 0, int(box[0][0])
                    y_h, y_v = int(box[0][1]), 0
                    width_h, width_v = image_width, int(box[2][0] - box[0][0])
                    height_h, height_v = int(box[2][1] - box[0][1]), image_height

                    horiz_boxes.append([x_h, y_h, x_h + width_h, y_h + height_h])
                    vert_boxes.append([x_v, y_v, x_v + width_v, y_v + height_v])

                # Apply NMS so overlapping stretched boxes collapse into one
                # row / column each.
                horiz_out = tf.image.non_max_suppression(
                    horiz_boxes, probabilities, max_output_size=1000, iou_threshold=0.1
                )
                vert_out = tf.image.non_max_suppression(
                    vert_boxes, probabilities, max_output_size=1000, iou_threshold=0.1
                )

                horiz_lines = np.sort(np.array(horiz_out))
                vert_lines = np.sort(np.array(vert_out))

                # Create and fill table; columns are ordered by their left edge.
                out_array = [["" for _ in range(len(vert_lines))] for _ in range(len(horiz_lines))]
                unordered_boxes = [vert_boxes[i][0] for i in vert_lines]
                ordered_boxes = np.argsort(unordered_boxes)

                for i in range(len(horiz_lines)):
                    for j in range(len(vert_lines)):
                        resultant = intersection(
                            horiz_boxes[horiz_lines[i]],
                            vert_boxes[vert_lines[ordered_boxes[j]]]
                        )

                        # The last text box overlapping the cell wins.
                        for b in range(len(boxes)):
                            the_box = [boxes[b][0][0], boxes[b][0][1], boxes[b][2][0], boxes[b][2][1]]
                            if iou(resultant, the_box) > 0.1:
                                out_array[i][j] = texts[b]

                # Save CSV
                csv_filename = f"{os.path.splitext(table_filename)[0]}.csv"
                csv_path = os.path.join(csv_folder, csv_filename)
                pd.DataFrame(out_array).to_csv(csv_path, index=False, header=False)
                print(f"Saved CSV: {csv_filename} in {csv_folder}")

            except (KeyError, IndexError) as e:
                print(f"Error processing table: {e}")

        else:
            # Process text elements
            text_content = element.get("text", "")
            if text_content:
                text_folder = os.path.join(text_output_folder, f"{pdf_name}-Texts")
                os.makedirs(text_folder, exist_ok=True)
                text_filename = f"{page_name}_text.txt"
                text_path = os.path.join(text_folder, text_filename)

                with open(text_path, "a", encoding="utf-8") as text_file:
                    text_file.write(text_content + "\n")

    # 6.4 Find PDFs
    pdf_files = [f for f in os.listdir(input_dir) if f.lower().endswith('.pdf')]
    print(f"Found {len(pdf_files)} PDF files to process")

    # 6.5 Initialize OCR (only when there is something to process, so an
    # empty input folder does not pay for loading the OCR models)
    ocr = PaddleOCR(lang='en') if pdf_files else None

    # 6.6 Process PDFs
    for pdf_file in pdf_files:
        pdf_path = os.path.join(input_dir, pdf_file)
        pdf_name = os.path.splitext(pdf_file)[0]
        print(f"\nProcessing PDF: {pdf_file}")

        pdf_document = fitz.open(pdf_path)
        try:
            page_count = pdf_document.page_count

            for page_number in range(page_count):
                page = pdf_document[page_number]
                page_name = f'page_{page_number + 1}'

                # Extract images
                extract_embedded_images(pdf_document, page, pdf_name, page_number)

                # Convert page to image
                pix = page.get_pixmap(dpi=dpi, colorspace=fitz.csRGB)
                img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
                image_cv = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

                # Process page via a temporary file that is always removed
                temp_image_path = os.path.join(output_base_dir, "temp_page.png")
                cv2.imwrite(temp_image_path, image_cv)

                try:
                    elements = partition_image(filename=temp_image_path,
                                            infer_table_structure=True,
                                            strategy='hi_res')
                    element_dict = [el.to_dict() for el in elements]

                    for element in element_dict:
                        process_element(element, page_name, pdf_name, image_cv)

                finally:
                    if os.path.exists(temp_image_path):
                        os.remove(temp_image_path)

                print(f"Processed page {page_number + 1}/{page_count}")
        finally:
            # Release the file handle even when a page fails part-way through.
            pdf_document.close()

        print(f"Finished processing PDF: {pdf_file}")

    print("\nProcessing complete!")
    print(f"Text extracted to: {text_output_folder}")
    print(f"Tables extracted to: {tables_output_folder}")
    print(f"Table CSVs saved to: {tables_csv_folder}")
    print(f"Embedded images extracted to: {images_output_folder}")

# 7. EXECUTION
#------------
# 7.1 Process with Default Settings
# Runs the whole pipeline as soon as this cell executes: PDFs are read from
# input_dir and results are written under output_dir, using the function's
# default dpi and padding values.
process_pdf_documents_update(input_dir, output_dir)

# 7.2 Process with Custom Settings (commented out)
# process_pdf_documents_update(input_dir, output_dir, 
#                            pad_left=pad_left,
#                            pad_top=pad_top, 
#                            pad_right=pad_right,
#                            pad_bottom=pad_bottom)

# 8. RESULTS VERIFICATION (Optional)
#---------------------------------
# 8.1 Load and Display Sample Results
# df1 = pd.read_csv('PDF1 Tables/budget_speech-Pages-Cropped/page_40_Table_1.csv')
# df1

# 8.2 Load Another Sample
# df2 = pd.read_csv('PDF1 Tables/budget_speech-Pages-Cropped/page_31_Table_1.csv')
# df2

  from .autonotebook import tqdm as notebook_tqdm


Found 1 PDF files to process

Processing PDF: 2501.00663v1.pdf
  Page 1 has 0 embedded image(s).
Processed page 1/27
  Page 2 has 0 embedded image(s).
Processed page 2/27
  Page 3 has 0 embedded image(s).
Processed page 3/27
  Page 4 has 0 embedded image(s).
Processed page 4/27
  Page 5 has 0 embedded image(s).
Processed page 5/27
  Page 6 has 0 embedded image(s).
Processed page 6/27
  Page 7 has 1 embedded image(s).
    Saved embedded image: TEST_RESULT_2\Extracted Images\2501.00663v1-images\Page_7_Image_1.png
Processed page 7/27
  Page 8 has 1 embedded image(s).
    Saved embedded image: TEST_RESULT_2\Extracted Images\2501.00663v1-images\Page_8_Image_1.png
Processed page 8/27
  Page 9 has 2 embedded image(s).
    Saved embedded image: TEST_RESULT_2\Extracted Images\2501.00663v1-images\Page_9_Image_1.png
    Saved embedded image: TEST_RESULT_2\Extracted Images\2501.00663v1-images\Page_9_Image_2.png
Processed page 9/27
  Page 10 has 1 embedded image(s).
    Saved embedded image: TEST_R