<a href="https://colab.research.google.com/github/Mruthunjay/Data/blob/main/PDF_Extraction_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import fitz  # PyMuPDF
import tabula
import pandas as pd
from collections import Counter
import os
import easygui


def get_most_common_color(pix):
    """
    Return the most frequently occurring pixel color of a pixmap.

    The per-color usage statistics come from
    ``Pixmap.color_count(colors=True)``, which maps the raw bytes of each
    distinct pixel value to the number of pixels carrying that value.

    Args:
        pix (fitz.Pixmap): The pixmap object containing image data.

    Returns:
        tuple: The most common color as a tuple of integer components, one
        entry per pixel component (``(R, G, B)`` for an RGB pixmap
        without alpha).

    Raises:
        ValueError: If the pixmap reports no pixels at all.
    """
    usage = pix.color_count(colors=True)  # {pixel bytes: pixel count}
    if not usage:
        raise ValueError("pixmap contains no pixels")

    # Pick the pixel value with the highest count; iterating the bytes object
    # yields its integer components, giving e.g. (255, 255, 255).
    dominant_pixel = max(usage, key=usage.get)
    return tuple(dominant_pixel)


def extract_background_colors(pdf_path):
    """
    Extracts the most common (background) color from each page of a PDF.

    Every page is rendered to a pixmap with the default ``get_pixmap()``
    settings and the dominant pixel color of that rendering is taken as the
    page's background color.

    Args:
        pdf_path (str): The file path to the PDF document.

    Returns:
        list: A list of ``(page_number, color)`` tuples, with page numbers
        starting at 1 and ``color`` as returned by
        ``get_most_common_color``.
    """
    page_colors = []

    # The context manager guarantees the document is closed even when
    # rendering or color analysis of a page raises.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap()
            page_colors.append((page_number, get_most_common_color(pix)))

    return page_colors


def extract_shapes(pdf_path):
    """
    Extracts graphical shapes (vector drawings) from each page of a PDF.

    Args:
        pdf_path (str): The file path to the PDF document.

    Returns:
        list: A list of ``(page_number, shapes)`` tuples, with page numbers
        starting at 1 and ``shapes`` being the list of entries returned by
        ``page.get_drawings()`` for that page.
    """
    shapes_info = []

    # The context manager guarantees the document is closed even when
    # reading the drawings of a page raises.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            shapes_info.append((page_number, list(page.get_drawings())))

    return shapes_info


def extract_tables(pdf_path):
    """
    Extracts tables from each page of a PDF.

    The document is read one page at a time so that every table can be
    labelled with the page it was found on. Reading all pages in a single
    call returns one flat list of tables with no page information, which
    makes a running index the only available label; that index diverges from
    the page number as soon as a page holds zero or several tables.

    Args:
        pdf_path (str): The file path to the PDF document.

    Returns:
        list: A list of ``(page_number, table)`` tuples, with page numbers
        starting at 1 and ``table`` a pandas DataFrame. A page with several
        tables contributes several tuples; a page without tables
        contributes none.
    """
    # Only the page count is needed from PyMuPDF; close the document before
    # handing the file to tabula.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)

    tables_info = []
    for page_number in range(1, page_count + 1):
        # NOTE: one tabula call per page is slower than a single
        # pages='all' call, but it is what ties each table to its page.
        page_tables = tabula.read_pdf(
            pdf_path, pages=page_number, multiple_tables=True
        )
        tables_info.extend((page_number, table) for table in page_tables)

    return tables_info


def process_pdfs_in_folder(folder_path, output_path='pdf_extracted_data.xlsx'):
    """
    Processes all PDF files in a specified folder, extracting background colors,
    shapes, and tables. The extracted information is saved in an Excel file
    with the sheets ``BG``, ``Shape`` and ``Tables``.

    Files are matched on a case-insensitive ``.pdf`` extension and handled in
    sorted name order so that the row order of the workbook is reproducible.
    Directory entries that are not regular files are skipped.

    Args:
        folder_path (str): The path to the folder containing PDF files.
        output_path (str): Where to write the Excel workbook. Defaults to
            ``pdf_extracted_data.xlsx`` in the current working directory.
    """
    bg_data = []  # Rows of [document, page, color]
    shapes_data = []  # Rows of [document, page, shape type, shape items]
    tables_data = []  # Rows of [document, page, table rendered as text]

    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for file_name in sorted(os.listdir(folder_path)):
        pdf_path = os.path.join(folder_path, file_name)

        # Accept '.pdf' in any letter case and ignore directories that merely
        # carry a '.pdf' suffix.
        if not file_name.lower().endswith('.pdf') or not os.path.isfile(pdf_path):
            continue

        # Extract background colors
        bg_data.extend(
            [file_name, page_num, color]
            for page_num, color in extract_background_colors(pdf_path)
        )

        # Extract shapes: one row per drawing on each page
        for page_num, shapes in extract_shapes(pdf_path):
            for shape in shapes:
                shapes_data.append(
                    [file_name, page_num, shape['type'], shape['items']]
                )

        # Extract tables: each DataFrame is flattened to text so that it
        # fits into a single worksheet cell
        for page_num, table in extract_tables(pdf_path):
            tables_data.append(
                [file_name, page_num, table.to_string(index=False)]
            )

    # Save extracted data to Excel file, one sheet per kind of information
    with pd.ExcelWriter(output_path) as writer:
        pd.DataFrame(
            bg_data, columns=['Document', 'Page', 'Color']
        ).to_excel(writer, sheet_name='BG', index=False)
        pd.DataFrame(
            shapes_data,
            columns=['Document', 'Page', 'Shape Type', 'Shape Items'],
        ).to_excel(writer, sheet_name='Shape', index=False)
        pd.DataFrame(
            tables_data, columns=['Document', 'Page', 'Table']
        ).to_excel(writer, sheet_name='Tables', index=False)


if __name__ == "__main__":
    # Ask the user for the input folder through a GUI directory picker.
    folder_path = easygui.diropenbox(
        "Select the folder containing PDF files"
    )
    # Run the extraction only when the dialog yielded a non-empty selection.
    if folder_path:
        process_pdfs_in_folder(folder_path)

# New section