In [102]:
# %%capture
# %pip install "unstructured[all-docs]"

In [None]:
from IPython.display import JSON

import json

from unstructured.partition.html import partition_html
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements, elements_to_json

In [2]:
filename = "budget_speech.pdf"

In [117]:
poppler_path = r"C:/Users/Hemant.Singhsidar/Downloads/Release-24.08.0-0/poppler-24.08.0/Library/bin"

In [3]:
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(filename=filename,
                         infer_table_structure=True,
                         strategy='hi_res',
           )

In [None]:
len(elements)

In [None]:

element_dict = [el.to_dict() for el in elements]
output = json.dumps(element_dict, indent=2)
print(output)

unique_types = set()

for item in element_dict:
    unique_types.add(item['type'])

print(unique_types)

In [None]:
import json

def get_page_and_coordinates_by_type(output_str, element_type):
    """
    Filter elements by type and return their page numbers and coordinates.
    
    Args:
        output_str (str): JSON string containing document elements
        element_type (str): Type of element to filter (e.g., 'Title', 'Image', etc.)
    
    Returns:
        list: List of dictionaries containing page numbers and coordinates for matched elements
    """
    # Parse JSON string into Python object if it's a string
    if isinstance(output_str, str):
        output = json.loads(output_str)
    else:
        output = output_str
        
    filtered_data = []
    
    for item in output:
        if item['type'] == element_type:
            filtered_data.append({
                'page_number': item['metadata']['page_number'],
                'coordinates': item['metadata']['coordinates']['points'],
                'text': item['text']  # Including text for reference
            })
    
    # Print total count of elements found
    print(f"\nTotal {element_type}s found: {len(filtered_data)}")
    return filtered_data

# Example usage:
# Get all Tables
tables = get_page_and_coordinates_by_type(output, 'Table')
for table in tables:
    print(f"\nPage: {table['page_number']}")  # Fixed variable name from 'title' to 'table'
    print(f"Text: {table['text']}")
    print(f"Coordinates: {table['coordinates']}")

# Get all Images (uncomment to use)
# images = get_page_and_coordinates_by_type(output, 'Image')
# for image in images:
#     print(f"\nPage: {image['page_number']}")
#     print(f"Coordinates: {image['coordinates']}")

In [None]:
tables = [el for el in elements if el.category == "Table"]

print(tables[0].text)
print(tables[0].metadata.text_as_html)

In [None]:
tables

In [None]:
len(tables)

In [None]:
tables[0].text

In [None]:
tables[0].metadata

### Now, comes the most interesting part ( utilizing the extracted data in most efficient way)

- It's helpful to have an HTML representation of the table so that you can the information to an LLM while maintaining the table structure.

In [24]:
table_html = tables[0].metadata.text_as_html

In [None]:
table_html

In [128]:
# # view what the HTML in the metadata field looks like

# from io import StringIO 
# from lxml import etree

# parser = etree.XMLParser(remove_blank_text=True)
# file_obj = StringIO(table_html)
# tree = etree.parse(file_obj, parser)
# print(etree.tostring(tree, pretty_print=True).decode())

In [129]:
# # let's display this table

# from IPython.core.display import HTML
# HTML(table_html)

#### Convert to pandas df

In [136]:
# %pip install pandas

In [None]:
import pandas as pd

# Convert HTML table to pandas DataFrame
dfs = pd.read_html(table_html)

In [None]:
dfs

In [None]:

# Assuming there's only one table, get the DataFrame
df = dfs[0]

# Now you have the DataFrame
print(df)


In [None]:
df.shape

In [None]:
df.head()

In [38]:
import os
import json
from unstructured.partition.image import partition_image

def extract_text_from_images(input_folder, pdf_name):
    # Define the output directory
    text_output_folder = os.path.join("RESULTS", "PAGE_TEXT", f"{pdf_name}-texts")

    # Ensure the output folder exists
    os.makedirs(text_output_folder, exist_ok=True)

    # Get all image files in the input folder
    image_files = [f for f in os.listdir(input_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

    # Process each image file
    for image_file in image_files:
        image_path = os.path.join(input_folder, image_file)

        # Define `image_name` at the start of the loop
        image_name = os.path.splitext(image_file)[0]

        # Perform table detection using Unstructured's partition_image
        elements = partition_image(filename=image_path, infer_table_structure=True, strategy='hi_res')

        # Convert elements to JSON structure
        element_dict = [el.to_dict() for el in elements]

        # Extract text if type is not "Table"
        for item in element_dict:
            if isinstance(item, dict) and item.get("type") != "Table":
                text_content = item.get("text", "")
                if text_content:
                    text_filename = f"{image_name}_text.txt"
                    text_output_path = os.path.join(text_output_folder, text_filename)
                    with open(text_output_path, "a", encoding="utf-8") as text_file:
                        text_file.write(text_content + "\n")
                    print(f"Text extracted and saved to: {text_output_path}")

    print(f"\nAll images processed successfully! Text saved in '{text_output_folder}'.")

# Example usage
input_folder = "Pdf1 Pages"
pdf_name = "PDF1"
extract_text_from_images(input_folder, pdf_name)

Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\PAGE_TEXT\PDF1-texts\page_1_text.txt
Text extracted and saved to: RESULTS\P