In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Configuration
input_file = "data/sample.pdf"  # Replace with a file of your own
batch_size = 10  # Maximum available value is 100

# Input parameters
json_file = "data/sample_0_9.json"
output_prefix = "images/sample_0_9_cropped"

In [3]:
import os
import fitz
 
def split_pdf(input_file, batch_size):
    # Open input_pdf
    input_pdf = fitz.open(input_file)
    num_pages = len(input_pdf)
    print(f"Total number of pages: {num_pages}")
 
    # Split input_pdf
    for start_page in range(0, num_pages, batch_size):
        end_page = min(start_page + batch_size, num_pages) - 1
 
        # Write output_pdf to file
        input_file_basename = os.path.splitext(input_file)[0]
        output_file = f"{input_file_basename}_{start_page}_{end_page}.pdf"
        print(output_file)
        with fitz.open() as output_pdf:
            output_pdf.insert_pdf(input_pdf, from_page=start_page, to_page=end_page)
            output_pdf.save(output_file)
 
    # Close input_pdf
    input_pdf.close()
 
split_pdf(input_file, batch_size)

Total number of pages: 24
data/sample_0_9.pdf
data/sample_10_19.pdf
data/sample_20_23.pdf


In [4]:
from glob import glob
import json
import os
import requests
 
API_KEY = os.environ.get("UPSTAGE_API_KEY")
 
def call_document_parse(input_file, output_file):
    # Send request
    response = requests.post(
        "https://api.upstage.ai/v1/document-digitization",
        headers={"Authorization": f"Bearer {API_KEY}"},
        data={"base64_encoding": "['figure', 'chart', 'table']", "model": "document-parse"}, # base64 encoding for cropped image of the figure category.
        files={"document": open(input_file, "rb")})
 
    # Save response
    if response.status_code == 200:
        with open(output_file, "w") as f:
            json.dump(response.json(), f, ensure_ascii=False)
    else:
        raise ValueError(f"Unexpected status code {response.status_code}.")
 
# Find all shorter PDFs related to input_file
short_input_files = glob(os.path.splitext(input_file)[0] + "_*.pdf")
 
# Send request and save response for all shorter PDFs
for short_input_file in short_input_files:
    print(short_input_file)
    short_output_file = os.path.splitext(short_input_file)[0] + ".json"
    call_document_parse(short_input_file, short_output_file)

data/sample_0_9.pdf
data/sample_20_23.pdf
data/sample_10_19.pdf


In [None]:
from langchain_core.documents import Document
from markdownify import markdownify as md
from bs4 import BeautifulSoup
import base64

In [5]:
with open(json_file, "r") as f:
    data = json.load(f)

In [34]:
unique_categories = tuple(sorted(set(element.get("category") for element in data["elements"] if "category" in element)))

In [35]:
unique_categories

('chart',
 'footer',
 'header',
 'heading1',
 'index',
 'list',
 'paragraph',
 'table')

In [None]:
docs = []
for element in data['elements']:
    metadata = {
        "id": element.get("id"),
        "page": element.get("page"),
        "category": element.get("category"),
        "html": element.get("content", {}).get("html"),
        "base64_encoding": element.get("base64_encoding", None),
        "image_id": [],
        "image_path": [],
             
    }
    docs.append(Document(page_content="", metadata=metadata))


In [147]:
docs[0]

Document(metadata={'id': 0, 'page': 1, 'category': 'header', 'html': "<header id='0' style='font-size:18px'>OECD</header>", 'base64_encoding': None, 'image_id': [], 'image_path': []}, page_content='')

In [149]:
i = 0
for doc in docs:
    # print(doc.page_content)
    if doc.metadata["category"] == "figure" or doc.metadata["category"] == "chart" or doc.metadata["category"] == "table":
        output_file = f"{output_prefix}_{doc.metadata['category']}_{i}.png"
        
        soup = BeautifulSoup(doc.metadata['html'], 'html.parser')
        if doc.metadata['category'] == 'figure':
            soup.find('img')['src'] = output_file
            replaced_html = str(soup)
            image_path = output_file
            doc.metadata['html'] = replaced_html
            
        elif doc.metadata['category'] == 'chart':
            soup.find('img')['src'] = output_file
            replaced_html = str(soup)
            image_path = output_file
            doc.metadata['html'] = replaced_html
        
        elif doc.metadata['category'] == 'table':
            img = soup.new_tag("img", src=output_file)
            soup.insert(0, img)
            replaced_html = str(soup)
            image_path = output_file
            doc.metadata['html'] = replaced_html
                    
        doc.metadata['image_id'].append(doc.metadata['id'])
        doc.metadata['image_path'].append(image_path)
        
        with open (output_file, 'wb') as fh:
            fh.write(base64.decodebytes(str.encode(doc.metadata["base64_encoding"])))
        i+=1
    doc.page_content = md(doc.metadata['html'])

In [150]:
docs[46].metadata['image_id']

[46]

In [None]:
tests = docs.copy()

In [154]:
merged = {}
for test in tests:
    if test.metadata['image_path'] is not []:
        bucket = merged.setdefault(test.metadata['page'], test.model_copy())  # or clone
        bucket.page_content += "\n\n" + test.page_content
        bucket.metadata['image_id'].extend(test.metadata['image_id'])
        bucket.metadata['image_path'].extend(test.metadata['image_path'])

    else:
        bucket = merged.setdefault(test.metadata['page'], test.model_copy())  # or clone
        bucket.page_content += "\n\n" + test.page_content
objects = list(merged.values())

In [168]:
objects[8].metadata

{'id': 68,
 'page': 10,
 'category': 'header',
 'html': "<header id='68' style='font-size:18px'>8 I</header>",
 'base64_encoding': None,
 'image_id': [70, 75, 77],
 'image_path': ['images/sample_0_9_cropped_chart_3.png',
  'images/sample_0_9_cropped_chart_4.png',
  'images/sample_0_9_cropped_chart_5.png']}

In [169]:
# remove data from metadata. we don't need that anymore.
for object in objects:
    del object.metadata['base64_encoding']
    del object.metadata['html']
    del object.metadata['category']
    del object.metadata['id']

In [170]:
objects

 Document(metadata={'page': 4, 'image_id': [], 'image_path': []}, page_content='This work is published under the responsibility of the Secretary-General of the OECD. The opinions expressed and  \narguments employed herein do not necessarily reflect the official views of the Member countries of the OECD.\n\nThis work is published under the responsibility of the Secretary-General of the OECD. The opinions expressed and  \narguments employed herein do not necessarily reflect the official views of the Member countries of the OECD.\n\nThis document, as well as any data and map included herein, are without prejudice to the status of or sovereignty over  \nany territory, to the delimitation of international frontiers and boundaries and to the name of any territory, city or area.\n\nThe statistical data for Israel are supplied by and under the responsibility of the relevant Israeli authorities. The use of  \nsuch data by the OECD is without prejudice to the status of the Golan Heights, East Je

In [171]:
import pickle

with open('results/docs.pkl', 'wb') as f:
    pickle.dump(objects, f)