In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
!wget https://www.oecd.org/content/dam/oecd/en/publications/reports/2025/09/oecd-economic-outlook-interim-report-september-2025_ae3d418b/67b10c01-en.pdf -O data/sample.pdf

--2025-11-12 13:52:21--  https://www.oecd.org/content/dam/oecd/en/publications/reports/2025/09/oecd-economic-outlook-interim-report-september-2025_ae3d418b/67b10c01-en.pdf
Resolving www.oecd.org (www.oecd.org)... 104.18.1.146, 104.18.0.146, 2606:4700::6812:192, ...
Connecting to www.oecd.org (www.oecd.org)|104.18.1.146|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2152981 (2.1M) [application/pdf]
Saving to: ‘data/sample.pdf’


2025-11-12 13:52:21 (11.6 MB/s) - ‘data/sample.pdf’ saved [2152981/2152981]



In [4]:
# Configuration
input_file = "data/sample.pdf"  # Replace with a file of your own
batch_size = 10  # Maximum available value is 100

# Input parameters

output_prefix = "/images/sample_cropped"

In [5]:
import os
import fitz
 
def split_pdf(input_file, batch_size):
    # Open input_pdf
    input_pdf = fitz.open(input_file)
    num_pages = len(input_pdf)
    print(f"Total number of pages: {num_pages}")
 
    # Split input_pdf
    for start_page in range(0, num_pages, batch_size):
        end_page = min(start_page + batch_size, num_pages) - 1
 
        # Write output_pdf to file
        input_file_basename = os.path.splitext(input_file)[0]
        output_file = f"{input_file_basename}_{start_page}_{end_page}.pdf"
        print(output_file)
        with fitz.open() as output_pdf:
            output_pdf.insert_pdf(input_pdf, from_page=start_page, to_page=end_page)
            output_pdf.save(output_file)
 
    # Close input_pdf
    input_pdf.close()
 
split_pdf(input_file, batch_size)

Total number of pages: 24
data/sample_0_9.pdf
data/sample_10_19.pdf
data/sample_20_23.pdf


In [6]:
from glob import glob
import json
import os
import requests
 
API_KEY = os.environ.get("UPSTAGE_API_KEY")
 
def call_document_parse(input_file, output_file):
    # Send request
    response = requests.post(
        "https://api.upstage.ai/v1/document-digitization",
        headers={"Authorization": f"Bearer {API_KEY}"},
        data={"base64_encoding": "['figure', 'chart', 'table']", "model": "document-parse"}, # base64 encoding for cropped image of the figure category.
        files={"document": open(input_file, "rb")})
 
    # Save response
    if response.status_code == 200:
        with open(output_file, "w") as f:
            json.dump(response.json(), f, ensure_ascii=False)
    else:
        raise ValueError(f"Unexpected status code {response.status_code}.")
 
# Find all shorter PDFs related to input_file
short_input_files = glob(os.path.splitext(input_file)[0] + "_*.pdf")
 
# Send request and save response for all shorter PDFs
for short_input_file in short_input_files:
    print(short_input_file)
    short_output_file = os.path.splitext(short_input_file)[0] + ".json"
    call_document_parse(short_input_file, short_output_file)

data/sample_0_9.pdf
data/sample_20_23.pdf
data/sample_10_19.pdf


In [7]:
json_data_arr = []
for short_input_file in sorted(short_input_files):
    short_output_file = os.path.splitext(short_input_file)[0] + ".json"
    print(short_output_file)
    with open(short_output_file, "r") as f:
        json_data_arr.append(json.load(f))

data/sample_0_9.json
data/sample_10_19.json
data/sample_20_23.json


In [8]:
len(json_data_arr)

3

In [9]:
from langchain_core.documents import Document
from markdownify import markdownify as md
from bs4 import BeautifulSoup
import base64

In [10]:
last_id, last_page = None, None

for data in json_data_arr:
    
    for idx, element in enumerate(data['elements']):
        
        if last_id is not None and last_page is not None:
            start_id = last_id + 1 # id는 0부터 시작하기 때문에 다음 시작 아이디는 1을 더하고 시작
            element['id'] = start_id + element['id'] 
            element['page'] = last_page + element['page']

        if idx == len(data['elements']) - 1:
            last_id = element['id']
            last_page = element['page']
           
            


In [11]:
# 유효성 검사
for idx1, data in enumerate(json_data_arr):
    for idx2, element in enumerate(data['elements']):
        if idx2 == 0:
            print("start id", element['id'])
            print("start page", element['page'],"\n")
        if idx2 == len(data['elements']) - 1:
            print("end id", element['id'])  
            print("end page", element['page'],"\n")

start id 0
start page 1 

end id 79
end page 10 

start id 80
start page 11 

end id 165
end page 20 

start id 166
start page 21 

end id 199
end page 24 



In [12]:
for idx1, data in enumerate(json_data_arr):
    for idx2, element in enumerate(data['elements']):
        print(element['id'], element['page'])
              

0 1
1 1
2 1
3 1
4 3
5 3
6 3
7 3
8 4
9 4
10 4
11 4
12 4
13 4
14 4
15 4
16 4
17 4
18 4
19 4
20 4
21 4
22 4
23 4
24 4
25 4
26 4
27 4
28 4
29 4
30 4
31 4
32 4
33 4
34 5
35 5
36 5
37 5
38 6
39 6
40 6
41 6
42 6
43 7
44 7
45 7
46 7
47 7
48 7
49 7
50 7
51 8
52 8
53 8
54 8
55 8
56 8
57 8
58 9
59 9
60 9
61 9
62 9
63 9
64 9
65 9
66 9
67 9
68 10
69 10
70 10
71 10
72 10
73 10
74 10
75 10
76 10
77 10
78 10
79 10
80 11
81 11
82 11
83 11
84 11
85 11
86 11
87 11
88 12
89 12
90 12
91 12
92 12
93 12
94 13
95 13
96 13
97 13
98 13
99 13
100 13
101 13
102 13
103 13
104 14
105 14
106 14
107 14
108 14
109 14
110 14
111 14
112 14
113 14
114 14
115 15
116 15
117 15
118 15
119 15
120 15
121 15
122 15
123 15
124 15
125 16
126 16
127 16
128 16
129 16
130 16
131 16
132 16
133 16
134 17
135 17
136 17
137 17
138 17
139 17
140 18
141 18
142 18
143 18
144 18
145 18
146 18
147 18
148 18
149 19
150 19
151 19
152 19
153 19
154 19
155 19
156 19
157 20
158 20
159 20
160 20
161 20
162 20
163 20
164 20
165 20
166 21
167 21
16

In [13]:
docs = []
for data in json_data_arr:
    doc = []   
    for element in data['elements']:        
        metadata = {
            "id": element.get("id"),
            "page": element.get("page"),
            "category": element.get("category"),
            "html": element.get("content", {}).get("html"),
            "base64_encoding": element.get("base64_encoding", None),
            "image_id": [],
            "image_path": [],
            "text_summary": [],
            "image_summary": []                
        }
        doc.append(Document(page_content="", metadata=metadata))
    docs.extend(doc)


In [14]:
categories = set()
for doc in docs:
    category = doc.metadata.get('category')
    if category is not None:
        categories.add(category)
print(categories)


{'index', 'figure', 'list', 'chart', 'header', 'heading1', 'footer', 'paragraph', 'table'}


In [15]:
for doc in docs:
    if doc.metadata['category'] == 'list':
        print(doc)

page_content='' metadata={'id': 41, 'page': 6, 'category': 'list', 'html': "<p id='41' data-category='list' style='font-size:18px'>· Global growth was more resilient than anticipated in the first half of 2025, especially in many<br>emerging-market economies. Industrial production and trade were supported by front-loading<br>ahead of higher tariffs. Strong Al-related investment boosted outcomes in the United States and<br>fiscal support in China outweighed the drag from trade headwinds and property market weakness.<br>● US bilateral tariff rates have increased on almost all countries since May. The overall effective US<br>tariff rate rose to an estimated 19.5% at the end of August, the highest rate since 1933.<br>The full effects of tariff increases have yet to be felt - with many changes being phased in over<br>time and companies initially absorbing some tariff increases through margins - but are becoming<br>increasingly visible in spending choices, labour markets and consumer prices.<

In [16]:
docs[0]

Document(metadata={'id': 0, 'page': 1, 'category': 'header', 'html': "<header id='0' style='font-size:18px'>OECD</header>", 'base64_encoding': None, 'image_id': [], 'image_path': [], 'text_summary': [], 'image_summary': []}, page_content='')

In [17]:
for idx, doc in enumerate(docs):
    # print(doc.page_content)
    if doc.metadata["category"] == "figure" or doc.metadata["category"] == "chart" or doc.metadata["category"] == "table":
        output_file = f"{output_prefix}_{doc.metadata['category']}_{idx}.png"
        output_file_path = output_file[1:]

        soup = BeautifulSoup(doc.metadata['html'], 'html.parser')
        if doc.metadata['category'] == 'figure':
            soup.find('img')['src'] = output_file
            replaced_html = str(soup)
            image_path = output_file
            doc.metadata['html'] = replaced_html
            
        elif doc.metadata['category'] == 'chart':
            soup.find('img')['src'] = output_file
            replaced_html = str(soup)
            image_path = output_file
            doc.metadata['html'] = replaced_html
        
        elif doc.metadata['category'] == 'table':
            img = soup.new_tag("img", src=output_file)
            soup.insert(0, img)
            replaced_html = str(soup)
            image_path = output_file
            doc.metadata['html'] = replaced_html
                    
        doc.metadata['image_id'].append(doc.metadata['id'])
        doc.metadata['image_path'].append(image_path)
        
        with open (output_file_path, 'wb') as fh:
            fh.write(base64.decodebytes(str.encode(doc.metadata["base64_encoding"])))
        
    doc.page_content = md(doc.metadata['html'])

In [18]:
output_prefix

'/images/sample_cropped'

In [19]:
docs[53].metadata['image_path']

['/images/sample_cropped_table_53.png']

In [20]:
docs[-1].metadata['base64_encoding']

In [21]:
merged = {}
for doc in docs:
    if doc.metadata['image_path'] is not []:
        bucket = merged.setdefault(doc.metadata['page'], doc.model_copy())  # or clone
        bucket.page_content += "\n\n" + doc.page_content
        bucket.metadata['image_id'].extend(doc.metadata['image_id'])
        bucket.metadata['image_path'].extend(doc.metadata['image_path'])

    else:
        bucket = merged.setdefault(doc.metadata['page'], doc.model_copy())  # or clone
        bucket.page_content += "\n\n" + doc.page_content
objects = list(merged.values())

In [22]:
objects


 Document(metadata={'id': 8, 'page': 4, 'category': 'paragraph', 'html': "<p id='8' data-category='paragraph' style='font-size:20px'>This work is published under the responsibility of the Secretary-General of the OECD. The opinions expressed and<br>arguments employed herein do not necessarily reflect the official views of the Member countries of the OECD.</p>", 'base64_encoding': None, 'image_id': [], 'image_path': [], 'text_summary': [], 'image_summary': []}, page_content='This work is published under the responsibility of the Secretary-General of the OECD. The opinions expressed and  \narguments employed herein do not necessarily reflect the official views of the Member countries of the OECD.\n\nThis work is published under the responsibility of the Secretary-General of the OECD. The opinions expressed and  \narguments employed herein do not necessarily reflect the official views of the Member countries of the OECD.\n\nThis document, as well as any data and map included herein, are w

In [23]:
# remove data from metadata. we don't need that anymore.
for object in objects:
    del object.metadata['base64_encoding']
    del object.metadata['html']
    del object.metadata['category']
    del object.metadata['id']

In [24]:
objects

 Document(metadata={'page': 4, 'image_id': [], 'image_path': [], 'text_summary': [], 'image_summary': []}, page_content='This work is published under the responsibility of the Secretary-General of the OECD. The opinions expressed and  \narguments employed herein do not necessarily reflect the official views of the Member countries of the OECD.\n\nThis work is published under the responsibility of the Secretary-General of the OECD. The opinions expressed and  \narguments employed herein do not necessarily reflect the official views of the Member countries of the OECD.\n\nThis document, as well as any data and map included herein, are without prejudice to the status of or sovereignty over  \nany territory, to the delimitation of international frontiers and boundaries and to the name of any territory, city or area.\n\nThe statistical data for Israel are supplied by and under the responsibility of the relevant Israeli authorities. The use of  \nsuch data by the OECD is without prejudice to

In [25]:
import pickle

with open('outputs/docs.pkl', 'wb') as f:
    pickle.dump(objects, f)

In [26]:
arr = []
for object in objects:
    arr.append(object.page_content)

In [27]:
markdown = "\n\n".join(arr)

In [28]:
with open('outputs/markdown.md', 'w') as f:
    f.write(markdown)