In [1]:
!pip install python-dotenv pytesseract langchain "unstructured[all-docs]" pydantic lxml langchainhub pdf2image

Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting pytesseract
  Using cached pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting langchain
  Using cached langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic
  Using cached pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting lxml
  Using cached lxml-5.3.1-cp311-cp311-win_amd64.whl.metadata (3.8 kB)
Collecting langchainhub
  Using cached langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting pdf2image
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting unstructured[all-docs]
  Using cached unstructured-0.17.2-py3-none-any.whl.metadata (24 kB)
Collecting Pillow>=8.0.0 (from pytesseract)
  Using cached pillow-11.1.0-cp311-cp311-win_amd64.whl.metadata (9.3 kB)
Collecting langchain-core<1.0.0,>=0.3.45 (from langchain)
  Downloading langchain_core-0.3.48-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-t

In [2]:
# Imports and environment setup; the Tesseract executable path is set at the end of this cell
import pytesseract
import cv2
from unstructured.partition.pdf import partition_pdf

import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Point pytesseract at the Tesseract binary. TESSERACT_CMD (from the environment
# or .env) overrides the Windows default so the notebook can run on other machines.
pytesseract.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# PDF processed by the cells below (relative path, resolved against the working directory).
PDF_PATH = "../data/UNTR.pdf"

In [25]:
from pdf2image import convert_from_path

# Function to split PDF into images
def split_pdf(pdf_path, output_dir="img"):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF to rasterize.
        output_dir: Directory that receives the page images. It is created
            when missing. Defaults to ``"img"``.

    Returns:
        list[str]: One path per page, in page order, of the form
        ``"<output_dir>/page_<i>.png"`` with ``i`` starting at 0.
    """
    # Saving fails when the target directory does not exist (e.g. on a fresh
    # checkout), so make sure it is there before writing any page.
    os.makedirs(output_dir, exist_ok=True)

    page_files = []
    for i, page in enumerate(convert_from_path(pdf_path)):
        # Keep the forward-slash form so returned paths look the same on every OS.
        page_path = f"{output_dir}/page_{i}.png"
        page.save(page_path, "PNG")
        page_files.append(page_path)
    return page_files

# Render the PDF pages to image files and keep their paths for the OCR step.
pages_path = split_pdf(PDF_PATH)
print(pages_path)

['img/page_0.png', 'img/page_1.png', 'img/page_2.png']


In [26]:
# OCR each rendered page image with Tesseract; tesseract_text holds one string per page, in page order.
# NOTE(review): no `lang` is passed here, while partition_pdf in this notebook is given
# ["eng", "ind"] — consider lang="eng+ind" once the language data is confirmed installed.
tesseract_text = []
for page in pages_path:
    page_img = cv2.imread(page)
    # cv2.imread reports a missing/unreadable file by returning None rather than
    # raising, so fail here with the offending path instead of inside pytesseract.
    if page_img is None:
        raise FileNotFoundError(f"Could not read page image: {page}")
    result = pytesseract.image_to_string(page_img)
    tesseract_text.append(result)

print(tesseract_text)

['PT UNITED TRACTORS Tbk\nDAN ENTITAS ANAK/AND SUBSIDIARIES\n\nLAPORAN KEUANGAN KONSOLIDASIAN INTERIM/\nINTERIM CONSOLIDATED FINANCIAL STATEMENTS\n\nPADA 31 MARET 2024 DAN 31 DESEMBER 2023/\nAS AT 31 MARCH 2024 AND 31 DECEMBER 2023\n\nDAN/AND\nUNTUK PERIODE TIGA BULAN YANG BERAKHIR/\n\nFOR THE THREE-MONTH PERIODS ENDED\n31 MARET/MARCH 2024 DAN/AND 2023\n', 'member of ASTRA\n\nF © unrtep TRACTORS\n\nPT UNITED TRACTORS Tbk DAN ENTITAS ANAK\n\nPERNYATAAN DIREKSI\nTENTANG TANGGUNG JAWAB TERHADAP\nLAPORAN KEUANGAN\nKONSOLIDASIAN INTERIM\nPT UNITED TRACTORS Tbk\nDAN ENTITAS ANAK (“GRUP”)\nTANGGAL 31 MARET 2024 DAN 31 DESEMBER 2023\nSERTA PERIODE-PERIODE TIGA BULAN\nYANG BERAKHIR 31 MARET 2024 DAN 2023\n\nPT UNITED TRACTORS Tbk AND SUBSIDIARIES\n\nBOARD OF DIRECTORS’ STATEMENT\nREGARDING THE RESPONSIBILITY FOR\nTHE INTERIM CONSOLIDATED\nFINANCIAL STATEMENTS\nOF PT UNITED TRACTORS Tbk\nAND SUBSIDIARIES (THE “GROUP”)\n\nAS AT 31 MARCH 2024 AND 31 DECEMBER 2023\nAND FOR THE THREE-MONTH PERIODS\n

In [27]:
def element_extractor(pdf_path):
    """Partition a PDF with Unstructured and group the elements by page.

    Args:
        pdf_path: Path of the PDF file to partition.

    Returns:
        list[list]: ``result[i]`` holds the elements whose
        ``metadata.page_number`` is ``i + 1``, in the order they were
        returned by ``partition_pdf``. Pages without any element are
        represented by an empty list.
    """
    # Partitioning file
    raw_pdf_elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        extract_images_in_pdf=True,
        infer_table_structure=True,
        languages=["eng", "ind"]
    )

    # split raw_pdf_elements per page_number
    element_pages = []

    for element in raw_pdf_elements:
        page_number = element.metadata.page_number
        # Grow the outer list up to this page. A loop (not a single append) is
        # needed so that a gap in page numbers — a page with no elements —
        # gets an empty bucket instead of causing an IndexError below.
        while len(element_pages) < page_number:
            element_pages.append([])
        element_pages[page_number - 1].append(element)

    return element_pages

# Partition the PDF and display the result: a list with one list of elements per page.
extracted_elements = element_extractor(PDF_PATH)
extracted_elements

[[<unstructured.documents.elements.NarrativeText at 0x27152460990>,
  <unstructured.documents.elements.NarrativeText at 0x271524624d0>,
  <unstructured.documents.elements.NarrativeText at 0x27152461590>,
  <unstructured.documents.elements.Title at 0x27152462450>,
  <unstructured.documents.elements.NarrativeText at 0x27152462c90>],
 [<unstructured.documents.elements.Text at 0x27140f0d550>,
  <unstructured.documents.elements.Title at 0x27152463350>,
  <unstructured.documents.elements.NarrativeText at 0x2715243ebd0>,
  <unstructured.documents.elements.Title at 0x2715243c750>,
  <unstructured.documents.elements.Title at 0x2714cd03790>,
  <unstructured.documents.elements.Table at 0x2715243e390>,
  <unstructured.documents.elements.Text at 0x27152460050>,
  <unstructured.documents.elements.NarrativeText at 0x2715243f0d0>,
  <unstructured.documents.elements.NarrativeText at 0x27147933c90>,
  <unstructured.documents.elements.ListItem at 0x2715243f350>,
  <unstructured.documents.elements.Text at

In [28]:
# First step towards combining the Tesseract text with the structured data from Unstructured:
# take the raw OCR text of the first page so it can be inspected.
# TODO (from the original note): check for image position.
ocr_text = tesseract_text[0]
print(ocr_text)

PT UNITED TRACTORS Tbk
DAN ENTITAS ANAK/AND SUBSIDIARIES

LAPORAN KEUANGAN KONSOLIDASIAN INTERIM/
INTERIM CONSOLIDATED FINANCIAL STATEMENTS

PADA 31 MARET 2024 DAN 31 DESEMBER 2023/
AS AT 31 MARCH 2024 AND 31 DECEMBER 2023

DAN/AND
UNTUK PERIODE TIGA BULAN YANG BERAKHIR/

FOR THE THREE-MONTH PERIODS ENDED
31 MARET/MARCH 2024 DAN/AND 2023



In [29]:
# String form of every element on the first page, separated by blank lines.
element = "\n\n".join(map(str, extracted_elements[0]))
print(element)

PT UNITED TRACTORS Tbk DAN ENTITAS ANAK/AND SUBSIDIARIES

LAPORAN KEUANGAN KONSOLIDASIAN INTERIM/ INTERIM CONSOLIDATED FINANCIAL STATEMENTS

PADA 31 MARET 2024 DAN 31 DESEMBER 2023/ AS AT 31 MARCH 2024 AND 31 DECEMBER 2023

DAN/AND

UNTUK PERIODE TIGA BULAN YANG BERAKHIR/ FOR THE THREE-MONTH PERIODS ENDED 31 MARET/MARCH 2024 DAN/AND 2023


In [30]:
# Break the first page's OCR text into individual lines; blank lines show up as empty strings.
ocr_text_list = ocr_text.split("\n")
ocr_text_list

['PT UNITED TRACTORS Tbk',
 'DAN ENTITAS ANAK/AND SUBSIDIARIES',
 '',
 'LAPORAN KEUANGAN KONSOLIDASIAN INTERIM/',
 'INTERIM CONSOLIDATED FINANCIAL STATEMENTS',
 '',
 'PADA 31 MARET 2024 DAN 31 DESEMBER 2023/',
 'AS AT 31 MARCH 2024 AND 31 DECEMBER 2023',
 '',
 'DAN/AND',
 'UNTUK PERIODE TIGA BULAN YANG BERAKHIR/',
 '',
 'FOR THE THREE-MONTH PERIODS ENDED',
 '31 MARET/MARCH 2024 DAN/AND 2023',
 '']