In [59]:
import fitz

def extract_section_by_title(pdf_path, keyword, end_keywords=None):
    """
    Mengambil teks dari bagian yang dimulai dengan `keyword`
    sampai menemukan salah satu `end_keywords`.
    """
    doc = fitz.open(pdf_path)
    extracted_text = []
    capture = False

    for page in doc:
        text = page.get_text("text")
        lines = text.splitlines()
        for line in lines:
            if keyword.lower() in line.lower():
                capture = True
            elif end_keywords and any(end.lower() in line.lower() for end in end_keywords):
                capture = False
            if capture:
                extracted_text.append(line)
    return "\n".join(extracted_text)

# contoh penggunaan:
aset_text = extract_section_by_title(
    "FinancialStatement-2024-Tahunan-EKAD.pdf",
    keyword="Aset",
    end_keywords=["Liabilitas", "Laporan laba rugi"]
)

print(aset_text)  # preview hasilnya

Jumlah Aset
Satuan
Mata
Uang
Persentase
(%)
1
Visko SDN
BHD
Indusri dan
Perdagangan
Malaysia
2009
Operasi
210.341.347.569,5
28
PENUH
IDR
77.95
2
PT.
Ekadharma
Mitra Niaga
Perdagangan
Jakarta
2006
Operasi
3.313.118.536
PENUH
IDR
99.0
[1000000] General information
Informasi umum
General information
31 December 2024
Informasi umum
General information
Nama entitas
Ekadharma International Tbk
Entity name
Penjelasan perubahan nama
dari akhir periode laporan
sebelumnya
Explanation of change in
name from the end of the
preceding reporting period
Kode entitas
EKAD
Entity code
Nomor identifikasi entitas
AA113
Entity identification number
Industri utama entitas
Umum / General
Entity main industry
Standar akutansi yang dipilih
PSAK
Selected accounting
standards
Sektor
B. Basic Materials
Sector
Subsektor
B1. Basic Materials
Subsector
Industri
B11. Chemicals
Industry
Subindustri
B113. Specialty Chemicals
Subindustry
Informasi pemegang saham
pengendali
National Corporation
Controlling shareholder
inf

In [1]:
import pdfplumber
import camelot
import pandas as pd

path = "FinancialStatement-2024-Tahunan-EKAD.pdf"

tables_all = []

with pdfplumber.open(path) as pdf:
    for i, page in enumerate(pdf.pages):
        # Ambil teks referensi (untuk konfirmasi konteks halaman)
        text = page.extract_text() or ""
        
        # Gunakan Camelot (lebih baik deteksi garis dan cell merge)
        tables = camelot.read_pdf(path, flavor='stream', pages=str(i+1), strip_text='\n')
        
        for t in tables:
            df = t.df.copy()
            
            # Deteksi header yang sering rusak
            header_row = df.iloc[0].fillna("").tolist()
            if len(set(header_row)) == 1 and header_row[0] == "":
                # Jika baris pertama kosong, ambil baris kedua
                header_row = df.iloc[1].fillna("").tolist()
                df.columns = header_row
                df = df[2:]
            else:
                df.columns = header_row
                df = df[1:]

            tables_all.append({
                "page": i+1,
                "context": text[:200],
                "table": df.to_dict(orient="records")
            })

# Simpan hasil intermediate
import json
with open("output.json", "w", encoding="utf-8") as f:
    json.dump(tables_all, f, indent=2, ensure_ascii=False)


  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")
  "table": df.to_dict(orient="records")


In [5]:
from pdfminer.high_level import extract_text

pdf_path = "FinancialStatement-2024-Tahunan-EKAD.pdf"
text = extract_text(pdf_path)
print(text[5000:10000])  # tampilkan sebagian teks

using current and
non-current - General Industry

Laporan posisi keuangan

Statement of financial position

31 December 2024

31 December 2023

Laporan posisi keuangan

Aset

Aset lancar

Kas dan setara kas

89,822,670,664

126,639,835,328

Statement of financial position

Assets

Current assets

Cash and cash
equivalents
Notes receivable
Short-term investments

Wesel tagih
Investasi jangka
pendek
Dana yang dibatasi
penggunaannya
lancar
Aset keuangan
lancar

Aset keuangan
lancar yang diukur
pada nilai wajar
melalui laba rugi
Aset keuangan
lancar nilai wajar
melalui
pendapatan
komprehensif
lainnya
Aset keuangan
biaya perolehan
diamortisasi lancar
Aset keuangan
lancar lainnya

Aset keuangan
derivatif lancar
Piutang usaha

Piutang usaha
pihak ketiga
Piutang usaha
pihak berelasi

Piutang sewa
pembiayaan lancar
Piutang retensi

Piutang retensi
pihak ketiga
Piutang retensi
pihak berelasi

Tagihan bruto
pemberi kerja

Tagihan bruto
pemberi kerja
pihak ketiga
Tagihan bruto
pemberi kerja
pihak 

In [7]:
import re
text = extract_text(pdf_path)

# Cari blok "Laporan posisi keuangan"
pattern = r"Laporan posisi keuangan(.*?)Laporan laba rugi"
match = re.search(pattern, text, re.S)
if match:
    balance_text = match.group(1)
    print(balance_text[5000:10000])



omer
receivables

Non-current customer
receivables third
parties
Non-current customer
receivables related
parties

Other non-current
receivables

Other non-current
receivables third
parties
Other non-current
receivables related
parties

Investments accounted
for using equity method

Investments in joint
ventures and
associates

Investments in joint
ventures

Investments in
associates

Non-current guarantees
Non-current advances

Non-current
advances on
investments
Non-current
advances on
purchase of property,
plant and equipment
Other non-current
advances

Non-current financial
assets

Non-current financial
assets at fair value

789,200,996

774,561,067

diukur pada nilai
wajar melalui laba
rugi
Aset keuangan
tidak lancar nilai
wajar melalui
pendapatan
komprehensif
lainnya
Aset keuangan
tidak lancar biaya
perolehan
diamortisasi
Aset keuangan
tidak lancar lainnya

Aset keuangan
derivatif tidak lancar
Biaya dibayar dimuka
tidak lancar
Pajak dibayar dimuka
tidak lancar
Aset pajak tangguh

In [9]:
items = re.findall(r"([A-Za-z\s]+)\s+([\d,().-]+)", balance_text)
for name, value in items:
    print(f"{name.strip()} = {value}")


Statement of financial position = 31
December = 2024
 = 31
December = 2023
Laporan posisi keuangan

Aset

Aset lancar

Kas dan setara kas = 89,822,670,664
 = 126,639,835,328
term investments

Wesel tagih
Investasi jangka
pendek
Dana yang dibatasi
penggunaannya
lancar
Aset keuangan
lancar

Aset keuangan
lancar yang diukur
pada nilai wajar
melalui laba rugi
Aset keuangan
lancar nilai wajar
melalui
pendapatan
komprehensif
lainnya
Aset keuangan
biaya perolehan
diamortisasi lancar
Aset keuangan
lancar lainnya

Aset keuangan
derivatif lancar
Piutang usaha

Piutang usaha
pihak ketiga
Piutang usaha
pihak berelasi

Piutang sewa
pembiayaan lancar
Piutang retensi

Piutang retensi
pihak ketiga
Piutang retensi
pihak berelasi

Tagihan bruto
pemberi kerja

Tagihan bruto
pemberi kerja
pihak ketiga
Tagihan bruto
pemberi kerja
pihak berelasi

Piutang subsidi
Piutang nasabah = 8,108,197,959
 = 42,369,138,649
Current restricted funds = 77,549,504,999
 = 268,978,504,000
 = 313,690,505,999
 = 41,607,602,400

In [8]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

for page_layout in extract_pages(pdf_path):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            print(element.get_text())


Nomor Surat

Nama Emiten

Kode Emiten

Perihal

LP-EI/004/III/2025

Ekadharma International Tbk

EKAD

Penyampaian Laporan Keuangan Tahunan

Perseroan dengan ini menyampaikan laporan keuangan untuk Tahun  Bulan yang berakhir pada 31/12/2024 dengan ikhtisar sebagai berikut : 

Informasi mengenai anak perusahaan Perseroan sebagai berikut : 

No

Nama

1 Visko SDN

BHD

2 PT.

Ekadharma
Mitra Niaga

Kegiatan
Usaha

Indusri dan
Perdagangan

Lokasi

Malaysia

Perdagangan

Jakarta

Tahun
Komersil

2009

2006

Status
Operasi

Operasi

Jumlah Aset

Satuan

210.341.347.569,5
28

PENUH

Mata
Uang

IDR

Persentase
(%)

77.95

Operasi

3.313.118.536

PENUH

IDR

99.0

Dokumen ini merupakan dokumen resmi Ekadharma International Tbk yang tidak memerlukan tanda tangan karena dihasilkan secara elektronik.

Ekadharma International Tbk bertanggung jawab penuh atas informasi tertera di dalam dokumen ini. 

  

  

  

  

  

  

[1000000] General information

Informasi umum

Informasi umum
Nama entitas


In [1]:
import tabula
tables = tabula.read_pdf("FinancialStatement-2024-Tahunan-EKAD.pdf", pages='all')
print(tables[3])


Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'
Error from tabula-java:
The operation couldn’t be completed. Unable to locate a Java Runtime.
Please visit http://www.java.com for information on installing Java.





CalledProcessError: Command '['java', '-Djava.awt.headless=true', '-Dfile.encoding=UTF8', '-jar', '/Users/kasyfilalbar/project/fineksi ocr testing/.venv/lib/python3.12/site-packages/tabula/tabula-1.0.5-jar-with-dependencies.jar', '--pages', 'all', '--guess', '--format', 'JSON', 'FinancialStatement-2024-Tahunan-EKAD.pdf']' returned non-zero exit status 1.

In [None]:
import os
import argparse
import subprocess
import logging
from multiprocessing import Pool

import camelot
from PyPDF2 import PdfFileReader
from camelot.core import TableList

def total_pages(pdf):
    with open(pdf, 'rb') as file:
        pdf_object = PdfFileReader(file)
        pages = ','.join([str(i) for i in range(pdf_object.getNumPages())])
    return pages

def extract_tables(pdf, pattern):
    try:
        cmd = f"pdfgrep -Pn '{pattern}' {pdf} | awk -F\":\" '$0~\":\"{{print $1}}' | tr '\n' ','"
        pages = subprocess.check_output(cmd, shell=True).decode("utf-8")
        if not pages:
            logging.warning(f"No matching pages found in {pdf}")
            return


        tables = camelot.read_pdf(pdf, flavor='stream', pages=pages, edge_tol=100)
        filtered = []
        for index, table in enumerate(tables):
            whitespace = tables[index].parsing_report.get('whitespace')
            if whitespace <= 25:
                filtered.append(tables[index])
        filtered_tables = TableList(filtered)
        filtered_tables.export(f"{os.path.splitext(pdf)[0]}.xlsx", f='excel', compress=True)
        logging.info(f"Processed {pdf}")
    except Exception as e:
        logging.error(f"Error processing {pdf}: {str(e)}")

extract_tables("FinancialStatement-2024-Tahunan-EKAD.pdf", "Aset")

/bin/sh: pdfgrep: command not found


In [None]:


raw_text = extract_text_from_pdf("FinancialStatement-2024-Tahunan-EKAD.pdf")

In [48]:
import re

import PyPDF2
def extract_text_from_pdf(pdf_path) -> str:
    """Extract raw text from PDF"""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num, page in enumerate(pdf_reader.pages):
            text += f"\n--- PAGE {page_num + 1} ---\n"
            text += page.extract_text()
    raw_text = text
    return raw_text

def _extract_section(raw_text, pattern: str) -> str:
        """Extract text for a specific section using regex"""
        match = re.search(pattern, raw_text, re.IGNORECASE)
        if match:
            start = match.start()
            # Find next section marker or end of document
            next_section = re.search(
                r'\n\[1[0-9]{6}\]|\nNotes to|---',
                raw_text[start+100:]
            )
            end = next_section.start() + start + 1000 if next_section else len(raw_text)
            return raw_text[start:end]
        return ""

def segment_financial_sections(raw_text: str) -> dict:

    """Segment PDF text by financial statement type"""
    sections = {
        'Assets': _extract_section(raw_text, 'Assets'),
        'liabilities': _extract_section(raw_text, 'Liabilities'),
        'equitas': _extract_section(raw_text, 'Equity'),
        'IncomeStatement': _extract_section(raw_text, 'Statement of profit or loss'),
        'CashFlow': _extract_section(raw_text, "Statement of cash flows")
    }
    return {k: v for k, v in sections.items() if v}

In [49]:
raw_text = extract_text_from_pdf("FinancialStatement-2024-Tahunan-EKAD.pdf")

In [50]:
raw_text

"\n--- PAGE 1 ---\n  \nPerseroan dengan ini menyampaikan laporan keuangan untuk Tahun  Bulan yang berakhir pada 31/12/2024 dengan ikhtisar sebagai berikut : \n  \nInformasi mengenai anak perusahaan Perseroan sebagai berikut : \n  \n  \n  \n  \nDokumen ini merupakan dokumen resmi Ekadharma International Tbk yang tidak memerlukan tanda tangan karena dihasilkan secara elektronik.\nEkadharma International Tbk bertanggung jawab penuh atas informasi tertera di dalam dokumen ini. Nomor Surat LP-EI/004/III/2025\nNama Emiten Ekadharma International Tbk\nKode Emiten EKAD\nPerihal Penyampaian Laporan Keuangan Tahunan\nNoNama Kegiatan\nUsahaLokasi Tahun\nKomersilStatus\nOperasiJumlah Aset Satuan Mata\nUangPersentase\n(%)\n1Visko SDN\nBHDIndusri dan\nPerdaganganMalaysia 2009 Operasi 210.341.347.569,5\n28PENUH IDR 77.95\n2PT.\nEkadharma\nMitra NiagaPerdagangan Jakarta 2006 Operasi 3.313.118.536 PENUH IDR 99.0\n--- PAGE 2 ---\n[1000000] General information\nInformasi umum General information\n31 Dece

In [51]:
segment_results = segment_financial_sections(raw_text)

In [52]:
segment_results.get('Assets', '')

'Assets\nAset lancar Current assets\nKas dan setara kas 89,822,670,664 126,639,835,328 Cash and cash\nequivalents\nWesel tagih Notes receivable\nInvestasi jangka\npendekShort-term investments\nDana yang dibatasi\npenggunaannya\nlancar8,108,197,959 42,369,138,649 Current restricted funds\nAset keuangan\nlancarCurrent financial\nassets\nAset keuangan\nlancar yang diukur\npada nilai wajar\nmelalui laba rugi77,549,504,999 Current financial\nassets at fair value\nthrough profit or loss\nAset keuangan\nlancar nilai wajar\nmelalui\npendapatan\nkomprehensif\nlainnya268,978,504,000 313,690,505,999 Current financial\nassets fair value\nthrough other\ncomprehensive\nincome\nAset keuangan\nbiaya perolehan\ndiamortisasi lancarCurrent financial\nassets amortized cost\ninvestments\nAset keuangan\nlancar lainnya41,607,602,400 Other current\nfinancial assets\nAset keuangan\nderivatif lancarCurrent derivative\nfinancial assets\nPiutang usaha Trade receivables\nPiutang usaha\npihak ketiga71,776,497,049 6

In [None]:
import requests
key = "Assets"
prompt = f"""
    Find the financial keys in this {key} segment: {segment_results.get(key, '')},
    There will be 2 values each financial keys, the first is the current year, the second is the previous year,
    You should return the values with definition it is current year or previous year,
    return your response in ONLY valid JSON format with key names and values:
    {{key: {{ "current_year": value2, "previous_year": value1 }}}}
"""

data = {
    "model": "llama3.1:8b",
    "prompt": prompt,
    "stream": False,
    "temperature": 0.1
}

response = requests.post("http://localhost:11434/api/generate", json=data).json()
json_match = re.search(r'\{.*\}', response['response'], re.DOTALL)
print(json_match)


<re.Match object; span=(57, 3387), match='{\n  "Cash and cash equivalents": {\n    "current>


In [96]:
import json
json_match = re.search(r'\{.*\}', response['response'], re.DOTALL)
print(json.loads(json_match.group()))

{'Cash and cash equivalents': {'current_year': 126639835328, 'previous_year': 89822670664}, 'Notes receivable': {'current_year': None, 'previous_year': None}, 'Short-term investments': {'current_year': 42369138649, 'previous_year': 8108197959}, 'Current restricted funds': {'current_year': 426391386649, 'previous_year': 8110197959}, 'Current financial assets': {'current_year': 77549504999, 'previous_year': None}, 'Current financial assets at fair value through profit or loss': {'current_year': 313690505999, 'previous_year': 775449504999}, 'Current financial assets fair value through other comprehensive income': {'current_year': 313690505999, 'previous_year': 268978504000}, 'Current financial assets amortized cost investments': {'current_year': None, 'previous_year': None}, 'Other current financial assets': {'current_year': 31607560400, 'previous_year': 41607602400}, 'Current derivative financial assets': {'current_year': None, 'previous_year': None}, 'Trade receivables': {'current_year'