In [None]:
import os
from dotenv import load_dotenv
from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.extract_pdf_job import ExtractPDFJob
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_pdf_params import ExtractPDFParams
from adobe.pdfservices.operation.pdfjobs.result.extract_pdf_result import ExtractPDFResult


# Load environment variables from the .env file at the relative path
# ../config/.env. CLIENT_ID and CLIENT_SECRET are read from the environment
# later in this notebook via os.getenv — presumably they are defined in that
# file; verify against the local configuration.
load_dotenv("../config/.env")

In [None]:
# Build the PDF Services client from credentials stored in the environment.
# os.getenv returns None for an unset variable, so check up front and fail
# with a message naming what is missing instead of passing None to the SDK.
missing_vars = [
    var_name
    for var_name in ("CLIENT_ID", "CLIENT_SECRET")
    if not os.getenv(var_name)
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

credentials = ServicePrincipalCredentials(
    client_id=os.environ["CLIENT_ID"],
    client_secret=os.environ["CLIENT_SECRET"],
)

# Service entry point used by the later cells (upload / submit / get_content).
pdf_services = PDFServices(credentials=credentials)

In [None]:
# Load the source PDF from disk as raw bytes; despite its name,
# `input_stream` holds the full file content returned by read().
with open("../data/ctn_20220111.pdf", mode="rb") as pdf_file:
    input_stream = pdf_file.read()

In [None]:
# Upload the PDF bytes read earlier and keep the returned asset as the job input.
input_asset = pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)

# Set the desired elements: only tables are requested from the extraction.
requested_elements = [ExtractElementType.TABLES]
extract_pdf_params = ExtractPDFParams(elements_to_extract=requested_elements)

# Assemble the extraction job from the uploaded asset and the parameters.
job_extract = ExtractPDFJob(input_asset=input_asset, extract_pdf_params=extract_pdf_params)

In [None]:
# Submit the extraction job; the returned location is then used to look up
# the job result, requested as an ExtractPDFResult.
location = pdf_services.submit(job_extract)
response = pdf_services.get_job_result(location, ExtractPDFResult)

# Get the resulting assets: resolve the result's resource, then fetch its content.
extraction_result = response.get_result()
asset_result = extraction_result.get_resource()
asset_stream = pdf_services.get_content(asset_result)

In [None]:
# Persist the extraction output next to the source PDF so the result can be
# reused from disk.
with open("../data/ctn_20220111.zip", mode="wb") as zip_file:
    zip_file.write(asset_stream.get_input_stream())