In [1]:
import warnings
warnings.filterwarnings('ignore')

import json
import os

from IPython.display import JSON

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError
from unstructured.partition.html import partition_html

# Read the credential and endpoint from the environment so the API key never
# has to be typed into (and saved with) the notebook. Each lookup falls back
# to the literal that was previously hard-coded here when the variable is unset.
DLAI_API_KEY = os.environ.get("DLAI_API_KEY", "")
DLAI_API_URL = os.environ.get(
    "DLAI_API_URL",
    "https://api.unstructured.io/general/v0/general",
)

# Client object used by the partition request cell below (`s.general.partition`).
s = UnstructuredClient(
    api_key_auth=DLAI_API_KEY,
    server_url=DLAI_API_URL,
)

# PDF Normalization to JSON

In [2]:
from unstructured_client.models import operations, shared

# Load the sample PDF into memory, then wrap the bytes in the SDK's upload
# container. The path is also passed as the file name.
filename = "example/abstract.pdf"
with open(filename, "rb") as pdf_handle:
    pdf_bytes = pdf_handle.read()
files = shared.Files(content=pdf_bytes, file_name=filename)

# Build the partition parameters first, then the request that carries them.
partition_params = shared.PartitionParameters(
    files=files,
    strategy="hi_res",
    pdf_infer_table_structure=True,
    languages=["eng"],
)
req = operations.PartitionRequest(partition_parameters=partition_params)

# Send the request; on an SDK error just report it, otherwise preview the
# first three returned elements as indented JSON.
try:
    resp = s.general.partition(request=req)
except SDKError as e:
    print(e)
else:
    print(json.dumps(resp.elements[:3], indent=2))


INFO: HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"


[
  {
    "type": "Title",
    "element_id": "93389d1fbd128dc0af6d37c3fb12da5f",
    "text": "Abstract",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "abstract.pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "e07c885d79b1556d89711da9a09b43d1",
    "text": "Chain-of-Thought powered Retrieval-Augmented Generation for Cardiac Care",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "abstract.pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "2fd274a869019689158b1e8558506c2e",
    "text": "Objectives:",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "abstract.pdf"
    }
  }
]


In [3]:
# Print the plain Python repr of the full list of returned elements.
print(resp.elements)

[{'type': 'Title', 'element_id': '93389d1fbd128dc0af6d37c3fb12da5f', 'text': 'Abstract', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'abstract.pdf'}}, {'type': 'Title', 'element_id': 'e07c885d79b1556d89711da9a09b43d1', 'text': 'Chain-of-Thought powered Retrieval-Augmented Generation for Cardiac Care', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'abstract.pdf'}}, {'type': 'Title', 'element_id': '2fd274a869019689158b1e8558506c2e', 'text': 'Objectives:', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'abstract.pdf'}}, {'type': 'NarrativeText', 'element_id': '7b90367c8814655ace8569a18b0beac6', 'text': 'Rapid Evolution in large language models has proven significant advancement in Natural Language Processing in health care professions and clinical settings. However, leveraging these models and ensuring accuracy and interpret-ability remains

In [4]:
# Serialize every returned element to indented JSON text and hand it to
# IPython's JSON display object; as the cell's last expression it is rendered.
pdf_elements_json = json.dumps(resp.elements, indent=2)
JSON(pdf_elements_json)

<IPython.core.display.JSON object>

# PowerPoint Normalization to JSON

In [3]:
%pip install python-pptx

Collecting python-pptx
  Using cached python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting Pillow>=3.3.2 (from python-pptx)
  Downloading pillow-11.1.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.1 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Using cached XlsxWriter-3.2.2-py3-none-any.whl.metadata (2.8 kB)
Using cached python_pptx-1.0.2-py3-none-any.whl (472 kB)
Downloading pillow-11.1.0-cp312-cp312-manylinux_2_28_x86_64.whl (4.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hUsing cached XlsxWriter-3.2.2-py3-none-any.whl (165 kB)
Installing collected packages: XlsxWriter, Pillow, python-pptx
Successfully installed Pillow-11.1.0 XlsxWriter-3.2.2 python-pptx-1.0.2
Note: you may need to restart the kernel to use updated packages.


In [5]:
from unstructured.partition.pptx import partition_pptx

# Partition the sample slide deck with the local `unstructured` library.
# The result is kept under the name `element`, which the next cell iterates.
filename = "example/slides.pptx"
element = partition_pptx(filename=filename)


In [6]:
import json

from unstructured.staging.base import dict_to_elements, elements_to_json

# Turn each partitioned slide element into its dict form so the whole list
# can be serialized with json.dumps.
element_dict = [slide_el.to_dict() for slide_el in element]

# Render all converted elements through IPython's JSON display object.
pptx_elements_json = json.dumps(element_dict, indent=2)
JSON(pptx_elements_json)

<IPython.core.display.JSON object>

# HTML Normalization to JSON

In [9]:
# Partition the sample HTML document with the local `unstructured` library.
filename = "example/E06516_Winter_Sports_In_Switzerland_chocr.html"
elements = partition_html(filename=filename)

In [10]:
# Convert the partitioned HTML elements to dicts, then print only the slice
# at indices 11-14 as indented JSON to keep the sample output short.
element_dict = [html_el.to_dict() for html_el in elements]
sample_slice = element_dict[11:15]
example_output = json.dumps(sample_slice, indent=2)
print(example_output)

[
  {
    "type": "UncategorizedText",
    "element_id": "ceccb7dc920cfe73eb9ec9b57ff95e5f",
    "text": "I N S W I T Z E R L A N D",
    "metadata": {
      "last_modified": "2021-04-03T18:20:27",
      "languages": [
        "cat",
        "cym"
      ],
      "file_directory": "example",
      "filename": "E06516_Winter_Sports_In_Switzerland_chocr.html",
      "filetype": "text/html"
    }
  },
  {
    "type": "UncategorizedText",
    "element_id": "0c1cb38666b70a42b5d04ee95efc9782",
    "text": "B Y",
    "metadata": {
      "last_modified": "2021-04-03T18:20:27",
      "languages": [
        "cat",
        "cym"
      ],
      "file_directory": "example",
      "filename": "E06516_Winter_Sports_In_Switzerland_chocr.html",
      "filetype": "text/html"
    }
  },
  {
    "type": "UncategorizedText",
    "element_id": "ee7ac8da65ee149ee6e7bf3d60814226",
    "text": "E . F . B E N S O N",
    "metadata": {
      "last_modified": "2021-04-03T18:20:27",
      "languages": [
        "ca