# L4: Preprocessing PDFs and Images

In [2]:
import warnings

# Silence every warning category for the rest of the notebook session.
warnings.simplefilter("ignore")

In [3]:
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError
from unstructured.partition.html import partition_html
from unstructured.partition.auto import partition
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements

In [33]:
import os

# Read the API credentials from the environment so the secret never has to be
# typed into (or committed with) the notebook source. When the variables are
# unset, the values fall back to the ones originally hardcoded here.
DLAI_API_KEY = os.environ.get("DLAI_API_KEY", "")
DLAI_API_URL = os.environ.get(
    "DLAI_API_URL",
    "https://api.unstructuredapp.io/general/v0/general",
)

# `s` is the client handle that later cells use to call the partition endpoint.
s = UnstructuredClient(
    api_key_auth=DLAI_API_KEY,
    server_url=DLAI_API_URL,
)

## Example Document: News in PDF and HTML

In [34]:
# Partition the HTML copy of the article into a list of document elements.
# NOTE(review): the recorded preview output contains mojibake ("El NiÃ±o"),
# which suggests the text is decoded with the wrong charset somewhere in this
# step -- consider passing an explicit encoding to partition_html; TODO confirm.
file_name = "example/el_nino.html"
html_elements = partition_html(
    filename=file_name,
)

In [35]:
# Preview the first ten HTML elements, one "CATEGORY: text" line each.
for element in html_elements[:10]:
    label = element.category.upper()
    print(f"{label}: {element.text}")

UNCATEGORIZEDTEXT: CNN 1/30/2024
TITLE: A potent pair of atmospheric rivers will drench California as El NiÃ±o makes its first mark on winter
UNCATEGORIZEDTEXT: By Mary Gilbert, CNN Meteorologist
UNCATEGORIZEDTEXT: Updated: 3:49 PM EST, Tue January 30, 2024
UNCATEGORIZEDTEXT: Source: CNN
NARRATIVETEXT: A potent pair of atmospheric river-fueled storms are about to unleash a windy and incredibly wet week in California in what is the first clear sign of the influence El NiÃ±o was expected to have on the state this winter.
NARRATIVETEXT: The soaking storms will raise the flood threat across much of California into next week, but it appears the wet pattern is likely to continue well into February as a more typical El NiÃ±o pattern kicks into gear.
NARRATIVETEXT: El NiÃ±o â aÂ natural phenomenonÂ in the tropical Pacific that influences weather around the globe â causes changes in the jet stream that can point storms directly at California. Storms can also tap into an extra-potent supply 

## Process the Document with Document Layout Detection

In [36]:
from unstructured_client.models import shared, operations
import json

# Load the PDF copy of the article as raw bytes.
filename = "example/el_nino.pdf"
with open(filename, "rb") as pdf_file:
    pdf_bytes = pdf_file.read()

# Wrap the bytes and the requested partitioning options in the SDK request
# models expected by the client.
files = shared.Files(content=pdf_bytes, file_name=filename)
partition_parameters = shared.PartitionParameters(
    files=files,
    strategy="hi_res",
    pdf_infer_table_structure=True,
    languages=["eng"],
)
req = operations.PartitionRequest(partition_parameters=partition_parameters)

try:
    resp = s.general.partition(request=req)
except SDKError as err:
    # API failures are reported rather than raised; `resp` is not assigned by
    # this cell in that case, so later cells that read it will not work.
    print(err)
else:
    # Show the first three returned elements as pretty-printed JSON.
    print(json.dumps(resp.elements[:3], indent=2))

INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"


[
  {
    "type": "Header",
    "element_id": "3ce486b1d8985be1007a312b0e6e7f8e",
    "text": "1/30/24, 5:11 PM",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "el_nino.pdf"
    }
  },
  {
    "type": "Header",
    "element_id": "80fadd89a087f74d635be08917fcddd1",
    "text": "CNN 1/30/2024",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "el_nino.pdf"
    }
  },
  {
    "type": "Header",
    "element_id": "f879bdbd2f8e77d8da7f08930be3ca0e",
    "text": "Pineapple express: California to get drenched by back-to-back storms fueling a serious \ufb02ood threat | CNN",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "el_nino.pdf"
    }
  }
]


In [39]:
# Convert the dictionaries returned by the partition API into element objects
# so they expose the same `.category` / `.text` attributes as the HTML elements.
dld_elements = dict_to_elements(resp.elements)

# Print every element as a "CATEGORY: text" line.
for element in dld_elements:
    label = element.category.upper()
    print(f"{label}: {element.text}")

HEADER: 1/30/24, 5:11 PM
HEADER: CNN 1/30/2024
HEADER: Pineapple express: California to get drenched by back-to-back storms fueling a serious ﬂood threat | CNN
TITLE: A potent pair of atmospheric rivers will drench California as El Niño makes its ﬁrst mark on winter
NARRATIVETEXT: By Mary Gilbert, CNN Meteorologist
NARRATIVETEXT: Updated: 3:49 PM EST, Tue January 30, 2024
NARRATIVETEXT: Source: CNN
NARRATIVETEXT: A potent pair of atmospheric river-fueled storms are about to unleash a windy and incredibly wet week in California in what is the ﬁrst clear sign of the inﬂuence El Niño was expected to have on the state this winter.
NARRATIVETEXT: The soaking storms will raise the ﬂood threat across much of California into next week, but it appears the wet pattern is likely to continue well into February as a more typical El Niño pattern kicks into gear.
NARRATIVETEXT: El Niño – a natural phenomenon in the tropical Paciﬁc that inﬂuences weather around the globe – causes changes in the jet st

In [40]:
import collections

In [41]:
# Number of elements produced by partitioning the HTML file.
len(html_elements)

31

In [42]:
# Tally the HTML elements per category and list the categories from most to
# least frequent.
html_categories = [element.category for element in html_elements]
html_category_counts = collections.Counter(html_categories)
html_category_counts.most_common()

[('NarrativeText', 23), ('UncategorizedText', 6), ('Title', 2)]

In [43]:
# Number of elements obtained from the partition API response for the PDF.
len(dld_elements)

39

In [44]:
# Tally the PDF (document layout detection) elements per category and list the
# categories from most to least frequent.
dld_categories = [element.category for element in dld_elements]
dld_category_counts = collections.Counter(dld_categories)
dld_category_counts.most_common()

[('NarrativeText', 27),
 ('Header', 5),
 ('Title', 3),
 ('Footer', 2),
 ('PageNumber', 2)]