# L5: Extracting Tables

In [1]:
# Warning control: install a blanket 'ignore' filter (no category or module
# restriction), so every warning raised after this cell runs is suppressed.
import warnings
warnings.filterwarnings('ignore')

In [2]:
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.staging.base import dict_to_elements

In [4]:
import os

# Never paste a real key into the notebook: read it from the environment so the
# secret is not saved (or committed) with the file. Falls back to the empty
# string when DLAI_API_KEY is unset, matching the previous placeholder value.
DLAI_API_KEY = os.environ.get("DLAI_API_KEY", "")
DLAI_API_URL = "https://api.unstructuredapp.io/general/v0/general"

# Shared API client used by the partition request further down.
s = UnstructuredClient(
    api_key_auth=DLAI_API_KEY,
    server_url=DLAI_API_URL,
)

## Example Document: Embedded Images and Tables

## Process the Document and Extract Tables

In [8]:
from unstructured_client.models import shared, operations
import json

filename = "example/table.pdf"

# Read the whole PDF into memory; the file handle is closed as soon as the
# bytes have been captured in the upload payload.
with open(filename, "rb") as f:
    files = shared.Files(
        content=f.read(),
        file_name=filename,
    )

# Build the partition request.
# NOTE(review): parameter meanings below are taken from their names; confirm
# against the Unstructured API parameter reference.
req = operations.PartitionRequest(
    partition_parameters=shared.PartitionParameters(
        files=files,
        strategy="hi_res",
        hi_res_model_name="yolox",
        # Empty list: no file type is excluded from table inference.
        skip_infer_table_types=[],
        pdf_infer_table_structure=True,
    )
)

try:
    resp = s.general.partition(request=req)
    # Convert the raw element dicts from the response into element objects.
    elements = dict_to_elements(resp.elements)
except SDKError as e:
    # Show the API error, then re-raise: swallowing it would leave `elements`
    # undefined (or stale from an earlier run) and make later cells fail in a
    # misleading way.
    print(e)
    raise

INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"


In [9]:
# Keep only the elements categorized as "Table"; the original order is preserved.
tables = list(filter(lambda element: element.category == "Table", elements))

In [11]:
# Display the `text` attribute of the first Table element
# (raises IndexError if `tables` is empty).
tables[0].text

'Number of Coils Number of Paperclips 5 3, 5, 4 10 7, 8, 6 15 11, 10, 12 20 15, 13, 14'

In [12]:
# Pull the HTML form of the first table from its metadata (`text_as_html`).
table_html = tables[0].metadata.text_as_html

In [13]:
from io import StringIO 
from lxml import etree

# Parser configured with remove_blank_text=True so the serializer is free to
# re-indent the markup when pretty-printing.
parser = etree.XMLParser(remove_blank_text=True)

# etree.parse expects a file-like object, so wrap the HTML string in StringIO.
file_obj = StringIO(table_html)
tree = etree.parse(file_obj, parser)

# Serialize to indented bytes, then decode to text for printing.
pretty_bytes = etree.tostring(tree, pretty_print=True)
print(pretty_bytes.decode())

<table>
  <thead>
    <tr>
      <th/>
      <th/>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td/>
      <td/>
    </tr>
    <tr>
      <td/>
      <td/>
    </tr>
    <tr>
      <td/>
      <td/>
    </tr>
    <tr>
      <td/>
      <td/>
    </tr>
  </tbody>
</table>



In [14]:
# Import from the public IPython.display API rather than the internal
# IPython.core.display module.
from IPython.display import HTML

# As the cell's last expression, this HTML object is rendered as rich output,
# showing the extracted table markup as an actual table.
HTML(table_html)

In [16]:
# from langchain_openai import ChatOpenAI
# from langchain_core.documents import Document
# from langchain.chains.summarize import load_summarize_chain

In [None]:
# Optional step, left disabled: summarize the extracted table HTML with a
# LangChain "stuff" summarization chain.
# NOTE(review): enabling this also requires the commented-out langchain imports
# in the previous cell, and presumably OpenAI credentials — confirm before use.
# llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-1106")
# chain = load_summarize_chain(llm, chain_type="stuff")
# chain.invoke([Document(page_content=table_html)])