## Unstructured Library: Working with PDFs

In [1]:
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json
from unstructured.chunking.title import chunk_by_title
from collections import Counter
import time
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


### "Fast" strategy - Extract text directly if available

In [2]:
# Relative path of the PDF that the partitioning cells in this notebook read.
pdf_file_path = "./travel_health_insurance_policy.pdf"

In [7]:
# Partition the PDF into a List[Element] covering the pages of the document.
# The strategy is pinned to "fast" so the cell matches this section's heading
# instead of relying on the library's "auto" default.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
elements = partition_pdf(pdf_file_path, strategy="fast")
elapsed_time = time.perf_counter() - start_time

elements

[<unstructured.documents.elements.Text at 0x26934668dc0>,
 <unstructured.documents.elements.NarrativeText at 0x26934668d30>,
 <unstructured.documents.elements.Text at 0x2693466ffd0>,
 <unstructured.documents.elements.Title at 0x26934538040>,
 <unstructured.documents.elements.Title at 0x26934540be0>,
 <unstructured.documents.elements.Title at 0x2693454eee0>,
 <unstructured.documents.elements.Title at 0x2693454e2b0>,
 <unstructured.documents.elements.Title at 0x269342a5730>,
 <unstructured.documents.elements.Text at 0x26934525610>,
 <unstructured.documents.elements.Text at 0x26934525ac0>,
 <unstructured.documents.elements.ListItem at 0x2693452a1c0>,
 <unstructured.documents.elements.Title at 0x269342a5880>,
 <unstructured.documents.elements.ListItem at 0x26934303a60>,
 <unstructured.documents.elements.Text at 0x2693429f370>,
 <unstructured.documents.elements.ListItem at 0x2693428aa30>,
 <unstructured.documents.elements.Text at 0x26934258f10>,
 <unstructured.documents.elements.Text at 0x2

Mapped Element types:
* "UncategorizedText" -> "Text"

In [9]:
# Report the duration (in seconds) stored in `elapsed_time` by the timed partition call.
print("Elapsed time: ", elapsed_time)

Elapsed time:  4.2170631885528564


In [16]:
# Report how many elements the current `elements` list holds.
print("Number of elements: ", len(elements))

Number of elements:  289


In [15]:
# Show the first two elements in detail: text, id, metadata, and the
# dictionary form returned by to_dict().
preview = elements[:2]
for element in preview:
    # print() applies str() itself, so the element can be passed directly.
    print("Element Text: ", element)
    print("Element ID: ", element.id)
    print("Element Metadata: ", element.metadata, "\n\n")
    print("Element as Dictionary: ")
    pprint(element.to_dict())
    print("\n ------------------------------------- \n")

Element Text:  Table of Contents SUMMARY OF COVER ...................................................................................................2 GENERAL PROCEDURE – HOW TO FILE A CLAIM .............................................................4 INTRODUCTORY CLAUSES .............................................................................................6 GENERAL DEFINITIONS..................................................................................................8 COVERAGE..................................................................................................................12 SECTION 1 – PERSONAL ACCIDENT BENEFITS ...............................................................12 SECTION 2 – MEDICAL & RELATED BENEFITS ................................................................14 EMERGENCY MEDICAL EXPENSES (ACCIDENT & SICKNESS)................................14 EMERGENCY MEDICAL EVACUATION................................................................

In [13]:
# Tally how many elements of each class are in `elements`.
# Note: despite its name, `unique_elements` holds one type label per element
# (duplicates included); Counter does the de-duplication.
unique_elements = [str(kind) for kind in map(type, elements)]
counts = Counter(unique_elements)
pprint(counts)

Counter({"<class 'unstructured.documents.elements.NarrativeText'>": 164,
         "<class 'unstructured.documents.elements.Text'>": 53,
         "<class 'unstructured.documents.elements.Title'>": 52,
         "<class 'unstructured.documents.elements.ListItem'>": 20})


In [14]:
# Write the current `elements` list out as JSON to the given file.
elements_to_json(elements, filename="./pdf_elements_unstructured_fast.json")

### "hi_res" strategy - Use OCR & layout detection to extract text

In [None]:
# Define parameters for Unstructured's library.
# Every name below is assigned on every code path, so the cell can be re-run
# with a different `strategy` without leaving stale or undefined variables.

## `include_page_breaks`
# include page breaks (default is False)
include_page_breaks = True

## `strategy`
# The strategy to use for partitioning the PDF. Valid strategies are "hi_res", "ocr_only", and "fast".
# When using the "hi_res" strategy, the function uses a layout detection model to identify document elements.
# "hi_res" is used for analyzing PDFs and extracting table structure (default is "auto")
strategy = "hi_res"

## `infer_table_structure`
# Only applicable if `strategy=hi_res`.
# If True, any Table elements that are extracted will also have a metadata field named "text_as_html" where the table's text content is rendered into an html string.
# I.e., rows and cells are preserved.
# Whether True or False, the "text" field is always present in any Table element and is the text content of the table (no structure).
infer_table_structure = strategy == "hi_res"

## `extract_element_types`
# Get images of tables (only requested when table structure is being inferred)
extract_element_types = ["Table"] if infer_table_structure else None

## `max_characters`
# The maximum number of characters to include in a partition (document element)
# If None is passed, no maximum is applied.
# Only applies to the "ocr_only" strategy (default is 1500)
# Previously this name was left undefined for "ocr_only"; it now falls back to
# the documented default in that case.
max_characters = 1500 if strategy == "ocr_only" else None

## `languages`
# The languages to use for the Tesseract agent.
# To use a language, you'll first need to install the appropriate Tesseract language pack.
# This is a list with one language code per entry, e.g. ["eng", "por"] for
# English plus Portuguese (default is "eng").
# NOTE(review): the "eng+por" string form appears to belong to the older
# `ocr_languages` parameter - confirm against the installed version.
languages = ["eng"]

## `model_name`
# @requires_dependencies("unstructured_inference")
# yolox: best model for table extraction. Other options are yolox_quantized, detectron2_onnx and chipper depending on file layout
# source: https://unstructured-io.github.io/unstructured/best_practices/models.html
model_name = "yolox"

In [17]:
# Partition with the "hi_res" strategy, which uses a layout detection model
# together with Optical Character Recognition (OCR) to identify elements.
# `infer_table_structure=True` is what actually preserves table structure:
# each extracted Table element also gets a "text_as_html" metadata field in
# which rows and cells are kept. Without it only the flat table text is
# available.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
elements = partition_pdf(
    pdf_file_path,
    strategy="hi_res",
    infer_table_structure=True,
)
elapsed_time = time.perf_counter() - start_time

elements

[<unstructured.documents.elements.ListItem at 0x26944112a30>,
 <unstructured.documents.elements.ListItem at 0x269441122e0>,
 <unstructured.documents.elements.ListItem at 0x26944112a90>,
 <unstructured.documents.elements.ListItem at 0x26944112b20>,
 <unstructured.documents.elements.ListItem at 0x26944112bb0>,
 <unstructured.documents.elements.ListItem at 0x26944112c40>,
 <unstructured.documents.elements.ListItem at 0x26944112cd0>,
 <unstructured.documents.elements.ListItem at 0x26944112d60>,
 <unstructured.documents.elements.ListItem at 0x26944112df0>,
 <unstructured.documents.elements.ListItem at 0x26944112e80>,
 <unstructured.documents.elements.ListItem at 0x26944112f10>,
 <unstructured.documents.elements.ListItem at 0x26944112fa0>,
 <unstructured.documents.elements.ListItem at 0x2694411b070>,
 <unstructured.documents.elements.ListItem at 0x2694411b100>,
 <unstructured.documents.elements.ListItem at 0x2694411b190>,
 <unstructured.documents.elements.ListItem at 0x2694411b220>,
 <unstru

In [22]:
# Report the duration (in seconds) stored in `elapsed_time` and the size of `elements`.
print("Elapsed time: ", elapsed_time)
print("Number of elements: ", len(elements))

Elapsed time:  419.55390763282776
Number of elements:  337


In [20]:
# Tally how many elements of each class are in `elements`.
# Note: despite its name, `unique_elements` holds one type label per element
# (duplicates included); Counter does the de-duplication.
unique_elements = [str(kind) for kind in map(type, elements)]
counts = Counter(unique_elements)
pprint(counts)

Counter({"<class 'unstructured.documents.elements.Text'>": 219,
         "<class 'unstructured.documents.elements.ListItem'>": 49,
         "<class 'unstructured.documents.elements.NarrativeText'>": 47,
         "<class 'unstructured.documents.elements.Title'>": 19,
         "<class 'unstructured.documents.elements.Image'>": 2,
         "<class 'unstructured.documents.elements.Table'>": 1})


In [21]:
# Write the current `elements` list out as JSON to the given file.
elements_to_json(elements, filename="./pdf_elements_unstructured_hi_res.json")

## Chunking

### Chunking with "Fast" strategy

##### Chunking inside `partition_pdf()`

In [4]:
# Partition with the "fast" strategy and chunk by title within the same call.
# The chunking settings are grouped in one mapping and unpacked into the call.
chunking_options = {
    # Post-processing: aggregate text once a title has been found
    "chunking_strategy": "by_title",
    # Require a maximum chunk size of 4000 characters
    "max_characters": 4000,
    # Attempt to start a new chunk at 3800 characters
    "new_after_n_chars": 3800,
    # Attempt to keep chunks above 2000 characters
    "combine_text_under_n_chars": 2000,
}
elements = partition_pdf(
    filename=pdf_file_path,
    strategy="fast",
    # Table structure inference only applies to "hi_res", so it stays off here.
    infer_table_structure=False,
    extract_images_in_pdf=False,
    **chunking_options
)

elements

[<unstructured.documents.elements.CompositeElement at 0x1f3d20ae790>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d20ae8e0>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f1e50>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f1c10>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f1dc0>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f1820>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f16a0>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f1a90>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f1fd0>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f1910>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f12b0>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f10d0>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f1310>,
 <unstructured.documents.elements.CompositeElement at 0x1f3d05f1280>,
 <unstructured.docum

In [5]:
# Report how many chunks the current `elements` list holds.
print("Number of elements: ", len(elements))

Number of elements:  17


In [11]:
# Tally the chunk classes, then summarise chunk sizes (length of each chunk's
# string form, in characters).
# Note: `unique_elements` holds one type label per chunk, duplicates included.
unique_elements = [str(kind) for kind in map(type, elements)]
counts = Counter(unique_elements)
pprint(counts)

elements_lengths = list(map(len, map(str, elements)))
print("Elements Lengths: ", elements_lengths)
print("Min Length of an Element: ", min(elements_lengths))
print("Max Length of an Element: ", max(elements_lengths))

Counter({"<class 'unstructured.documents.elements.CompositeElement'>": 17})
Elements Lengths:  [4000, 616, 2045, 2060, 2300, 3816, 3907, 3898, 2009, 1020, 3022, 2609, 2515, 3440, 2211, 2115, 2139]
Min Length of an Element:  616
Max Length of an Element:  4000


In [9]:
# Show the first four chunks in detail: text, id, metadata, and the
# dictionary form returned by to_dict().
preview = elements[:4]
for element in preview:
    # print() applies str() itself, so the element can be passed directly.
    print("Element Text: ", element)
    print("Element ID: ", element.id)
    print("Element Metadata: ", element.metadata, "\n\n")
    print("Element as Dictionary: ")
    pprint(element.to_dict())
    print("\n ------------------------------------- \n")

Element Text:  Table of Contents SUMMARY OF COVER ...................................................................................................2 GENERAL PROCEDURE – HOW TO FILE A CLAIM .............................................................4 INTRODUCTORY CLAUSES .............................................................................................6 GENERAL DEFINITIONS..................................................................................................8 COVERAGE..................................................................................................................12 SECTION 1 – PERSONAL ACCIDENT BENEFITS ...............................................................12 SECTION 2 – MEDICAL & RELATED BENEFITS ................................................................14 EMERGENCY MEDICAL EXPENSES (ACCIDENT & SICKNESS)................................14 EMERGENCY MEDICAL EVACUATION................................................................

In [7]:
# Write the current (chunked) `elements` list out as JSON to the given file.
elements_to_json(elements, filename="./pdf_elements_unstructured_fast_chunked.json")

##### Chunking using `chunk_by_title()`

In [14]:
# Re-partition without chunking. The strategy is pinned to "fast" (rather than
# left to the library's "auto" default) so the separate chunk_by_title() step
# below works on the same kind of input as the in-call chunking example.
elements = partition_pdf(pdf_file_path, strategy="fast")

In [15]:
# Chunk the already-partitioned elements by title as a separate step.
# The chunking settings are grouped in one mapping and unpacked into the call.
chunking_options = {
    # Require a maximum chunk size of 4000 characters
    "max_characters": 4000,
    # Attempt to start a new chunk at 3800 characters
    "new_after_n_chars": 3800,
    # Attempt to keep chunks above 2000 characters
    "combine_text_under_n_chars": 2000,
}
chunks = chunk_by_title(elements=elements, **chunking_options)

In [16]:
# Tally the chunk classes, then summarise chunk sizes (length of each chunk's
# string form, in characters).
# Note: `unique_elements` holds one type label per chunk, duplicates included.
unique_elements = [str(kind) for kind in map(type, chunks)]
counts = Counter(unique_elements)
pprint(counts)

elements_lengths = list(map(len, map(str, chunks)))
print("Elements Lengths: ", elements_lengths)
print("Min Length of an Element: ", min(elements_lengths))
print("Max Length of an Element: ", max(elements_lengths))

Counter({"<class 'unstructured.documents.elements.CompositeElement'>": 17})
Elements Lengths:  [4000, 616, 2045, 2060, 2300, 3816, 3907, 3898, 2009, 1020, 3022, 2609, 2515, 3440, 2211, 2115, 2139]
Min Length of an Element:  616
Max Length of an Element:  4000


**We can see that chunking inside `partition_pdf()` gives the same result as chunking separately using `chunk_by_title()`**

### Chunking with "hi_res" strategy

In [3]:
# Partition with the "hi_res" strategy and chunk by title in the same call,
# timing the whole run.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
elements = partition_pdf(
    filename=pdf_file_path,
    strategy="hi_res",
    # Only applicable to "hi_res": extracted Table elements also get a
    # "text_as_html" metadata field that keeps rows and cells.
    infer_table_structure=True,
    extract_images_in_pdf=False,
    # Post-processing: aggregate text once a title has been found
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    max_characters=4000,             # Require maximum chunk size of 4000 chars
    new_after_n_chars=3800,          # Attempt to create a new chunk at 3800 chars
    combine_text_under_n_chars=2000, # Attempt to keep chunks > 2000 chars
)
elapsed_time = time.perf_counter() - start_time

elements

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[<unstructured.documents.elements.TableChunk at 0x117f8dc9160>,
 <unstructured.documents.elements.TableChunk at 0x117f8dc9670>,
 <unstructured.documents.elements.CompositeElement at 0x117f8dc9460>,
 <unstructured.documents.elements.Table at 0x11788a61430>,
 <unstructured.documents.elements.CompositeElement at 0x117f8dc9130>,
 <unstructured.documents.elements.Table at 0x11788c26580>,
 <unstructured.documents.elements.CompositeElement at 0x117f8dc9250>,
 <unstructured.documents.elements.CompositeElement at 0x117f8dc9730>,
 <unstructured.documents.elements.CompositeElement at 0x117f8dc9cd0>,
 <unstructured.documents.elements.CompositeElement at 0x117f8dc9f10>,
 <unstructured.documents.elements.CompositeElement at 0x117f8dc9fd0>,
 <unstructured.documents.elements.CompositeElement at 0x117f8dc99d0>,
 <unstructured.documents.elements.CompositeElement at 0x117f8dc94f0>,
 <unstructured.documents.elements.Table at 0x11788bfad30>,
 <unstructured.documents.elements.CompositeElement at 0x117f8dc9f

In [4]:
# Report the duration (in seconds) stored in `elapsed_time` and the number of chunks in `elements`.
print("Elapsed time: ", elapsed_time)
print("Number of elements: ", len(elements))

Elapsed time:  70.43373465538025
Number of elements:  21


In [6]:
# Tally the chunk classes, then summarise chunk sizes (length of each chunk's
# string form, in characters).
# Note: `unique_elements` holds one type label per chunk, duplicates included.
unique_elements = [str(kind) for kind in map(type, elements)]
counts = Counter(unique_elements)
pprint(counts)

elements_lengths = list(map(len, map(str, elements)))
print("Elements Lengths: ", elements_lengths)
print("Min Length of an Element: ", min(elements_lengths))
print("Max Length of an Element: ", max(elements_lengths))

Counter({"<class 'unstructured.documents.elements.CompositeElement'>": 16,
         "<class 'unstructured.documents.elements.Table'>": 3,
         "<class 'unstructured.documents.elements.TableChunk'>": 2})
Elements Lengths:  [4000, 616, 172, 1651, 394, 573, 3138, 3463, 837, 3904, 3898, 2009, 515, 492, 3022, 2615, 2507, 620, 3959, 3181, 2139]
Min Length of an Element:  172
Max Length of an Element:  4000


In [7]:
# Show the first four chunks in detail: text, id, metadata, and the
# dictionary form returned by to_dict().
preview = elements[:4]
for element in preview:
    # print() applies str() itself, so the element can be passed directly.
    print("Element Text: ", element)
    print("Element ID: ", element.id)
    print("Element Metadata: ", element.metadata, "\n\n")
    print("Element as Dictionary: ")
    pprint(element.to_dict())
    print("\n ------------------------------------- \n")

Element Text:  Table of Contents SUMMARY OF COVER ...................................................................................................2 GENERAL PROCEDURE – HOW TO FILE A CLAIM .............................................................4 INTRODUCTORY CLAUSES .............................................................................................6 GENERAL DEFINITIONS..................................................................................................8 COVERAGE..................................................................................................................12 SECTION 1 – PERSONAL ACCIDENT BENEFITS ...............................................................12 SECTION 2 – MEDICAL & RELATED BENEFITS ................................................................14 EMERGENCY MEDICAL EXPENSES (ACCIDENT & SICKNESS)................................14 EMERGENCY MEDICAL EVACUATION................................................................

In [8]:
# Write the current (chunked) `elements` list out as JSON to the given file.
elements_to_json(elements, filename="./pdf_elements_unstructured_hi_res_chunked.json")