In [1]:
print("ok")

ok


In [2]:
import unstructured
from importlib.metadata import version

In [3]:
print(version("unstructured"))

0.18.15


In [4]:
import os
from dotenv import load_dotenv

if os.getenv('AZURE_OPENAI_API_KEY'):
    print("AzureOpenAI API key loaded successfully")
else:
    print("API KEY not found")

AzureOpenAI API key loaded successfully


In [5]:
from unstructured.partition.auto import partition
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.html import partition_html
from unstructured.partition.md import partition_md
from unstructured.partition.docx import partition_docx
from unstructured.partition.xlsx import partition_xlsx
from unstructured.partition.pptx import partition_pptx
from unstructured.partition.image import partition_image

from unstructured.chunking.title import chunk_by_title
from unstructured.chunking.basic import chunk_elements

from unstructured.documents.elements import (
    Title,
    NarrativeText,
    Table,
    ListItem,
    Image,
    Header,
    Footer,
    Text,
    ElementMetadata
)

import json
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
html_path = 'sample_documents\sample.html'
elements = partition(html_path)

for i, element in enumerate(elements,1):
    element_type = type(element).__name__
    text_preview = element.text[:100]+"..." if len(element.text)> 100 else element.text
    print(f"[{i}] {element_type}: {text_preview}")
    print("-"*60)


libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


[1] Title: 1. Overview
------------------------------------------------------------
[2] NarrativeText: Document parsing is the process of analyzing and extracting structured information from various docu...
------------------------------------------------------------
[3] Title: 1.1 Key Benefits
------------------------------------------------------------
[4] ListItem: Automated data extraction
------------------------------------------------------------
[5] ListItem: Structured content analysis
------------------------------------------------------------
[6] ListItem: Integration with AI/ML pipelines
------------------------------------------------------------
[7] ListItem: Support for multiple formats
------------------------------------------------------------
[8] Title: 2. Core Features
------------------------------------------------------------
[9] NarrativeText: Modern document parsers offer a variety of features:
------------------------------------------------------------
[10] 

In [7]:
title_element = [e for e in elements if isinstance(e, Title)]

if title_element:
    first_title = title_element[0]

    print("Element Text: ")
    print(first_title.text)
    print('-'*60)

    print("Element Metadata: ")
    meta_data = first_title.metadata

    print(meta_data)
    meta_data_dict = meta_data.to_dict()
    for key, value in meta_data_dict.items():
        if value is not None:
            print(f"   {key}: {value}")

Element Text: 
1. Overview
------------------------------------------------------------
Element Metadata: 
<unstructured.documents.elements.ElementMetadata object at 0x000001D6FE63DFD0>
   category_depth: 1
   last_modified: 2025-12-30T05:14:11
   languages: ['eng']
   file_directory: sample_documents
   filename: sample.html
   filetype: text/html


In [8]:
def analyze_elements(elements):
    type_count = {}
    for element in elements:
        element_type = type(element).__name__
        type_count[element_type] = type_count.get(element_type,0) + 1
    print("Element Distribution: ")
    print("-"*30)
    for elem_type, count in sorted(type_count.items()):
        print(f"  {elem_type}: {count}")
    return type_count

In [9]:
type_count = analyze_elements(elements)

Element Distribution: 
------------------------------
  ListItem: 8
  NarrativeText: 6
  Table: 1
  Title: 5


In [10]:
narrative_texts = [e for e in elements if isinstance(e, NarrativeText)]
print(f"Found {len(narrative_texts)} Narrative elements")
for i, text in enumerate(narrative_texts[:5],1):
    print(f"{i}. {text.text}\n")

Found 6 Narrative elements
1. Document parsing is the process of analyzing and extracting structured information from various document formats. This includes PDFs, Word documents, HTML pages, and more.

2. Modern document parsers offer a variety of features:

3. Retrieval-Augmented Generation (RAG) systems benefit significantly from proper document parsing:

4. "Effective document parsing is the foundation of any successful RAG implementation."

5. Here's a simple example of using a document parser:



In [11]:
table_elements = [e for e in elements if isinstance(e,Table)]
print(f"Found {len(table_elements)} Table elements: \n")
for i, table in enumerate(table_elements):
    print(f"Table {i+1}: \n")
    print(table.text)
    print()

    if hasattr(table.metadata, 'text_as_html') and table.metadata.text_as_html:
        print("HTML representation available in metadata.text_as_html")
    print("-"*60)

Found 1 Table elements: 

Table 1: 

Feature Description Use Case OCR Support Optical Character Recognition for scanned documents Scanned PDFs, Images Table Extraction Structured table data extraction Financial reports, Data tables Layout Analysis Understanding document structure Academic papers, Legal documents Image Processing Extract and classify images Technical manuals, Presentations

HTML representation available in metadata.text_as_html
------------------------------------------------------------


### **Partitioning Strategies**

In [12]:
import urllib.request
import os

os.makedirs("sample_documents", exist_ok=True)
pdf_url = "https://arxiv.org/pdf/2408.09869"
pdf_path = "sample_documents\docling_paper.pdf"

if not os.path.exists(pdf_path):
    print(f"Downloading PDF from {pdf_url}")
    urllib.request.urlretrieve(pdf_url)
    print(f"Downloaded to {pdf_path}")
else:
    print(f"PDF exist at: {pdf_path}")

PDF exist at: sample_documents\docling_paper.pdf


In [13]:
import time
print("Strategy: AUTO (default)")
print("="*60)

start_time = time.time()
element_auto = partition(
    filename=pdf_path,
    strategy="auto"
)
elapsed_time = time.time()-start_time

print(f"Time taken: {elapsed_time:.2f} seconds")
print(f"Elements extracted: {len(element_auto)}")

analyze_elements(element_auto)

Strategy: AUTO (default)
Time taken: 2.05 seconds
Elements extracted: 276
Element Distribution: 
------------------------------
  Footer: 10
  ListItem: 4
  NarrativeText: 75
  Text: 86
  Title: 101


{'Text': 86, 'Title': 101, 'NarrativeText': 75, 'Footer': 10, 'ListItem': 4}

In [14]:
print("Strategy: FAST (default)")
print("="*60)

start_time = time.time()
element_fast = partition(
    filename=pdf_path,
    strategy="fast"
)
elapsed_time = time.time()-start_time

print(f"Time taken: {elapsed_time:.2f} seconds")
print(f"Elements extracted: {len(element_auto)}")

analyze_elements(element_auto)

Strategy: FAST (default)
Time taken: 1.76 seconds
Elements extracted: 276
Element Distribution: 
------------------------------
  Footer: 10
  ListItem: 4
  NarrativeText: 75
  Text: 86
  Title: 101


{'Text': 86, 'Title': 101, 'NarrativeText': 75, 'Footer': 10, 'ListItem': 4}

In [15]:
print("Strategy: HI_RES")
print("="*60)
print("Note: hi_res strategy uses ML models and may take longer")
print()

try:
    start_time = time.time()
    element_auto = partition(
        filename=pdf_path,
        strategy="hi_res"
    )
    elapsed_time = time.time()-start_time

    print(f"Time taken: {elapsed_time:.2f} seconds")
    print(f"Elements extracted: {len(element_auto)}")

    analyze_elements(element_auto)
except Exception as e:
    print(f"Error: {str(e)}")

Strategy: HI_RES
Note: hi_res strategy uses ML models and may take longer

Time taken: 56.05 seconds
Elements extracted: 341
Element Distribution: 
------------------------------
  FigureCaption: 2
  Footer: 2
  Header: 4
  Image: 24
  ListItem: 20
  NarrativeText: 58
  Table: 4
  Text: 202
  Title: 25


In [16]:
def show_first_elements(elements, n=5, strategy_name=""):
    print(f"\n{'='*60}")
    print(f"First {n} elements from {strategy_name} strategy:")
    print("="*60)

    for i, elem in enumerate(elements[:n]):
        elem_tp = type(elem).__name__
        text = elem.text[:150]+"..." if len(elem.text)>150 else elem.text
        print(f"[{i+1}] {elem_tp}")
        print(f"   {text}")

show_first_elements(element_auto, n=5, strategy_name="AUTO")
show_first_elements(element_fast, n=5, strategy_name="AUTO")


First 5 elements from AUTO strategy:
[1] Text
   4
[2] Text
   2024
[3] Text
   2
[4] Text
   0
[5] Text
   2

First 5 elements from AUTO strategy:
[1] Text
   4 2 0 2 c e D 9
[2] Title
   ] L C . s c [
[3] Text
   5 v 9 6 8 9 0 . 8 0 4 2 : v i X r a
[4] Title
   Docling Technical Report
[5] Title
   Version 1.0


In [17]:
from unstructured.partition.pdf import partition_pdf

pdf_element = partition_pdf(
    filename=pdf_path,
    strategy='fast'
)
print(f"PDF Elements: {len(pdf_element)}")
print(f"\nFirst 3 elements: ")
for elem in pdf_element[:3]:
    print(f"    - {type(elem).__name__}: {elem.text[:80]}...")
    

PDF Elements: 276

First 3 elements: 
    - Text: 4 2 0 2 c e D 9...
    - Title: ] L C . s c [...
    - Text: 5 v 9 6 8 9 0 . 8 0 4 2 : v i X r a...


In [18]:
pdf_elements = partition_pdf(
    filename=pdf_path,
    strategy='fast',
    include_page_breaks=True
)
tables = [e for e in pdf_elements if isinstance(e, Table)]
print(f"Found {len(tables)} tables in the PDF")

if tables:
    print(f"First table content: ")
    print(tables[0].text[:500])

Found 0 tables in the PDF


In [19]:
pages = {}
for elm in pdf_elements:
    page_num = elm.metadata.page_number if elm.metadata.page_number else 0
    if page_num not in pages:
        pages[page_num] = []
    pages[page_num].append(elm)
print(f"Documents has {len(pages)} pages")
for page_num in sorted(pages.keys())[:3]:
    print(f"\nPage {page_num}: {len(pages[page_num])} elements")

Documents has 10 pages

Page 0: 9 elements

Page 1: 15 elements

Page 2: 19 elements


In [20]:
html_elements = partition_html(
    filename="sample_documents\sample.html"
)
print(f"HTML Elements: {len(html_elements)}")
analyze_elements(html_elements)

HTML Elements: 20
Element Distribution: 
------------------------------
  ListItem: 8
  NarrativeText: 6
  Table: 1
  Title: 5


{'Title': 5, 'NarrativeText': 6, 'ListItem': 8, 'Table': 1}

In [21]:
try:
    url_element = partition_html(
        url="https://docling-project.github.io/docling/"
    )
    print(f"Element fetch from URL: {len(url_element)}")
    for elm in url_element[:5]:
        print(f"   - {type(elm).__name__}: {elm.text[:60]}...")
except Exception as e:
    print(f"Could not fetch URL :{e}")

Element fetch from URL: 54
   - Title: Documentation...
   - Image: Docling...
   - Image: DS4SD%2Fdocling | Trendshift...
   - Image: arXiv...
   - Image: PyPI version...


In [22]:
html_content = """
<html>
<body>
    <h1>Sample Document</h1>
    <p>This is a paragraph with some <strong>bold text</strong>.</p>
    <ul>
        <li>Item 1</li>
        <li>Item 2</li>
        <li>Item 3</li>
    </ul>
    <table>
        <tr><th>Name</th><th>Value</th></tr>
        <tr><td>A</td><td>100</td></tr>
        <tr><td>B</td><td>200</td></tr>
    </table>
</body>
</html>
"""
elements_from_string = partition_html(
    text=html_content
)
print(f"Elements from string: {len(elements_from_string)}")
for elem in elements_from_string:
    print(f"  - {type(elem).__name__}: {elem.text}")

Elements from string: 6
  - Title: Sample Document
  - NarrativeText: This is a paragraph with some bold text.
  - ListItem: Item 1
  - ListItem: Item 2
  - ListItem: Item 3
  - Table: Name Value A 100 B 200


In [23]:
md_elements = partition_md(filename="sample_documents\sample.md")

print(f"Markdown Elements: {len(md_elements)}")
analyze_elements(md_elements)

Markdown Elements: 44
Element Distribution: 
------------------------------
  ListItem: 15
  NarrativeText: 12
  Table: 2
  Text: 1
  Title: 14


{'Title': 14, 'NarrativeText': 12, 'ListItem': 15, 'Table': 2, 'Text': 1}

In [24]:
print("Markdown Structure:")
print("="*60)

for elem in md_elements[:15]:
    element_type = type(elem).__name__
    text = elem.text[:80]+"..." if len(elem.text)>80 else elem.text
    print(f"{element_type:15} | {text}")

Markdown Structure:
Title           | Document Parsing Best Practices
NarrativeText   | A comprehensive guide to document parsing for RAG systems.
Title           | Table of Contents
ListItem        | Introduction
ListItem        | Supported Formats
ListItem        | Parsing Strategies
ListItem        | Integration Guide
Title           | Introduction
NarrativeText   | Document parsing is a critical component in modern AI applications. It enables t...
ListItem        | Build searchable knowledge bases
ListItem        | Create training datasets for machine learning
ListItem        | Enable semantic search and retrieval
ListItem        | Power question-answering systems
NarrativeText   | Note: The quality of document parsing directly impacts the performance of downst...
Title           | Supported Formats


In [25]:
md_string = """
# Main Title

This is an introduction paragraph.

## Section 1

Here's some content with:
- Bullet point 1
- Bullet point 2

## Section 2

```python
def hello():
    print("Hello, World!")
```

| Column A | Column B |
|----------|----------|
| Value 1  | Value 2  |
"""

md_string_elements = partition_md(text=md_string)

print(f"Elements from markdown string: {len(md_string_elements)}")
for elem in md_string_elements:
    print(f"  - {type(elem).__name__}: {elem.text[:60]}..." if len(elem.text) > 60 else f"  - {type(elem).__name__}: {elem.text}")

Elements from markdown string: 7
  - Title: Main Title
  - NarrativeText: This is an introduction paragraph.
  - Title: Section 1
  - NarrativeText: Here's some content with: - Bullet point 1 - Bullet point 2
  - Title: Section 2
  - Text: def hello():
    print("Hello, World!")
  - Table: Column A Column B Value 1 Value 2


In [26]:
try:
    from docx import Document as DocxDocument
    from docx.shared import Inches
    
    # Create a new document
    doc = DocxDocument()
    
    # Add content
    doc.add_heading('Sample Word Document', 0)
    doc.add_paragraph('This is a sample Word document created for testing Unstructured.')
    
    doc.add_heading('Section 1: Introduction', level=1)
    doc.add_paragraph('Unstructured is a powerful library for document parsing.')
    
    doc.add_heading('Section 2: Features', level=1)
    doc.add_paragraph('Key features include:')
    
    # Add a bulleted list
    doc.add_paragraph('Multiple file format support', style='List Bullet')
    doc.add_paragraph('OCR capabilities', style='List Bullet')
    doc.add_paragraph('Table extraction', style='List Bullet')
    
    # Add a simple table
    table = doc.add_table(rows=3, cols=2)
    table.style = 'Table Grid'
    cells = table.rows[0].cells
    cells[0].text = 'Feature'
    cells[1].text = 'Status'
    cells = table.rows[1].cells
    cells[0].text = 'PDF Support'
    cells[1].text = 'Available'
    cells = table.rows[2].cells
    cells[0].text = 'OCR'
    cells[1].text = 'Available'
    
    # Save the document
    docx_path = 'sample_documents/sample1.docx'
    doc.save(docx_path)
    print(f"Created sample DOCX at: {docx_path}")
    
except ImportError:
    print("python-docx not installed. Install with: pip install python-docx")
    docx_path = None

Created sample DOCX at: sample_documents/sample1.docx
