# Chunking Files with Unstructured

In [4]:
import os
import pathlib
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import Header, Footer, Image


# Load environment variables from the nearest .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Root directory that get_path() searches for input files.
DATA_PATH = os.getenv("DATA_PATH")
# Locations of the external Poppler / Tesseract tools, read from the environment.
# NOTE(review): neither value is referenced in the visible cells — confirm they
# are still needed.
POPPLER_PATH = os.getenv("POPPLER_PATH")
TESSERACT_PATH = os.getenv("TESSERACT_PATH")
# Element types that are filtered out after partitioning. This must be a real
# tuple (note the trailing comma): "(Image)" is just the class itself, which
# makes it easy to break when another type is appended later.
REMOVABLE_ELEMENTS = (Image,)


def get_path(name: str, data_path: "str | pathlib.Path | None" = None) -> pathlib.Path:
    """Return a path object for a file or directory inside the data directory.

    The directory is searched recursively with ``Path.rglob`` and the first
    match it yields is returned.

    Args:
        name (str): File name or directory name to search for.
        data_path: Directory to search. When omitted, the module-level
            ``DATA_PATH`` is used.

    Returns:
        pathlib.Path: Path object for the matching file or directory.

    Raises:
        ValueError: If no directory was given and ``DATA_PATH`` is not set.
        FileNotFoundError: If nothing below the directory matches ``name``.
    """
    # DATA_PATH is only looked up when the caller did not pass a directory.
    root = DATA_PATH if data_path is None else data_path
    if root is None:
        raise ValueError(
            "No data directory configured: set the DATA_PATH environment "
            "variable or pass data_path explicitly."
        )

    # A default for next() avoids leaking a bare StopIteration to the caller
    # when the search yields nothing.
    match = next(Path(root).rglob(name), None)
    if match is None:
        raise FileNotFoundError(f"No file or directory named {name!r} found under {str(root)!r}")
    return match


def pretty_output(chunks, mode: str):
    """Print every chunk with a running number and a separator line.

    Args:
        chunks: Iterable of chunk objects to print.
        mode (str): ``"elements"`` prints each chunk's ``text`` attribute,
            ``"documents"`` prints each chunk's ``page_content`` attribute.
            Any other value prints nothing.
    """
    # Pick the attribute that holds the chunk's text for the requested mode.
    if mode == "elements":
        content_attribute = "text"
    elif mode == "documents":
        content_attribute = "page_content"
    else:
        return

    separator = "-" * 120
    for number, item in enumerate(chunks, start=1):
        print(f"Chunk {number}:")
        print(getattr(item, content_attribute))
        print(separator)

In [2]:
# Candidate PDF files; the last entry of the list is the one processed below.
file_name = [
    "ark_040_-_foerderung_der_berufsausbildung.pdf",
    "ark_021_-_geschaeftsordnung_des_beirats.pdf",
]
# Resolve the selected file name to its location inside the data directory.
path = get_path(name=file_name[-1])

In [5]:
# Partition the PDF into document elements using the "hi_res" strategy,
# with "deu" (German) passed as the document language.
ele = partition_pdf(filename=path, strategy="hi_res", languages=["deu"])

# Drop every element whose type is listed in REMOVABLE_ELEMENTS, leaving a
# reduced list that only holds the elements carrying useful information.
red_ele = [element for element in ele if not isinstance(element, REMOVABLE_ELEMENTS)]

## Chunk `basic`

In [18]:
# Chunker 1: basic chunking of the filtered elements, limited only by the
# maximum chunk size.
max_characters = 1500

chunks_1 = chunk_elements(red_ele, max_characters=max_characters)

len(chunks_1)

4

In [1]:
# pretty_output(chunks_1, "elements")

In [43]:
# Chunker 2: basic chunking of the filtered elements with a soft limit
# (new_after_n_chars) below the maximum chunk size, plus overlap between chunks.
max_characters = 5000
new_after_n_chars = 1500

chunks_2 = chunk_elements(
    red_ele,
    max_characters=max_characters,
    new_after_n_chars=new_after_n_chars,
    # One fifth of the soft limit. Integer division keeps the value exact
    # instead of truncating a floating-point product.
    overlap=new_after_n_chars // 5,
    overlap_all=True,
)

len(chunks_2)

4

In [None]:
# pretty_output(chunks_2, "elements")

## Chunk `by_title`

In [7]:
# Chunker 1: title-based chunking, limited only by the maximum chunk size.
# NOTE(review): this passes the unfiltered `ele`, whereas the basic chunker
# cells pass the filtered `red_ele` — confirm that this is intended.
max_characters = 1500

chunks_1 = chunk_by_title(ele, max_characters=max_characters)

len(chunks_1)

4

In [19]:
# pretty_output(chunks_1, mode="elements")

In [16]:
# Chunker 2: title-based chunking with a soft limit (new_after_n_chars) below
# the maximum chunk size and a threshold for combining small sections.
# NOTE(review): this passes the unfiltered `ele`, whereas the basic chunker
# cells pass the filtered `red_ele` — confirm that this is intended.
max_characters = 5000
new_after_n_chars = 1500
# Two thirds of the soft limit. Despite its name this is the threshold value
# itself, not a multiplier. Integer arithmetic keeps the value exact instead
# of truncating a floating-point product.
combine_text_under_n_chars_multiplier = new_after_n_chars * 2 // 3

chunks_2 = chunk_by_title(
    elements=ele,
    max_characters=max_characters,
    combine_text_under_n_chars=combine_text_under_n_chars_multiplier,
    new_after_n_chars=new_after_n_chars,
)

len(chunks_2)

3

In [18]:
# pretty_output(chunks_2, mode="elements")