# Chunking Files with LangChain and Unstructured

In [78]:
import os
import pathlib
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from langchain_unstructured import UnstructuredLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader, UnstructuredPDFLoader


# Load the variables of the .env file located by find_dotenv() before
# reading any of them from the process environment.
load_dotenv(find_dotenv())

# Each constant is None when its environment variable is not set.
DATA_PATH = os.environ.get("DATA_PATH")
POPPLER_PATH = os.environ.get("POPPLER_PATH")
TESSERACT_PATH = os.environ.get("TESSERACT_PATH")

def get_full_path(name: str) -> pathlib.Path:
    """Return the first path below the data directory that matches ``name``.

    The directory given by ``DATA_PATH`` is searched recursively with
    ``Path.rglob`` and the first match it yields is returned.

    Args:
        name (str): File name or directory name to search for.

    Returns:
        pathlib.Path: The first matching path object for the file or
            directory.

    Raises:
        RuntimeError: If ``DATA_PATH`` is not set.
        FileNotFoundError: If nothing below ``DATA_PATH`` matches ``name``.
    """
    if DATA_PATH is None:
        raise RuntimeError(
            "DATA_PATH is not set; define it in the environment or in the .env file."
        )

    # A default of None keeps an empty search from escaping as StopIteration.
    first_match = next(Path(DATA_PATH).rglob(name), None)
    if first_match is None:
        raise FileNotFoundError(
            f"No file or directory named {name!r} found under {DATA_PATH!r}."
        )
    return first_match


def pretty_output(chunks, mode: str):
    """Print every chunk, numbered from 1, each followed by a separator line.

    Args:
        chunks: Iterable of chunk objects to print.
        mode (str): "elements" prints the ``text`` attribute of each chunk,
            "documents" prints the ``page_content`` attribute. For any other
            value nothing is printed and ``chunks`` is not iterated.
    """
    # Select the attribute that carries the chunk text for the given mode.
    if mode == "elements":
        content_attribute = "text"
    elif mode == "documents":
        content_attribute = "page_content"
    else:
        return

    number = 0
    for item in chunks:
        number += 1
        print(f"Chunk {number}:")
        print(getattr(item, content_attribute))
        print("-" * 120)

In [79]:
# PDF files to look up in the data directory.
file_name = [
    "ark_040_-_foerderung_der_berufsausbildung.pdf",
    "ark_021_-_geschaeftsordnung_des_beirats.pdf",
]
paths = list(map(get_full_path, file_name))

# The cells below work on the last file of the list.
path = paths[-1]

### An overview of possible parameters

**Parameters `partition_pdf`:**  
- `filename: Optional[str] = None,`
- `file: Optional[IO[bytes]] = None,`
- `include_page_breaks: bool = False,`
- `strategy: str = PartitionStrategy.AUTO,`
- `infer_table_structure: bool = False,`
- `ocr_languages: Optional[str] = None,  # changing to optional for deprecation`
- `languages: Optional[list[str]] = None,`
- `include_metadata: bool = True,  # used by decorator`
- `metadata_filename: Optional[str] = None,  # used by decorator`
- `metadata_last_modified: Optional[str] = None,`
- `chunking_strategy: Optional[str] = None,  # used by decorator`
- `hi_res_model_name: Optional[str] = None,`
- `extract_images_in_pdf: bool = False,`
- `extract_image_block_types: Optional[list[str]] = None,`
- `extract_image_block_output_dir: Optional[str] = None,`
- `extract_image_block_to_payload: bool = False,`
- `date_from_file_object: bool = False,`
- `starting_page_number: int = 1,`
- `extract_forms: bool = False,`
- `form_extraction_skip_tables: bool = True,`


**Parameters `chunk_by_title`:**
- `elements: Iterable[Element],`
- `*,`
- `combine_text_under_n_chars: Optional[int] = None,`
- `include_orig_elements: Optional[bool] = None,`
- `max_characters: Optional[int] = None,`
- `multipage_sections: Optional[bool] = None,`
- `new_after_n_chars: Optional[int] = None,`
- `overlap: Optional[int] = None,`
- `overlap_all: Optional[bool] = None,`

## All-in-one Loader 

In [None]:
# Chunker 1: by_title chunking with max_characters as the only size setting.
max_characters = 1500

loader_1 = UnstructuredLoader(
    file_path=path,
    languages=["deu"],
    strategy="hi_res",
    chunking_strategy="by_title",
    max_characters=max_characters,
)
chunks_1 = loader_1.load()

# Number of chunks produced (shown as the cell output).
len(chunks_1)

In [81]:
# pretty_output(chunks_1, "documents")

In [None]:
# Chunker 2: by_title chunking with additional size settings.
max_characters = 5000
new_after_n_chars = 1500
# Two thirds of new_after_n_chars, truncated to an integer.
combine_text_under_n_chars_multiplier = int(new_after_n_chars * (2 / 3))

chunking_options = {
    "chunking_strategy": "by_title",
    "max_characters": max_characters,
    "combine_text_under_n_chars": combine_text_under_n_chars_multiplier,
    "new_after_n_chars": new_after_n_chars,
}
chunks_2 = UnstructuredLoader(
    file_path=path,
    languages=["deu"],
    strategy="hi_res",
    **chunking_options,
).load()

# Number of chunks produced (shown as the cell output).
len(chunks_2)

In [83]:
# pretty_output(chunks_2, "documents")

## PDF Chunking

In [None]:
# Chunker 1 (PDF loader): by_title chunking with max_characters as the only
# size setting.
max_characters = 1500

pdf_loader_1 = UnstructuredPDFLoader(
    file_path=path,
    mode="elements",
    languages=["deu"],
    strategy="hi_res",
    chunking_strategy="by_title",
    max_characters=max_characters,
)
chunks_1 = pdf_loader_1.load()

# Number of chunks produced (shown as the cell output).
len(chunks_1)

In [85]:
# pretty_output(chunks_1, "documents")

In [None]:
# Chunker 2 (PDF loader): by_title chunking with additional size settings.
max_characters = 5000
new_after_n_chars = 1500
# Two thirds of new_after_n_chars, truncated to an integer.
combine_text_under_n_chars_multiplier = int(new_after_n_chars * (2 / 3))

pdf_chunking_options = {
    "chunking_strategy": "by_title",
    "max_characters": max_characters,
    "combine_text_under_n_chars": combine_text_under_n_chars_multiplier,
    "new_after_n_chars": new_after_n_chars,
}
chunks_2 = UnstructuredPDFLoader(
    file_path=path,
    mode="elements",
    languages=["deu"],
    strategy="hi_res",
    **pdf_chunking_options,
).load()

# Number of chunks produced (shown as the cell output).
len(chunks_2)

In [87]:
# pretty_output(chunks_2, "documents")

## Word Document Chunking

In [88]:
# Word document to look up in the data directory.
file_name = "aktive_leistungen_bei_darlehensweiser_passiver_leistungsgewaehrung.docx"
docx_path = get_full_path(file_name)
# The cells below receive the location as a plain string.
path = str(docx_path)

In [89]:
# Chunker 1 (Word loader): by_title chunking with max_characters as the only
# size setting.
max_characters = 1500

word_loader_1 = UnstructuredWordDocumentLoader(
    file_path=path,
    mode="elements",
    languages=["deu"],
    strategy="hi_res",
    chunking_strategy="by_title",
    max_characters=max_characters,
)
chunks_1 = word_loader_1.load()

# Number of chunks produced (shown as the cell output).
len(chunks_1)

2

In [90]:
# pretty_output(chunks_1, "documents")

In [91]:
# Chunker 2 (Word loader): by_title chunking with additional size settings.
max_characters = 5000
new_after_n_chars = 1500
# Two thirds of new_after_n_chars, truncated to an integer.
combine_text_under_n_chars_multiplier = int(new_after_n_chars * (2 / 3))

word_chunking_options = {
    "chunking_strategy": "by_title",
    "max_characters": max_characters,
    "combine_text_under_n_chars": combine_text_under_n_chars_multiplier,
    "new_after_n_chars": new_after_n_chars,
}
chunks_2 = UnstructuredWordDocumentLoader(
    file_path=path,
    mode="elements",
    languages=["deu"],
    strategy="hi_res",
    **word_chunking_options,
).load()

# Number of chunks produced (shown as the cell output).
len(chunks_2)

2

In [92]:
# pretty_output(chunks_2, "documents")

In [93]:
# Chunker 2 parameters, defined in this cell so the result does not depend on
# which chunker cell was executed last.
max_characters = 5000
new_after_n_chars = 1500
# Two thirds of new_after_n_chars, truncated to an integer.
combine_text_under_n_chars_multiplier = int(new_after_n_chars * (2 / 3))

# Load the file at `path` with the all-in-one UnstructuredLoader.
chunks_3 = UnstructuredLoader(
    file_path=path,
    languages=["deu"],
    strategy="hi_res",
    chunking_strategy="by_title",
    max_characters=max_characters,
    combine_text_under_n_chars=combine_text_under_n_chars_multiplier,
    new_after_n_chars=new_after_n_chars,
).load()

# Number of chunks produced (shown as the cell output).
len(chunks_3)

2

In [95]:
# pretty_output(chunks_3, "documents")