# Chunking Strategies

Also known as document transformers or text splitters.

In [1]:
import os
from langchain_core.documents import Document
from dotenv import load_dotenv

from utils.data_loader import load_documents

# Run python-dotenv's loader first so a local .env file (if any) can supply
# the variable read below; its return value is kept in `loaded`.
loaded = load_dotenv()

# Directory holding the benchmark documents. An unset *or empty*
# CHUNKING_BENCHMARK_DATADIR falls back to the relative 'data/' folder.
data_dir = os.environ.get('CHUNKING_BENCHMARK_DATADIR')
if not data_dir:
    data_dir = 'data/'

# Source documents that every chunking experiment below operates on.
documents = load_documents(data_dir)


In [2]:
from typing import Dict, List

# Registry of all chunking experiments run in this notebook:
# experiment name -> list of chunks produced by that experiment.
# Every cell below stores a *list* of chunks under its EXPERIMENT_NAME,
# so the value type is List[Document], not a single Document.
split_chunks: Dict[str, List[Document]] = {}

# For Unstructured Text

## Fixed-size chunking

In [38]:
from langchain_text_splitters import CharacterTextSplitter

# Experiment parameters; lengths are measured with len(), i.e. in characters.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 200
EXPERIMENT_NAME = f"fixed_size-{CHUNK_SIZE}-{CHUNK_OVERLAP}"

# An empty separator is passed so that splitting is driven by chunk_size /
# chunk_overlap rather than by any delimiter in the text.
splitter = CharacterTextSplitter(
    separator="",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

# Register the result under the experiment name, then number the chunks.
fixed_size_chunks = splitter.split_documents(documents)
split_chunks[EXPERIMENT_NAME] = fixed_size_chunks

# Give each chunk a sequential id (its position in the experiment's list).
for chunk_id, chunk in enumerate(fixed_size_chunks):
    chunk.metadata["id"] = chunk_id


## Recursive Character Text Splitting


In [31]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Experiment parameters; lengths are measured with len(), i.e. in characters.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 200
EXPERIMENT_NAME = f"recursive-{CHUNK_SIZE}-{CHUNK_OVERLAP}"

# No separators are passed, so the splitter's own defaults apply.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

# Register the result under the experiment name, then number the chunks.
recursive_chunks = splitter.split_documents(documents)
split_chunks[EXPERIMENT_NAME] = recursive_chunks

# Give each chunk a sequential id (its position in the experiment's list).
for chunk_id, chunk in enumerate(recursive_chunks):
    chunk.metadata["id"] = chunk_id

In [21]:
# Show which experiment names are registered in split_chunks so far.
# NOTE(review): the recorded output below lists keys (e.g. 'fixed-size-2048-0')
# that the cells visible here do not produce; presumably it is left over from
# an earlier run with different parameters and naming — confirm before relying on it.
split_chunks.keys()

dict_keys(['fixed-size-2048-0', 'fixed-size-1024-0', 'fixed-size-4096-0', 'fixed-size-1024-200', 'fixed-size-4096-200', 'fixed-size-2048-200', 'recursive-2048-0', 'recursive-1024-0', 'recursive-4096-0'])

## Semantic Chunking


In [24]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

# Breakpoint threshold handed to the semantic chunker; also part of the name.
PERCENTILE = 90
EXPERIMENT_NAME = f"semantic_chunks_{PERCENTILE}"

# Embedding model used by the chunker.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Only the threshold amount is set; no threshold type is passed, so the
# library default applies (presumably percentile-based, given the variable
# name — confirm against the SemanticChunker documentation).
splitter = SemanticChunker(embeddings, breakpoint_threshold_amount=PERCENTILE)

# Register the result under the experiment name, then number the chunks.
semantic_chunks = splitter.split_documents(documents)
split_chunks[EXPERIMENT_NAME] = semantic_chunks

# Give each chunk a sequential id (its position in the experiment's list).
for chunk_id, chunk in enumerate(semantic_chunks):
    chunk.metadata["id"] = chunk_id

## Dense X: Propositional Chunking


# For Document-Specific Splitting

## Markdown

In [6]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

EXPERIMENT_NAME = "markdown_header"

# One (marker, metadata key) pair per markdown heading level:
# ("#", "h_1") through ("######", "h_6").
header_levels = [("#" * level, f"h_{level}") for level in range(1, 7)]

# strip_headers=False keeps the heading lines inside the chunk text.
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=header_levels,
    strip_headers=False,
)

md_header_splits = []
for document in documents:
    chunks = splitter.split_text(document.page_content)
    # Progress output: number of chunks collected before this document is added.
    print(len(md_header_splits))
    # split_text() only receives the raw text, so copy the originating
    # document's source onto every chunk it produced.
    for chunk in chunks:
        chunk.metadata["source"] = document.metadata["source"]
    md_header_splits.extend(chunks)

split_chunks[EXPERIMENT_NAME] = md_header_splits

# Give each chunk a sequential id (its position in the experiment's list).
for chunk_id, chunk in enumerate(md_header_splits):
    chunk.metadata["id"] = chunk_id

0
22
28
42
59
74
86
97
107
121


In [23]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

EXPERIMENT_NAME = "markdown_header_parent"

splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "h_1"), ("##", "h_2"), ("###", "h_3"), ("####", "h_4"), ("#####", "h_5"), ("######", "h_6")],
    # Headers are kept in the chunk text. Original author's note: header
    # stripping could not be used because it reduced the total number of
    # chunks (presumably this refers to strip_headers=True — confirm).
    strip_headers=False,
)

md_header_splits = []
for document in documents:
    chunks = splitter.split_text(document.page_content)
    for chunk in chunks:
        # split_text() only receives the raw text, so copy the originating
        # document's source onto every chunk it produced.
        chunk.metadata["source"] = document.metadata["source"]

        # Prepend the parent headers to the chunk's page_content.
        # Walk from the deepest level (h_6) up to h_1. The first level found
        # in the metadata is skipped: it is the chunk's own header, which is
        # already in the text because strip_headers=False. Each shallower
        # level is prepended in turn, so the final text reads h_1, h_2, ...
        # followed by the original chunk content.
        skipped_first = False
        for level in range(6, 0, -1):  # h_6 down to h_1 (there is no h_0)
            header_key = f"h_{level}"
            if header_key not in chunk.metadata:
                continue
            if not skipped_first:
                skipped_first = True
                continue

            # Rebuild the markdown heading line with the right number of '#'.
            chunk.page_content = f"{'#' * level} {chunk.metadata[header_key]}\n{chunk.page_content}"

    md_header_splits.extend(chunks)


split_chunks[EXPERIMENT_NAME] = md_header_splits

# Give each chunk a sequential id (its position in the experiment's list).
for i, chunk in enumerate(split_chunks[EXPERIMENT_NAME]):
    chunk.metadata["id"] = i

# Save To File

In [39]:
from utils.data_loader import save_chunks
# Hand every experiment collected in split_chunks (experiment name -> chunks)
# to the project's save_chunks helper, together with the data directory.
# NOTE(review): the output location and format are decided inside
# utils.data_loader.save_chunks and are not visible here — confirm there.
save_chunks(split_chunks, data_dir)