# Chunking Strategies

aka Document Transformers, aka Text Splitters

Inspired by [5 Levels of Text Splitting](https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb)

In [9]:
import os
from langchain_core.documents import Document
from dotenv import load_dotenv

from utils.loader import load_documents

# Run python-dotenv's loader first so that any value it provides is visible
# to the environment lookup below; `loaded` keeps the call's return value.
loaded = load_dotenv()

# Benchmark directory: CHUNKING_BENCHMARK wins when it is set to a non-empty
# value; otherwise (unset or empty string) fall back to the default folder.
data_dir = os.environ.get('CHUNKING_BENCHMARK') or 'my_benchmark/'

# Source documents that every chunking experiment below will split.
documents = load_documents(data_dir)


In [13]:
from typing import Dict, List

# Registry of experiment results: experiment name -> the list of chunks that
# strategy produced. Every cell below stores the list returned by a splitter
# here, so the value type is a list of Documents, not a single Document.
split_chunks: Dict[str, List[Document]] = {}

# For Unstructured Text

## Fixed-size chunking

In [60]:
from langchain_text_splitters import CharacterTextSplitter

# Experiment parameters; the experiment name encodes both of them.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 0
EXPERIMENT_NAME = f"fixed-size-{CHUNK_SIZE}-{CHUNK_OVERLAP}"

# Empty separator: no delimiter is given to split on, which is what makes
# this the "fixed-size" strategy. Sizes are measured with len().
splitter = CharacterTextSplitter(
    separator="",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

split_chunks[EXPERIMENT_NAME] = splitter.split_documents(documents)

# Give every chunk a sequential id, unique within this experiment.
for position, piece in enumerate(split_chunks[EXPERIMENT_NAME]):
    piece.metadata["id"] = position


## Recursive Character Text Splitting


In [62]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Same size/overlap settings as the fixed-size run, registered under a
# "recursive-" prefixed experiment name.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 0
EXPERIMENT_NAME = "recursive-{}-{}".format(CHUNK_SIZE, CHUNK_OVERLAP)

# No separators are passed, so the splitter uses its own defaults;
# sizes are measured with len().
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

split_chunks[EXPERIMENT_NAME] = splitter.split_documents(documents)

# Number the chunks of this experiment starting from zero.
for chunk_id, recursive_chunk in enumerate(split_chunks[EXPERIMENT_NAME]):
    recursive_chunk.metadata["id"] = chunk_id

In [21]:
# Display the experiment names registered in split_chunks so far.
# NOTE(review): the output recorded below lists 1024/2048/4096 sizes, so it
# was captured with other CHUNK_SIZE / CHUNK_OVERLAP values than the 512 / 0
# currently set in the cells above.
split_chunks.keys()

dict_keys(['fixed-size-2048-0', 'fixed-size-1024-0', 'fixed-size-4096-0', 'fixed-size-1024-200', 'fixed-size-4096-200', 'fixed-size-2048-200', 'recursive-2048-0', 'recursive-1024-0', 'recursive-4096-0'])

## Semantic Chunking


In [5]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

# Breakpoint threshold handed to the semantic chunker; also part of the name.
PERCENTILE = 95
EXPERIMENT_NAME = f"semantic-chunks-{PERCENTILE}"

# Embedding model the chunker is constructed with.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# NOTE(review): only the threshold amount is set; treating it as a percentile
# relies on SemanticChunker's default threshold type - confirm.
splitter = SemanticChunker(embeddings, breakpoint_threshold_amount=PERCENTILE)
split_chunks[EXPERIMENT_NAME] = splitter.split_documents(documents)

# Sequential ids, unique within this experiment.
for chunk_id, semantic_chunk in enumerate(split_chunks[EXPERIMENT_NAME]):
    semantic_chunk.metadata["id"] = chunk_id

In [7]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

# Two-stage experiment: semantic chunking first, then a recursive character
# split applied to the semantic chunks. All three parameters go into the name.
PERCENTILE = 95
CHUNK_SIZE = 512 * 4
CHUNK_OVERLAP = 200
EXPERIMENT_NAME = f"semantic-chunks-{PERCENTILE}-recursive-{CHUNK_SIZE}-{CHUNK_OVERLAP}"

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Stage 1: semantic chunks of the source documents.
semantic_splits = SemanticChunker(
    embeddings,
    breakpoint_threshold_amount=PERCENTILE,
).split_documents(documents)

# Stage 2: re-split the semantic chunks with the recursive splitter, using
# CHUNK_SIZE / CHUNK_OVERLAP measured with len().
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
split_chunks[EXPERIMENT_NAME] = splitter.split_documents(semantic_splits)

# Ids are assigned after the second stage, so they index the final chunks.
for chunk_id, final_chunk in enumerate(split_chunks[EXPERIMENT_NAME]):
    final_chunk.metadata["id"] = chunk_id

## Dense X: Propositional Chunking


# For Document-Specific Splitting

## Markdown

In [11]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

EXPERIMENT_NAME = "markdown-header"

# One (marker, metadata key) pair per heading level: ("#", "1") through
# ("######", "6"). Each level needs its own marker; repeating "#" for every
# entry would only ever match level-1 headings, so deeper headings would
# never start a new chunk.
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#" * level, str(level)) for level in range(1, 7)],
    strip_headers=False,
)

# split_text() works on raw text, so the chunks it returns do not carry the
# parent document's metadata; copy the "source" entry across by hand.
md_header_splits = []
for document in documents:
    chunks = splitter.split_text(document.page_content)
    for chunk in chunks:
        chunk.metadata["source"] = document.metadata["source"]
    md_header_splits.extend(chunks)

split_chunks[EXPERIMENT_NAME] = md_header_splits

# Sequential ids, unique within this experiment.
for i, chunk in enumerate(split_chunks[EXPERIMENT_NAME]):
    chunk.metadata["id"] = i

In [19]:
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Two-stage experiment: split on markdown headings first, then apply a
# recursive character split to the heading sections.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 0
EXPERIMENT_NAME = "markdown-header-recursive-" + str(CHUNK_SIZE) + "-" + str(CHUNK_OVERLAP)

# One (marker, metadata key) pair per heading level: ("#", "1") through
# ("######", "6"). Each level needs its own marker; repeating "#" for every
# entry would only ever match level-1 headings, so deeper headings would
# never start a new chunk.
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#" * level, str(level)) for level in range(1, 7)],
    strip_headers=False,
)

# Stage 1: heading-based sections. split_text() works on raw text, so the
# parent document's "source" entry is copied onto each section by hand.
md_header_splits = []
for document in documents:
    chunks = splitter.split_text(document.page_content)
    for chunk in chunks:
        chunk.metadata["source"] = document.metadata["source"]
    md_header_splits.extend(chunks)

# Stage 2: re-split the heading sections with the recursive splitter, using
# CHUNK_SIZE / CHUNK_OVERLAP measured with len().
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

split_chunks[EXPERIMENT_NAME] = splitter.split_documents(md_header_splits)

# Ids are assigned after the second stage, so they index the final chunks.
for i, chunk in enumerate(split_chunks[EXPERIMENT_NAME]):
    chunk.metadata["id"] = i

# Save To File

In [20]:
from utils.loader import save_chunks
# Hand every experiment collected in split_chunks, plus the benchmark
# directory, to the project's save helper.
# NOTE(review): presumably this writes the chunks under data_dir - confirm
# the output layout against utils.loader.save_chunks.
save_chunks(split_chunks, data_dir)