# Chunking Strategies

aka Document Transformers, aka Text Splitters

Inspired by [5 Levels of Text Splitting](https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb)

In [1]:
import os
from langchain_core.documents import Document
from dotenv import load_dotenv

from utils.loader import load_documents

# Read a local .env file (if present) into the process environment; the
# call's result is kept in `loaded`.
loaded = load_dotenv()

# Benchmark directory: CHUNKING_BENCHMARK when it is set and non-empty,
# otherwise the default folder.
data_dir = os.environ.get("CHUNKING_BENCHMARK") or "my_benchmark/"

# Source documents that every chunking experiment below will split.
documents = load_documents(data_dir)


In [2]:
from typing import Dict, List

# Registry of experiment results: experiment name -> list of chunks produced
# by that experiment's splitter. Each chunking cell stores the list returned
# by `splitter.split_documents(...)` under its own EXPERIMENT_NAME, so the
# value type is a list of Documents rather than a single Document.
split_chunks: Dict[str, List[Document]] = {}

## For Unstructured Text

### Fixed-size chunking


In [None]:
from langchain_text_splitters import CharacterTextSplitter

# Experiment parameters; both values are measured with len(), i.e. in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0
EXPERIMENT_NAME = f"fixed-size-{CHUNK_SIZE}-{CHUNK_OVERLAP}"

# Empty separator: no delimiter is given, so chunking is driven by the
# size limit rather than by any text structure.
splitter = CharacterTextSplitter(
    separator="", chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len
)

fixed_size_chunks = splitter.split_documents(documents)
split_chunks[EXPERIMENT_NAME] = fixed_size_chunks

# Tag every chunk with its position in this experiment's output.
for position, piece in enumerate(fixed_size_chunks):
    piece.metadata["id"] = position


### Recursive Character Text Splitting


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Experiment parameters; both values are measured with len(), i.e. in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0
EXPERIMENT_NAME = f"recursive-{CHUNK_SIZE}-{CHUNK_OVERLAP}"

# No separators are passed, so the splitter's own default list is used.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len
)

recursive_chunks = splitter.split_documents(documents)
split_chunks[EXPERIMENT_NAME] = recursive_chunks

# Tag every chunk with its position in this experiment's output.
for position, piece in enumerate(recursive_chunks):
    piece.metadata["id"] = position

### Semantic Chunking


In [3]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

EXPERIMENT_NAME = "semantic-chunks"

# Embedding model the semantic chunker is built on.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
splitter = SemanticChunker(embeddings, add_start_index=True)

semantic_chunks = splitter.split_documents(documents)
split_chunks[EXPERIMENT_NAME] = semantic_chunks

# Tag every chunk with its position in this experiment's output.
for position, piece in enumerate(semantic_chunks):
    piece.metadata["id"] = position

### Dense X: Propositional Chunking


## For Document-Specific Splitting

### Markdown

In [None]:
%%script echo skipping
# Imported here so this cell does not rely on the recursive-splitting cell
# having been executed first.
from langchain_text_splitters import RecursiveCharacterTextSplitter

CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0
# Hyphen after the prefix keeps the key format consistent with the other
# experiments ("fixed-size-1000-0", "recursive-1000-0").
EXPERIMENT_NAME = f"markdown-{CHUNK_SIZE}-{CHUNK_OVERLAP}"

# Separators are tried in order, from the coarsest structural boundary to the
# finest. They are regular expressions, so the splitter must be told to
# interpret them as such (see is_separator_regex below).
separators = [
    # First, try to split along Markdown ATX headings (levels 1 to 6).
    "\n#{1,6} ",
    # Note: the alternative (setext) heading syntax below is not handled here
    # Heading level 2
    # ---------------
    # End of code block
    "```\n",
    # Horizontal rules: three or more of *, - or _ on a line of their own
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    # Fall back to paragraphs, then lines, then words, then single characters
    "\n\n",
    "\n",
    " ",
    "",
]
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=separators,
    # Without this flag the patterns above are escaped and matched literally,
    # so e.g. "#{1,6}" would never match a real heading.
    is_separator_regex=True,
    length_function=len,
    add_start_index=True,
)
split_chunks[EXPERIMENT_NAME] = splitter.split_documents(documents)

# Tag every chunk with its position in this experiment's output.
for i, chunk in enumerate(split_chunks[EXPERIMENT_NAME]):
    chunk.metadata["id"] = i


## Save To File

In [4]:
from utils.loader import save_chunks
# Hand the collected results (experiment name -> chunks) and the benchmark
# directory to the project's save helper. Only experiments whose cells were
# actually run have an entry in split_chunks.
# NOTE(review): output location/format is decided inside save_chunks — presumably
# written under data_dir; confirm in utils/loader.
save_chunks(split_chunks, data_dir)