# Document-Specific Chunking Using LangChain

# ✅ Objective
We’ll:

- Load different documents (built from in-memory strings, for simplicity)

- Apply custom chunking logic selected by each document's `type` metadata

- Use the `langchain_text_splitters` package to chunk each document differently

In [4]:
!pip install langchain langchain-text-splitters

Collecting langchain-text-splitters
  Downloading langchain_text_splitters-1.0.0-py3-none-any.whl.metadata (2.6 kB)
Downloading langchain_text_splitters-1.0.0-py3-none-any.whl (33 kB)
Installing collected packages: langchain-text-splitters
Successfully installed langchain-text-splitters-1.0.0


# ✅ Practical: Document-Specific Chunking with LangChain

In [5]:
from langchain_core.documents import Document
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

# ---------------------------
# Step 1: Build the sample documents, each tagged with a "type" in its metadata
# ---------------------------
# (type, text) pairs, listed in the order the documents appear in `docs`.
docs = [
    Document(page_content=text, metadata={"type": doc_type})
    for doc_type, text in (
        ("book", "Chapter 1: Intro\n" + "This is a long paragraph " * 10),
        ("meeting_notes", "Meeting Notes:\n- Discussed marketing\n- Planned Q3 goals"),
        ("csv_like", "Name, Age, Address\nJohn, 32, NY\nJane, 28, CA\nJake, 35, TX"),
    )
]

# ✅ Step 2: Define a Custom Chunker Based on Document Type

In [8]:
def custom_chunk_document(doc: Document) -> list[Document]:
    """Split one document with a splitter chosen from ``doc.metadata["type"]``.

    Strategy per type:
        * ``"book"``: ``RecursiveCharacterTextSplitter`` with
          ``chunk_size=200`` and ``chunk_overlap=50``.
        * ``"meeting_notes"``: ``CharacterTextSplitter`` on newlines with
          ``chunk_size=50`` and ``chunk_overlap=10``.
        * ``"csv_like"``: ``CharacterTextSplitter`` on newlines with
          ``chunk_size=1`` and no overlap, giving one row per chunk.
        * any other value, or no ``"type"`` key: ``CharacterTextSplitter`` on
          newlines with ``chunk_size=100`` and ``chunk_overlap=20``.

    Args:
        doc: The document to chunk. Its metadata may carry a ``"type"`` key;
            when absent, the default strategy is used.

    Returns:
        Whatever ``split_documents([doc])`` returns for the selected splitter.
    """
    doc_type = doc.metadata.get("type", "default")

    if doc_type == "book":
        # Long narrative text: recursive splitting with generous overlap.
        splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    elif doc_type == "meeting_notes":
        # Bullet-style notes: split on newlines and pack short lines together.
        splitter = CharacterTextSplitter(separator="\n", chunk_size=50, chunk_overlap=10)
    elif doc_type == "csv_like":
        # Rows are delimited by newlines (commas only separate fields inside a
        # row), so split on "\n", not on ",". chunk_size=1 is deliberately
        # smaller than any row so that rows are never packed together: every
        # row, including the header, ends up as its own chunk.
        # NOTE(review): the splitter may log a "chunk longer than the specified
        # size" warning for each row with this setting -- confirm and silence
        # if it becomes noisy.
        splitter = CharacterTextSplitter(separator="\n", chunk_size=1, chunk_overlap=0)
    else:
        # Unknown or missing type: fall back to a line-based split.
        splitter = CharacterTextSplitter(separator="\n", chunk_size=100, chunk_overlap=20)

    return splitter.split_documents([doc])

# ✅ Step 3: Apply Document-Specific Chunking

In [9]:
# Run every document through the chunker picked for its type, collecting all
# resulting chunks in input order.
chunked_docs = []
for doc in docs:
    chunked_docs += custom_chunk_document(doc)

# Show each chunk under a 1-based header line.
for i, chunk in enumerate(chunked_docs):
    print(f"\n--- Chunk {i+1} ---", chunk.page_content, sep="\n")




--- Chunk 1 ---
Chapter 1: Intro

--- Chunk 2 ---
This is a long paragraph This is a long paragraph This is a long paragraph This is a long paragraph This is a long paragraph This is a long paragraph This is a long paragraph This is a long paragraph

--- Chunk 3 ---
This is a long paragraph This is a long paragraph This is a long paragraph This is a long paragraph

--- Chunk 4 ---
Meeting Notes:
- Discussed marketing

--- Chunk 5 ---
- Planned Q3 goals

--- Chunk 6 ---
Name, Age, Address

--- Chunk 7 ---
John, 32, NY

--- Chunk 8 ---
Jane, 28, CA

--- Chunk 9 ---
Jake, 35, TX
