### How to recursively split text by characters

This text splitter is the recommended one for generic text. It is parameterized by a list of characters and tries to split on them in order until the chunks are small enough. The default list is `["\n\n", "\n", " ", ""]`. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those are generally the most strongly semantically related pieces of text.

- How the text is split: by list of characters.
- How the chunk size is measured: by number of characters.

Below we show example usage.

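To obtain the string content directly, use `.split_text`. To split already-loaded `Document` objects while keeping their metadata, use `.split_documents`, as in the example below.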

In [1]:
from langchain_community.document_loaders import PyPDFLoader

# NOTE: machine-specific absolute path. Point PDF_PATH at your own copy of the
# PDF before running this cell.
PDF_PATH = "/Users/amarmandal/Documents/coding/GenAI_Course/data/AgenticAI-v2.0.pdf"

# load() returns a list of Document objects; in the output below each one
# carries the page text in `page_content` plus PDF metadata (source, page, ...).
loader = PyPDFLoader(PDF_PATH)
data = loader.load()
data  # Display every loaded Document (the full list, not a 1000-character preview)

[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-04-03T18:36:52+00:00', 'title': 'AgenticAI-v2.0', 'moddate': '2025-04-03T18:36:40+00:00', 'keywords': 'DAGjmPTBGs4,BAEmsmap8Lg,0', 'author': 'monal singh', 'source': '/Users/amarmandal/Documents/coding/GenAI_Course/data/AgenticAI-v2.0.pdf', 'total_pages': 24, 'page': 0, 'page_label': '1'}, page_content='CI/CD &\nDEPLOYMENT\nMONITORING &\nCLOUD\nAGENTIC\nFRAMEWORKS'),
 Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-04-03T18:36:52+00:00', 'title': 'AgenticAI-v2.0', 'moddate': '2025-04-03T18:36:40+00:00', 'keywords': 'DAGjmPTBGs4,BAEmsmap8Lg,0', 'author': 'monal singh', 'source': '/Users/amarmandal/Documents/coding/GenAI_Course/data/AgenticAI-v2.0.pdf', 'total_pages': 24, 'page': 1, 'page_label': '2'}, page_content='This course is designed for AI developers, machine learning engineers, data scientists, and\nsoftware engineers looking to build expertise in agentic AI, multi

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded pages into chunks with a target size of 500 characters and
# a 50-character overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

# split_documents takes the Documents loaded earlier and returns the chunked
# Documents; the last expression displays them.
final_documents = text_splitter.split_documents(data)
final_documents

[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-04-03T18:36:52+00:00', 'title': 'AgenticAI-v2.0', 'moddate': '2025-04-03T18:36:40+00:00', 'keywords': 'DAGjmPTBGs4,BAEmsmap8Lg,0', 'author': 'monal singh', 'source': '/Users/amarmandal/Documents/coding/GenAI_Course/data/AgenticAI-v2.0.pdf', 'total_pages': 24, 'page': 0, 'page_label': '1'}, page_content='CI/CD &\nDEPLOYMENT\nMONITORING &\nCLOUD\nAGENTIC\nFRAMEWORKS'),
 Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-04-03T18:36:52+00:00', 'title': 'AgenticAI-v2.0', 'moddate': '2025-04-03T18:36:40+00:00', 'keywords': 'DAGjmPTBGs4,BAEmsmap8Lg,0', 'author': 'monal singh', 'source': '/Users/amarmandal/Documents/coding/GenAI_Course/data/AgenticAI-v2.0.pdf', 'total_pages': 24, 'page': 1, 'page_label': '2'}, page_content='This course is designed for AI developers, machine learning engineers, data scientists, and\nsoftware engineers looking to build expertise in agentic AI, multi

# CharacterTextSplitter


In [4]:
from langchain_text_splitters import CharacterTextSplitter
# Split on blank lines ("\n\n") only, with chunk_size=100 and chunk_overlap=20.
# NOTE(review): the page text shown in the loader output uses single "\n" line
# breaks. If a page contains no "\n\n", this splitter presumably has nothing to
# split on and may return chunks longer than chunk_size -- confirm against the
# cell output.
character_text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=100, chunk_overlap=20)

final_documents = character_text_splitter.split_documents(data)
final_documents

[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-04-03T18:36:52+00:00', 'title': 'AgenticAI-v2.0', 'moddate': '2025-04-03T18:36:40+00:00', 'keywords': 'DAGjmPTBGs4,BAEmsmap8Lg,0', 'author': 'monal singh', 'source': '/Users/amarmandal/Documents/coding/GenAI_Course/data/AgenticAI-v2.0.pdf', 'total_pages': 24, 'page': 0, 'page_label': '1'}, page_content='CI/CD &\nDEPLOYMENT\nMONITORING &\nCLOUD\nAGENTIC\nFRAMEWORKS'),
 Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-04-03T18:36:52+00:00', 'title': 'AgenticAI-v2.0', 'moddate': '2025-04-03T18:36:40+00:00', 'keywords': 'DAGjmPTBGs4,BAEmsmap8Lg,0', 'author': 'monal singh', 'source': '/Users/amarmandal/Documents/coding/GenAI_Course/data/AgenticAI-v2.0.pdf', 'total_pages': 24, 'page': 1, 'page_label': '2'}, page_content='This course is designed for AI developers, machine learning engineers, data scientists, and\nsoftware engineers looking to build expertise in agentic AI, multi

# HTML Splitter

In [9]:
from langchain_text_splitters import HTMLHeaderTextSplitter


# Sample HTML page used as input for the header-based splitter below. It
# contains two <h2> headings ("CSS Layout Float", "Cities") and one <h1>
# heading ("London"), plus paragraphs, a nav list and a footer.
doc = """
<!DOCTYPE html>
<html lang="en">
<head>
<title>CSS Template</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">

</head>
<body>

<h2>CSS Layout Float</h2>
<p>In this example, we have created a header, two columns/boxes and a footer. On smaller screens, the columns will stack on top of each other.</p>
<p>Resize the browser window to see the responsive effect (you will learn more about this in our next chapter - HTML Responsive.)</p>

<header>
  <h2>Cities</h2>
</header>

<section>
  <nav>
    <ul>
      <li><a href="#">London</a></li>
      <li><a href="#">Paris</a></li>
      <li><a href="#">Tokyo</a></li>
    </ul>
  </nav>
  
  <article>
    <h1>London</h1>
    <p>London is the capital city of England. It is the most populous city in the  United Kingdom, with a metropolitan area of over 13 million inhabitants.</p>
    <p>Standing on the River Thames, London has been a major settlement for two millennia, its history going back to its founding by the Romans, who named it Londinium.</p>
  </article>
</section>

<footer>
  <p>Footer</p>
</footer>

</body>
</html>


"""

# (HTML tag, metadata key) pairs: the splitter breaks the page at <h1> and <h2>
# headings, and the heading text appears under the given key in the metadata of
# the resulting Documents (see the output below).
headers_to_split = [("h1", "Header 1"), ("h2", "Header 2")]
html_header_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split)

# split_text takes the raw HTML string and returns a list of Documents.
final_documents = html_header_splitter.split_text(doc)
final_documents

[Document(metadata={'Header 2': 'CSS Layout Float'}, page_content='CSS Layout Float'),
 Document(metadata={'Header 2': 'CSS Layout Float'}, page_content='In this example, we have created a header, two columns/boxes and a footer. On smaller screens, the columns will stack on top of each other.  \nResize the browser window to see the responsive effect (you will learn more about this in our next chapter - HTML Responsive.)'),
 Document(metadata={'Header 2': 'Cities'}, page_content='Cities'),
 Document(metadata={'Header 2': 'Cities'}, page_content='London  \nParis  \nTokyo'),
 Document(metadata={'Header 1': 'London'}, page_content='London'),
 Document(metadata={}, page_content='London is the capital city of England. It is the most populous city in the  United Kingdom, with a metropolitan area of over 13 million inhabitants.  \nStanding on the River Thames, London has been a major settlement for two millennia, its history going back to its founding by the Romans, who named it Londinium.  \n

# JSON Splitter

In [11]:
from langchain_text_splitters import RecursiveJsonSplitter
import json

import requests

# Download the LangSmith OpenAPI spec; .json() parses it into a large nested
# Python dict.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces an HTTP error directly instead of letting it
# show up later as a confusing JSON decoding failure.
response = requests.get("https://api.smith.langchain.com/openapi.json", timeout=30)
response.raise_for_status()
json_data = response.json()

# NOTE(review): max_chunk_size=4 is very small -- the recorded output shows
# chunks holding only one or two leaf values each. Confirm this is intentional
# for the demo.
splitter = RecursiveJsonSplitter(max_chunk_size=4)
json_chunks = splitter.split_json(json_data=json_data)
json_chunks

[{'openapi': '3.1.0', 'info': {'title': 'LangSmith'}},
 {'info': {'version': '0.1.0'},
  'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'tags': ['tracer-sessions']}}}},
 {'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'summary': 'Get Tracing Project Prebuilt Dashboard'}}}},
 {'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'description': 'Get a prebuilt dashboard for a tracing project.'}}}},
 {'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'operationId': 'get_tracing_project_prebuilt_dashboard_api_v1_sessions__session_id__dashboard_post'}}}},
 {'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'security': [{'API Key': []},
      {'Tenant ID': []},
      {'Bearer Auth': []}]}}}},
 {'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'parameters': [{'name': 'session_id',
       'in': 'path',
       'required': True,
       'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}},
      {'