## Text Splitting from Documents

In [None]:
from langchain_community.document_loaders   import PyPDFLoader
from langchain_community.document_loaders   import TextLoader
from langchain_text_splitters               import RecursiveCharacterTextSplitter, CharacterTextSplitter, HTMLHeaderTextSplitter, RecursiveJsonSplitter

In [None]:
# Load the sample PDF through LangChain's PDF loader and display the result.
pdfLoader = PyPDFLoader(file_path='data/attention.pdf')
pdfDocument = pdfLoader.load()
pdfDocument

## Recursively Split Text by Characters

In [None]:
# Chunking parameters passed to the splitter below.
chunkSize = 500
chunkOverlap = 50

# Split the loaded PDF documents into chunks, with consecutive chunks
# allowed to share up to `chunkOverlap` of content.
textSplitter = RecursiveCharacterTextSplitter(
    chunk_size=chunkSize,
    chunk_overlap=chunkOverlap,
)
finalDoc = textSplitter.split_documents(pdfDocument)
finalDoc

In [None]:
# Inspect the first chunk produced from the PDF.
print(finalDoc[0])

In [None]:
# Inspect the second chunk produced from the PDF.
print(finalDoc[1])

In [None]:
# Load the speech transcript through LangChain's plain-text loader.
textLoader = TextLoader(file_path='data/speech.txt')
docs = textLoader.load()

In [None]:
# Display what TextLoader returned for data/speech.txt.
docs

In [None]:
# Read the speech transcript into a single raw string (no loader involved).
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes data/speech.txt is UTF-8 encoded - confirm.
# `speech` is deliberately not pre-initialised to "": if the file cannot be
# read, later cells should fail loudly instead of splitting an empty string.
with open('data/speech.txt', encoding='utf-8') as file:
    speech = file.read()

speech

In [None]:
# Re-create the splitter with small, non-overlapping chunks and apply it to
# the raw speech string; create_documents takes a list of texts.
textSplitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=0,
)
text = textSplitter.create_documents(texts=[speech])

In [None]:
# Show the first two chunks created from the raw speech string.
print(text[0])
print(text[1])

## Character Text Splitter

In [None]:
# Split the loaded speech documents on blank lines (the '\n\n' separator).
charSplitter = CharacterTextSplitter(
    separator='\n\n',
    chunk_size=100,
    chunk_overlap=10,
)
charSplitter.split_documents(docs)

## HTML Header Splitter

In [None]:
# Sample HTML page with nested h1/h2/h3 headings, used as input for the
# header-based splitter demonstrated below.
html_string = """
<!DOCTYPE html>
<html>
<body>
    <div>
        <h1>Foo</h1>
        <p>Some intro text about Foo.</p>
        <div>
            <h2>Bar main section</h2>
            <p>Some intro text about Bar.</p>
            <h3>Bar subsection 1</h3>
            <p>Some text about the first subtopic of Bar.</p>
            <h3>Bar subsection 2</h3>
            <p>Some text about the second subtopic of Bar.</p>
        </div>
        <div>
            <h2>Baz</h2>
            <p>Some text about Baz</p>
        </div>
        <br>
        <p>Some concluding text about Foo</p>
    </div>
</body>
</html>
"""

In [None]:
# Heading tags to split on, each paired with the label given to that level.
headersToSplitOn = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]

# Split the in-memory HTML string at the configured heading tags.
htmlSplitter = HTMLHeaderTextSplitter(headers_to_split_on=headersToSplitOn)
htmlSplits = htmlSplitter.split_text(html_string)
htmlSplits

In [None]:
# Page that is fetched and split directly from the web.
url = "https://plato.stanford.edu/entries/goedel"

# Same idea as the in-memory example, extended down to h4 headings.
headersToSplitOn = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
    ("h4", "Header 4"),
]

htmlSplitter = HTMLHeaderTextSplitter(headers_to_split_on=headersToSplitOn)
htmlSplits = htmlSplitter.split_text_from_url(url)
htmlSplits

## JSON Splitter

In [None]:
import json
import requests

# Download the LangSmith OpenAPI specification as a Python dict.
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here instead of letting an
# error page reach .json() and fail (or be split) further down.
response = requests.get("https://api.smith.langchain.com/openapi.json", timeout=30)
response.raise_for_status()
jsonData = response.json()

In [None]:
# Split the nested JSON into smaller dicts, with the chunk size capped by
# max_chunk_size.
jsonSplitter = RecursiveJsonSplitter(max_chunk_size=300)
jsonChunks = jsonSplitter.split_json(json_data=jsonData)
jsonChunks

In [67]:
# Preview only the first three chunks rather than the whole list.
previewChunks = jsonChunks[:3]
for chunk in previewChunks:
    print(chunk)

{'openapi': '3.1.0', 'info': {'title': 'LangSmith', 'version': '0.1.0'}, 'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'], 'summary': 'Read Tracer Session', 'description': 'Get a specific session.'}}}}
{'paths': {'/api/v1/sessions/{session_id}': {'get': {'operationId': 'read_tracer_session_api_v1_sessions__session_id__get', 'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}}
{'paths': {'/api/v1/sessions/{session_id}': {'get': {'parameters': [{'name': 'session_id', 'in': 'path', 'required': True, 'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}}, {'name': 'include_stats', 'in': 'query', 'required': False, 'schema': {'type': 'boolean', 'default': False, 'title': 'Include Stats'}}, {'name': 'accept', 'in': 'header', 'required': False, 'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'title': 'Accept'}}]}}}}


In [70]:
# Produce document objects from the JSON; each one carries a chunk as a
# JSON string in page_content (see the printed output).
documents = jsonSplitter.create_documents(texts=[jsonData])

# Preview the first three documents.
previewDocs = documents[:3]
for doc in previewDocs:
    print(doc)

page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/api/v1/sessions/{session_id}": {"get": {"tags": ["tracer-sessions"], "summary": "Read Tracer Session", "description": "Get a specific session."}}}}'
page_content='{"paths": {"/api/v1/sessions/{session_id}": {"get": {"operationId": "read_tracer_session_api_v1_sessions__session_id__get", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}'
page_content='{"paths": {"/api/v1/sessions/{session_id}": {"get": {"parameters": [{"name": "session_id", "in": "path", "required": true, "schema": {"type": "string", "format": "uuid", "title": "Session Id"}}, {"name": "include_stats", "in": "query", "required": false, "schema": {"type": "boolean", "default": false, "title": "Include Stats"}}, {"name": "accept", "in": "header", "required": false, "schema": {"anyOf": [{"type": "string"}, {"type": "null"}], "title": "Accept"}}]}}}}'


In [71]:
# split_text returns the chunks as JSON-formatted strings (see the printed
# output). NOTE: this rebinds `text`, which an earlier cell used for the
# speech chunks.
text = jsonSplitter.split_text(json_data=jsonData)
print(text[0])
print(text[1])

{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/api/v1/sessions/{session_id}": {"get": {"tags": ["tracer-sessions"], "summary": "Read Tracer Session", "description": "Get a specific session."}}}}
{"paths": {"/api/v1/sessions/{session_id}": {"get": {"operationId": "read_tracer_session_api_v1_sessions__session_id__get", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}
