In [13]:
from langchain_community.document_loaders import TextLoader

# Load the speech transcript from disk; the bare `docs` on the last line displays it.
# NOTE(review): no encoding is passed, so decoding presumably falls back to the
# platform default — the text contains a non-ASCII ellipsis, so consider pinning
# encoding="utf-8" once the file's encoding is confirmed.
loader = TextLoader(file_path="data/speech.txt")
docs = loader.load()
docs

[Document(metadata={'source': 'data/speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\n…\n\nIt will be all the easier for us to conduct ourselves as belligerents in a high spirit of right and fairne

In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# create_documents() is handed plain strings only, so the resulting chunks carry
# empty metadata (the loader's 'source' entry is not propagated).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
text_splitter.create_documents(texts=[d.page_content for d in docs])

[Document(metadata={}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.'),
 Document(metadata={}, page_content='Just because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\n…'),
 Document(metadata={}, page_content='It will be all the easier for us to conduct ourselves as be

In [15]:
from langchain_text_splitters import CharacterTextSplitter

# The only separator is a blank line ("\n\n"). The recorded run warns that several
# chunks come out longer than the requested chunk_size=100.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20, separator="\n\n")
splitted = text_splitter.split_documents(docs)

Created a chunk of size 470, which is longer than the specified 100
Created a chunk of size 347, which is longer than the specified 100
Created a chunk of size 668, which is longer than the specified 100
Created a chunk of size 982, which is longer than the specified 100
Created a chunk of size 789, which is longer than the specified 100


In [16]:
from langchain_text_splitters import HTMLHeaderTextSplitter

# Sample page used to demonstrate header-based splitting.
# The literal previously opened with four quotes followed by a stray "html" line,
# which put a literal `"` and the word "html" in front of the doctype.
html_string = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Fake Webpage</title>
</head>
<body>
    <h1>Welcome to My Fake Page</h1>
    <p>This is a completely made-up webpage for demonstration purposes. It's not connected to any real content or services.</p>
    <img src="https://via.placeholder.com/300x200" alt="Placeholder Image">
    <p>A nice placeholder image to fill the space.</p>
    <p>Here's some more text.  We can talk about anything, really. This is just to provide some more content to show how paragraphs 
work.</p>
    <p>You can visit <a href="https://www.example.com">Example Website</a> for more information. (This is a fake link!) </p>
    <h2>A Smaller Heading</h2>
    <ul>
        <li>Item 1</li>
        <li>Item 2</li>
        <li>Item 3</li>
    </ul>
    <ol>
    <li>First Step</li>
    <li>Second Step</li>
    </ol>
    <pre>
    This is preformatted text.
    It preserves spaces and line breaks.
    </pre>
</body>
</html>
"""

# (HTML tag, metadata key) pairs: text under each matching header is tagged with
# that header's text under the given key, as the displayed result shows.
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_splitter_splits = html_splitter.split_text(html_string)
html_splitter_splits

[Document(metadata={'Header 1': 'Welcome to My Fake Page'}, page_content='Welcome to My Fake Page'),
 Document(metadata={'Header 1': 'Welcome to My Fake Page'}, page_content="This is a completely made-up webpage for demonstration purposes. It's not connected to any real content or services.  \nA nice placeholder image to fill the space.  \nHere's some more text.  We can talk about anything, really. This is just to provide some more content to show how paragraphs \nwork.  \nYou can visit for more information. (This is a fake link!)  \nExample Website"),
 Document(metadata={'Header 1': 'Welcome to My Fake Page', 'Header 2': 'A Smaller Heading'}, page_content='A Smaller Heading'),
 Document(metadata={'Header 1': 'Welcome to My Fake Page', 'Header 2': 'A Smaller Heading'}, page_content='Item 1  \nItem 2  \nItem 3  \nFirst Step  \nSecond Step  \nThis is preformatted text.\n    It preserves spaces and line breaks.')]

In [17]:
url = "https://plato.stanford.edu/entries/goedel/"

# Track heading levels h1 through h4, each recorded under a "Header <level>" metadata key.
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in range(1, 5)]

# split_text_from_url() is given the live page URL, so this cell needs network access.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text_from_url(url)
html_header_splits

[Document(metadata={}, page_content='End container NOTE: Script required for drop-down button to work (mirrors).  \nEnd header wrapper End content End footer  \nEnd header  \nEnd navigation End search  \nStanford Encyclopedia of Philosophy  \nMenu  \nBrowse  \nTable of Contents  \nWhat\'s New  \nRandom Entry  \nChronological  \nArchives  \nAbout  \nEditorial Information  \nAbout the SEP  \nEditorial Board  \nHow to Cite the SEP  \nSpecial Characters  \nAdvanced Tools  \nContact  \nSupport SEP  \nSupport the SEP  \nPDFs for SEP Friends  \nMake a Donation  \nSEPIA for Libraries  \nBegin article sidebar End article sidebar NOTE: Article content must have two wrapper divs: id="article" and id="article-content" End article NOTE: article banner is outside of the id="article" div. End article-banner  \nEntry Navigation  \nEntry Contents  \nBibliography  \nAcademic Tools  \nFriends PDF Preview  \nAuthor and Citation Info  \nBack to Top  \nEnd article-content  \nBEGIN ARTICLE HTML #aueditable D

In [19]:
## JSON splitter
import json
import requests

# Download the LangSmith OpenAPI spec as a Python object.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of trying to parse an error body.
response = requests.get("https://api.smith.langchain.com/openapi.json", timeout=30)
response.raise_for_status()
json_data = response.json()

In [21]:
from langchain_text_splitters import RecursiveJsonSplitter

# Split the parsed JSON into chunks bounded by max_chunk_size=300.
json_splitter = RecursiveJsonSplitter(max_chunk_size=300)
json_chunks = json_splitter.split_json(json_data=json_data)

In [25]:
# Show the first three chunks. A plain loop avoids building a throwaway list of
# None values and rebinding `_`, which IPython uses for the previous output.
for chunk in json_chunks[:3]:
    print(chunk)

{'openapi': '3.1.0', 'info': {'title': 'LangSmith', 'version': '0.1.0'}, 'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'tags': ['tracer-sessions'], 'summary': 'Get Tracing Project Prebuilt Dashboard', 'description': 'Get a prebuilt dashboard for a tracing project.'}}}}
{'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'operationId': 'get_tracing_project_prebuilt_dashboard_api_v1_sessions__session_id__dashboard_post', 'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}}
{'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'parameters': [{'name': 'session_id', 'in': 'path', 'required': True, 'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}}, {'name': 'accept', 'in': 'header', 'required': False, 'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'title': 'Accept'}}]}}}}


In [28]:
# Produce Document objects from the JSON and print the first three.
# A plain loop replaces the side-effect-only list comprehension, which built a
# throwaway list and rebound `_` (IPython's previous-output variable).
docs = json_splitter.create_documents(texts=[json_data])
for doc in docs[:3]:
    print(doc)

page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"tags": ["tracer-sessions"], "summary": "Get Tracing Project Prebuilt Dashboard", "description": "Get a prebuilt dashboard for a tracing project."}}}}'
page_content='{"paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"operationId": "get_tracing_project_prebuilt_dashboard_api_v1_sessions__session_id__dashboard_post", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}'
page_content='{"paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"parameters": [{"name": "session_id", "in": "path", "required": true, "schema": {"type": "string", "format": "uuid", "title": "Session Id"}}, {"name": "accept", "in": "header", "required": false, "schema": {"anyOf": [{"type": "string"}, {"type": "null"}], "title": "Accept"}}]}}}}'


In [31]:
# Produce JSON-formatted string chunks and print the first three.
# A plain loop replaces the side-effect-only list comprehension, which built a
# throwaway list and rebound `_` (IPython's previous-output variable).
texts = json_splitter.split_text(json_data)
for text in texts[:3]:
    print(text)

{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"tags": ["tracer-sessions"], "summary": "Get Tracing Project Prebuilt Dashboard", "description": "Get a prebuilt dashboard for a tracing project."}}}}
{"paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"operationId": "get_tracing_project_prebuilt_dashboard_api_v1_sessions__session_id__dashboard_post", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}
{"paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"parameters": [{"name": "session_id", "in": "path", "required": true, "schema": {"type": "string", "format": "uuid", "title": "Session Id"}}, {"name": "accept", "in": "header", "required": false, "schema": {"anyOf": [{"type": "string"}, {"type": "null"}], "title": "Accept"}}]}}}}


In [4]:
## Reading a PDF file
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF into Document objects. The recorded metadata carries 'page' and
# 'total_pages' entries alongside the source path.
loader = PyPDFLoader(file_path="data/attention.pdf")
docs = loader.load()
docs

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlu

In [12]:
### How to recursively split text into characters
from langchain_text_splitters import RecursiveCharacterTextSplitter

# NOTE(review): this cell chunks whatever `docs` is currently bound to. The recorded
# output contains Wikipedia text rather than PDF pages, which suggests `docs` had been
# rebound by another loader cell before this ran — re-run the notebook in order to confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
final_docs = text_splitter.create_documents(texts=[d.page_content for d in docs])
final_docs

[Document(metadata={}, page_content='Generative AI pornography or simply AI pornography is a digitally created pornography produced through generative artificial intelligence (AI) technologies. Unlike traditional pornography, which involves real actors and cameras, this content is synthesized entirely by AI algorithms. These algorithms, including Generative adversarial network (GANs) and text-to-image models, generate lifelike images, videos, or animations from textual descriptions or datasets.'),
 Document(metadata={}, page_content='== History =='),
 Document(metadata={}, page_content="imagery, SD's public release led to dedicated communities exploring both artistic and explicit content, sparking ethical debates over open-access AI and its use in adult media. By 2020, AI tools had advanced to generate highly realistic adult content, amplifying calls for regulation."),
 Document(metadata={}, page_content='=== AI-generated influencers ==='),
 Document(metadata={}, page_content='One appl

In [5]:
# Inspect the class of the first loaded item (the recorded result is a LangChain Document).
type(docs[0])

langchain_core.documents.base.Document

In [6]:
## Web based loader
import bs4
from langchain_community.document_loaders import WebBaseLoader

url = "https://lilianweng.github.io/posts/2023-06-23-agent/"

# Restrict parsing to elements carrying the post's title, header, and content classes.
loader = WebBaseLoader(
    web_paths=[url],
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header")
        )
    },
)
docs = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [7]:
# Display the first loaded document (its metadata and page_content).
docs[0]

Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes

In [8]:
## Arxiv
from langchain_community.document_loaders import ArxivLoader

# Query arXiv for "1706.03762" and show how many documents came back
# (the recorded run returned one).
docs = ArxivLoader(query="1706.03762").load()
len(docs)

1

In [9]:
## Wikipedia
from langchain_community.document_loaders import WikipediaLoader

# Search Wikipedia for "Generative AI", capped by load_max_docs=2.
# Note that this rebinds the shared `docs` name used by the other loader cells.
docs = WikipediaLoader(query="Generative AI", load_max_docs=2).load()

In [10]:
# Only a cell's last expression is displayed, so the count is printed explicitly;
# previously the bare len(docs) result was silently discarded.
print(len(docs))
docs[0]

