### Character Splitter

In [1]:
from langchain.document_loaders import PyPDFLoader
loader  =PyPDFLoader("D:\Github\DeepLake-Langchain\dataset\pdf-sample.pdf")
pages = loader.load_and_split()


In [2]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size = 50, chunk_overlap = 20)
texts = text_splitter.split_documents(pages)

print(texts[0])

print (f"You have {len(texts)} documents")

print ("Preview:")
print (texts[0].page_content)

page_content="Adobe Acrobat PDF Files\nAdobe® Portable Document Format (PDF) is a universal file format that preserves all\nof the fonts, formatting, colours and graphics of any source document, regardless ofthe application and platform used to create it.\nAdobe PDF is an ideal format for electronic document distribution as it overcomes the\nproblems commonly encountered with electronic file sharing.\n• Anyone, anywhere  can open a PDF file. All you need is the free Adobe Acrobat\nReader. Recipients of other file formats sometimes can't open files because they\ndon't have the applications used to create the documents.\n• PDF files always print correctly  on any printing device.\n• PDF files always display exactly  as created, regardless of fonts, software, and\noperating systems. Fonts, and graphics are not lost due to platform, software, and\nversion incompatibilities.\n• The free Acrobat Reader is easy to download and can be freely distributed by\nanyone.\n• Compact PDF files are sma

### Recursive Character Text Splitter

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

with open("D:\Github\DeepLake-Langchain\dataset\state_of_the_union.txt", encoding="unicode-escape") as f:
    sample_text = f.read()

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 50,
    chunk_overlap = 10,
    length_function = len,
)

In [5]:
texts = text_splitter.create_documents([sample_text])
print(texts)

[Document(page_content='Madam Speaker, Madam Vice President, our First', metadata={}), Document(page_content='our First Lady and Second Gentleman. Members of', metadata={}), Document(page_content='of Congress and the Cabinet. Justices of the', metadata={}), Document(page_content='of the Supreme Court. My fellow Americans.', metadata={}), Document(page_content='Last year COVID-19 kept us apart. This year we', metadata={}), Document(page_content='year we are finally together again.', metadata={}), Document(page_content='Tonight, we meet as Democrats Republicans and', metadata={}), Document(page_content='and Independents. But most importantly as', metadata={}), Document(page_content='as Americans.', metadata={}), Document(page_content='With a duty to one another to the American people', metadata={}), Document(page_content='people to the Constitution.', metadata={}), Document(page_content='And with an unwavering resolve that freedom will', metadata={}), Document(page_content='will always t

### NLTK Text Splitter

In [6]:
# Load a long document
with open('D:\Github\DeepLake-Langchain\dataset\state_of_the_union.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

from langchain.text_splitter import NLTKTextSplitter
text_splitter = NLTKTextSplitter(chunk_size=500)


texts = text_splitter.split_text(sample_text)
print(texts)

['Madam Speaker, Madam Vice President, our First Lady and Second Gentleman.\n\nMembers of Congress and the Cabinet.\n\nJustices of the Supreme Court.\n\nMy fellow Americans.\n\nLast year COVID-19 kept us apart.\n\nThis year we are finally together again.\n\nTonight, we meet as Democrats Republicans and Independents.\n\nBut most importantly as Americans.\n\nWith a duty to one another to the American people to the Constitution.\n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.', 'But most importantly as Americans.\n\nWith a duty to one another to the American people to the Constitution.\n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.\n\nSix days ago, Russiaâ\x80\x99s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways.\n\nBut he badly miscalculated.\n\nHe thought he could roll into Ukraine and the world would roll over.\n\nInstead he met a wall of strength he n

In [7]:
print(len(texts))

124


### Spacy TextSplitter

In [8]:
from langchain.text_splitter import SpacyTextSplitter

# Load a long document
with open('D:\Github\DeepLake-Langchain\dataset\state_of_the_union.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

# Instantiate the SpacyTextSplitter with the desired chunk size
text_splitter = SpacyTextSplitter(chunk_size=500, chunk_overlap=20)


# Split the text using SpacyTextSplitter
texts = text_splitter.split_text(sample_text)

# Print the first chunk
print(texts[0])

Madam Speaker, Madam Vice President, our First Lady and Second Gentleman.

Members of Congress and the Cabinet.

Justices of the Supreme Court.

My fellow Americans.  



Last year COVID-19 kept us apart.

This year we are finally together again. 



Tonight, we meet as Democrats Republicans and Independents.

But most importantly as Americans. 



With a duty to one another to the American people to the Constitution.


### MarkDown Text Splitter

In [9]:
from langchain.text_splitter import MarkdownTextSplitter
markdown_text = """
# 

# Welcome to My Blog!

## Introduction
Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python, Java, and JavaScript.

Here's a list of my favorite programming languages:

1. Python
2. JavaScript
3. Java

You can check out some of my projects on [GitHub](https://github.com).

## About this Blog
In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on the latest technology trends, and occasional book reviews.

Here's a small piece of Python code to say hello:

\``` python
def say_hello(name):
    print(f"Hello, {name}!")

say_hello("John")
\```

Stay tuned for more updates!

## Contact Me
Feel free to reach out to me on [Twitter](https://twitter.com) or send me an email at johndoe@email.com.

"""

markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)
docs = markdown_splitter.create_documents([markdown_text])

print(docs)

[Document(page_content='# \n\n# Welcome to My Blog!', metadata={}), Document(page_content='## Introduction', metadata={}), Document(page_content='Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python,', metadata={}), Document(page_content='Java, and JavaScript.', metadata={}), Document(page_content="Here's a list of my favorite programming languages:\n\n1. Python\n2. JavaScript\n3. Java", metadata={}), Document(page_content='You can check out some of my projects on [GitHub](https://github.com).', metadata={}), Document(page_content='## About this Blog', metadata={}), Document(page_content="In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on", metadata={}), Document(page_content='the latest technology trends, and occasional book reviews.', metadata={}), Document(page_content="Here's a small piece of Python code to say hello:", metadata={}), Document(page_content='\\``` python\ndef say_hello(name):\n

### TokenText Splitter

In [10]:
from langchain.text_splitter import TokenTextSplitter

# Load a long document
with open('D:\Github\DeepLake-Langchain\dataset\state_of_the_union.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

# Initialize the TokenTextSplitter with desired chunk size and overlap
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=50)

# Split into smaller chunks
texts = text_splitter.split_text(sample_text)
print(texts[0])

Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  

Last year COVID-19 kept us apart. This year we are finally together again. 

Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. 

With a duty to one another to the American people to the Constitution. 

And with an
