In [1]:
import os
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TextSplitter,
    TokenTextSplitter
)
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

In [3]:
file_path = "../4_rag/books/romeo_and_juliet.txt"
db_dir = "db"

In [5]:
# Check if the text file exists
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"The file {file_path} does not exist. Please check the path."
    )

In [6]:
# Read the text content from the file
loader = TextLoader(file_path)
documents = loader.load()

# 1. Character-based Splitting
* Splits text into chunks based on a specified number of characters.
* Useful for consistent chunk sizes regardless of content structure.

In [None]:
print("\n--- Using Character-based Splitting ---")
char_splitter = CharacterTextSplitter(chunk_size=50, chunk_overlap=10)
char_docs = char_splitter.split_documents(documents)

Created a chunk of size 496, which is longer than the specified 49
Created a chunk of size 97, which is longer than the specified 49
Created a chunk of size 85, which is longer than the specified 49
Created a chunk of size 62, which is longer than the specified 49
Created a chunk of size 142, which is longer than the specified 49
Created a chunk of size 207, which is longer than the specified 49
Created a chunk of size 210, which is longer than the specified 49
Created a chunk of size 183, which is longer than the specified 49
Created a chunk of size 134, which is longer than the specified 49
Created a chunk of size 143, which is longer than the specified 49
Created a chunk of size 227, which is longer than the specified 49
Created a chunk of size 303, which is longer than the specified 49
Created a chunk of size 223, which is longer than the specified 49
Created a chunk of size 88, which is longer than the specified 49
Created a chunk of size 640, which is longer than the specified 49


--- Using Character-based Splitting ---


Created a chunk of size 87, which is longer than the specified 49
Created a chunk of size 63, which is longer than the specified 49
Created a chunk of size 138, which is longer than the specified 49
Created a chunk of size 198, which is longer than the specified 49
Created a chunk of size 144, which is longer than the specified 49
Created a chunk of size 1437, which is longer than the specified 49
Created a chunk of size 57, which is longer than the specified 49
Created a chunk of size 365, which is longer than the specified 49
Created a chunk of size 54, which is longer than the specified 49
Created a chunk of size 161, which is longer than the specified 49
Created a chunk of size 137, which is longer than the specified 49
Created a chunk of size 98, which is longer than the specified 49
Created a chunk of size 261, which is longer than the specified 49
Created a chunk of size 82, which is longer than the specified 49
Created a chunk of size 53, which is longer than the specified 49
C

In [22]:
print(f"""
Number of documents: {len(char_docs)}
1st of chunk len: {len(char_docs[0].page_content)}
{char_docs[0].page_content}
""")


Number of documents: 1100
1st of chunk len: 496
﻿The Project Gutenberg eBook of Romeo and Juliet
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.



# 2. Sentence-based Splitting
* Splits text into chunks based on sentences, ensuring chunks end at sentence boundaries.
* Ideal for maintaining semantic coherence within chunks.

In [16]:

print("\n--- Using Sentence-based Splitting ---")
sent_splitter = SentenceTransformersTokenTextSplitter(chunk_size=100)
sent_docs = sent_splitter.split_documents(documents)


--- Using Sentence-based Splitting ---


  from tqdm.autonotebook import tqdm, trange


In [24]:
print(f"""
Number of documents: {len(sent_docs)}
1st of chunk len: {len(sent_docs[0].page_content)}
{sent_docs[0].page_content}
end 20 char: {sent_docs[0].page_content[-20:]}
""")


Number of documents: 127
1st of chunk len: 1587
the project gutenberg ebook of romeo and juliet this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever. you may copy it, give it away or re - use it under the terms of the project gutenberg license included with this ebook or online at www. gutenberg. org. if you are not located in the united states, you will have to check the laws of the country where you are located before using this ebook. title : romeo and juliet author : william shakespeare release date : november 1, 1998 [ ebook # 1513 ] most recently updated : june 27, 2023 language : english credits : the pg shakespeare team, a team of about twenty project gutenberg volunteers * * * start of the project gutenberg ebook romeo and juliet * * * the tragedy of romeo and juliet by william shakespeare contents the prologue. act i scene i. a public place. scene ii. a street. scene iii. roo

# 3. Token-based Splitting
* Splits text into chunks based on tokens (words or subwords), using tokenizers like GPT-2.
* Useful for transformer models with strict token limits.

In [23]:

print("\n--- Using Token-based Splitting ---")
token_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=64)
token_docs = token_splitter.split_documents(documents)


--- Using Token-based Splitting ---


In [25]:
print(f"""
Number of documents: {len(token_docs)}
1st of chunk len: {len(token_docs[0].page_content)}
{token_docs[0].page_content}
end 20 char: {token_docs[0].page_content[-20:]}
""")


Number of documents: 787
1st of chunk len: 257
﻿The Project Gutenberg eBook of Romeo and Juliet
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under
end 20 char: y or re-use it under



# 4. Recursive Character-based Splitting
* Attempts to split text at natural boundaries (sentences, paragraphs) within character limit.
* Balances between maintaining coherence and adhering to character limits.

In [27]:

print("\n--- Using Recursive Character-based Splitting ---")
rec_char_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50, chunk_overlap=10)
rec_char_docs = rec_char_splitter.split_documents(documents)


--- Using Recursive Character-based Splitting ---


In [28]:
print(f"""
Number of documents: {len(rec_char_docs)}
1st of chunk len: {len(rec_char_docs[0].page_content)}
{rec_char_docs[0].page_content}
end 20 char: {rec_char_docs[0].page_content[-20:]}
""")


Number of documents: 4595
1st of chunk len: 48
﻿The Project Gutenberg eBook of Romeo and Juliet
end 20 char:  of Romeo and Juliet



# 5. Custom Splitting
* Allows creating custom splitting logic based on specific requirements.
* Useful for documents with unique structure that standard splitters can't handle.

In [29]:

print("\n--- Using Custom Splitting ---")


class CustomTextSplitter(TextSplitter):
    def split_text(self, text):
        # Custom logic for splitting text
        return text.split("\n\n")  # Example: split by paragraphs


custom_splitter = CustomTextSplitter()
custom_docs = custom_splitter.split_documents(documents)


--- Using Custom Splitting ---


In [30]:
print(f"""
Number of documents: {len(custom_docs)}
1st of chunk len: {len(custom_docs[0].page_content)}
{custom_docs[0].page_content}
end 20 char: {custom_docs[0].page_content[-20:]}
""")


Number of documents: 1165
1st of chunk len: 496
﻿The Project Gutenberg eBook of Romeo and Juliet
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.
end 20 char: re using this eBook.

