In [None]:
import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TextSplitter,
    TokenTextSplitter,
)
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
import warnings

# Suppress every Python warning for the rest of the session; this keeps cell
# output compact but also hides deprecation notices from the libraries above.
warnings.filterwarnings("ignore")

In [None]:
# Anchor every path to the directory the notebook is executed from.
current_dir = os.getcwd()
# NOTE: despite the "_dir" suffix, books_dir is the path of the single input text file.
books_dir = os.path.join(os.path.join(current_dir, "books"), "romeo_and_juliet.txt")
# Root folder under which each persisted vector store gets its own sub-directory.
db_dir = os.path.join(current_dir, "db")

In [None]:
# Fail fast with a clear message if the input text is missing.
if not os.path.exists(books_dir):
    raise FileNotFoundError(f"The file {books_dir} does not exist. Please check the path.")

# Read the file as UTF-8 explicitly. Without an encoding, TextLoader falls back to the
# platform default, which breaks on the play's typographic quotes on non-UTF-8 systems.
loader = TextLoader(books_dir, encoding="utf-8")
documents = loader.load()

# Embedding model shared by every vector store built and queried in this notebook.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def create_vecotr_space(docs, store_name):
    """Build and persist a Chroma vector store for ``docs`` unless it already exists.

    Args:
        docs: Documents (already split into chunks) to embed and index.
        store_name: Name of the sub-directory of ``db_dir`` that holds the store.

    Returns:
        None. Progress is reported via ``print``.
    """
    persistent_directory = os.path.join(db_dir, store_name)

    # An existing directory is treated as a finished store, so re-running the
    # notebook does not re-embed the whole document.
    if os.path.exists(persistent_directory):
        print(f"Vector store {store_name} already exists. No need to initialize.")
        return

    # from_documents embeds the chunks with the notebook-level `embeddings` model and
    # is given persist_directory for storage; the returned store object is not needed here.
    Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=persistent_directory,
    )
    print(f"--- Finished creating vector store {store_name} ---")

In [None]:
# 1. Character-based Splitting
# Chunks the text by character count (chunk_size=1000) and lets consecutive chunks
# share 100 characters, so chunk size does not depend on the document's structure.
print("\n--- Using Character-based Splitting ---")
char_splitter = CharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
char_docs = char_splitter.split_documents(documents)
create_vecotr_space(docs=char_docs, store_name="chroma_db_char")




--- Using Character-based Splitting ---
--- Finished creating vector store chroma_db_char ---


In [None]:
# 2. Sentence-Transformers Token-based Splitting
# Despite the name, this splitter does NOT cut at sentence boundaries: it chunks the text
# by token count using a sentence-transformers tokenizer, sizing chunks to that model's
# maximum sequence length. The tokenizer is therefore tied to the embedding model used in
# this notebook; the splitter's default model has a larger window, so its chunks would be
# truncated when embedded.
print("\n--- Using Sentence-based Splitting ---")
sentence_splitter = SentenceTransformersTokenTextSplitter(
    model_name=embeddings.model_name,
    chunk_overlap=100,
)
sentence_docs = sentence_splitter.split_documents(documents)
create_vecotr_space(sentence_docs, "chroma_db_sentence")


--- Using Sentence-based Splitting ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

--- Finished creating vector store chroma_db_sentence ---


In [None]:
# 3. Token-based Splitting
# Chunks the text by token count (words or subwords, GPT-2 style tokenization) rather than
# by characters, which suits transformer models that enforce a hard token limit.
# NOTE(review): 512 tokens per chunk may exceed what the embedding model accepts before
# truncating - confirm against the model's max sequence length.
print("\n--- Using Token-based Splitting ---")
token_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=512)
token_docs = token_splitter.split_documents(documents)
create_vecotr_space(docs=token_docs, store_name="chroma_db_token")


--- Using Token-based Splitting ---
--- Finished creating vector store chroma_db_token ---


In [None]:
# 4. Recursive Character-based Splitting
# Prefers to break the text at natural boundaries while staying within the character
# budget (chunk_size=1000, overlap=100), trading off coherence against size limits.
print("\n--- Using Recursive Character-based Splitting ---")
recursive_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
recursive_docs = recursive_splitter.split_documents(documents)
create_vecotr_space(docs=recursive_docs, store_name="chroma_db_recursive")


--- Using Recursive Character-based Splitting ---
--- Finished creating vector store chroma_db_recursive ---


In [None]:
# 5. Custom Splitting
# Allows creating custom splitting logic based on specific requirements.
# Useful for documents with unique structure that standard splitters can't handle.
print("\n--- Using Custom Splitting ---")


class CustomSplitter(TextSplitter):
    """Splitter that treats each blank-line-separated paragraph as one chunk."""

    def split_text(self, text):
        """Return the non-empty paragraphs of ``text``.

        Runs of several blank lines make a plain split on double newlines yield
        empty or whitespace-only pieces; those are dropped so they are not
        embedded and stored as meaningless documents.

        Args:
            text: Full text to split.

        Returns:
            List of paragraph strings, in document order, content unmodified.
        """
        return [paragraph for paragraph in text.split("\n\n") if paragraph.strip()]


custom_splitter = CustomSplitter()
custom_docs = custom_splitter.split_documents(documents)
create_vecotr_space(custom_docs, "chroma_db_custom")


--- Using Custom Splitting ---
--- Finished creating vector store chroma_db_custom ---


In [None]:
# Function to query a vector store
def query_vector_store(store_name, query):
    """Print the best match for ``query`` from the persisted store ``store_name``.

    Args:
        store_name: Sub-directory of ``db_dir`` holding the Chroma store.
        query: Natural-language question to search for.

    Returns:
        None. Results (or a "does not exist" notice) are printed.
    """
    store_path = os.path.join(db_dir, store_name)

    # Guard clause: there is nothing to query if the store was never built.
    if not os.path.exists(store_path):
        print(f"Vector store {store_name} does not exist.")
        return

    print(f"\n--- Querying the Vector Store {store_name} ---")
    vector_store = Chroma(persist_directory=store_path, embedding_function=embeddings)

    # Ask for a single hit and discard anything scoring below the threshold.
    search_options = {"k": 1, "score_threshold": 0.1}
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs=search_options,
    )
    matches = retriever.invoke(query)

    # Show each hit's text, followed by its source when metadata is present.
    print(f"\n--- Relevant Documents for {store_name} ---")
    for position, match in enumerate(matches, start=1):
        print(f"Document {position}:\n{match.page_content}\n")
        if match.metadata:
            print(f"Source: {match.metadata.get('source', 'Unknown')}\n")

In [None]:
# Question posed to every vector store so the splitting strategies can be compared.
query = "How did Juliet die?"

In [None]:
# Query each vector store. The names must match exactly the ones the stores were
# created with; a mismatched name only prints "does not exist" and skips the store.
for store_name in (
    "chroma_db_char",
    "chroma_db_sentence",
    "chroma_db_token",
    "chroma_db_recursive",
    "chroma_db_custom",
):
    query_vector_store(store_name, query)


--- Querying the Vector Store chroma_db_char ---

--- Relevant Documents for chroma_db_char ---
Document 1:
FRIAR LAWRENCE.
I will be brief, for my short date of breath
Is not so long as is a tedious tale.
Romeo, there dead, was husband to that Juliet,
And she, there dead, that Romeo’s faithful wife.
I married them; and their stol’n marriage day
Was Tybalt’s doomsday, whose untimely death
Banish’d the new-made bridegroom from this city;
For whom, and not for Tybalt, Juliet pin’d.
You, to remove that siege of grief from her,
Betroth’d, and would have married her perforce
To County Paris. Then comes she to me,
And with wild looks, bid me devise some means
To rid her from this second marriage,
Or in my cell there would she kill herself.
Then gave I her, so tutored by my art,
A sleeping potion, which so took effect
As I intended, for it wrought on her
The form of death. Meantime I writ to Romeo
That he should hither come as this dire night
To help to take her from her borrow’d grave,
Bein

In [None]:
import shutil

# Zip the vector-store folder into "<db_dir>.zip". Using db_dir rather than a hard-coded
# '/content/db' keeps this cell in sync with the path the stores were written to; when the
# notebook runs from /content both resolve to the same location.
shutil.make_archive(db_dir, 'zip', db_dir)

'/content/db.zip'

In [None]:
# Colab-only step: hand the zipped vector stores to the browser as a file download.
# The path is hard-coded to Colab's /content working directory.
from google.colab import files
files.download('/content/db.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>