In [1]:
from langchain.vectorstores import Chroma

In [2]:
# Directory name handed to Chroma as `persist_directory` when the vector store is built.
persist_directory = "db"

In [8]:
# Sample sentences used as toy input; only docs[0] is split and indexed below.
docs = [
    "We need to first load the blog post contents.",
    "Let’s go through the above code step-by-step to really understand what’s going on.",
    "In this guide we’ll build a QA app over as website.",
]

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Deliberately tiny chunks (10 characters with 7 characters of overlap) so that
# a single sentence produces several splits to experiment with.
# NOTE(review): add_start_index presumably only matters for Document-producing
# methods, not for split_text() used below — confirm whether it is needed.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=7,
    chunk_size=10,
)

# Only the first sample sentence is split; the other entries of `docs` are not used here.
all_splits = text_splitter.split_text(docs[0])

# Number of chunks produced (shown as the cell output).
len(all_splits)

6

In [12]:
all_splits

['We need to', 'to first', 'load the', 'the blog', 'blog post', 'contents.']

In [19]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Checkpoint name passed to the HuggingFace embedding wrapper.
model_name = "sentence-transformers/all-mpnet-base-v2"

# Keyword arguments forwarded to the wrapper: device is set to 'cpu' and
# normalize_embeddings is switched off.
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}

# Embedding object later handed to Chroma as its `embedding` argument.
hf = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [20]:
# Embed every chunk in `all_splits` with `hf` and store the result in the
# 'testing_Collection' collection, using `persist_directory` as the storage
# location.
#
# `metadatas` is built with one dict per chunk so that it lines up one-to-one
# with `texts`; a shorter list would presumably leave the remaining chunks
# without a "source" entry.
#
# NOTE(review): re-running this cell against the same persist_directory
# presumably adds the same chunks again — confirm, and clear the directory
# between runs if duplicates matter.
vectordb = Chroma.from_texts(
    texts=all_splits,
    embedding=hf,
    persist_directory=persist_directory,
    collection_name='testing_Collection',
    metadatas=[{"source": 'fisrt doc'} for _ in all_splits],
)

In [21]:
# Similarity-search retriever over the vector store, asking for up to 6 hits per query.
retriever = vectordb.as_retriever(
    search_kwargs={"k": 6},
    search_type="similarity",
)

# Run one sample query through the retriever.
retrieved_docs = retriever.invoke("What we need?")

# Number of hits returned (shown as the cell output).
len(retrieved_docs)

6

In [24]:
# Show the text of the first retrieved hit for the sample query.
retrieved_docs[0].page_content

'We need to'