https://python.langchain.com/docs/integrations/vectorstores/

Chroma

Chroma is a AI-native open-source vector database focused on developer productivity and happiness. Chroma is licensed under Apache 2.0.

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = TextLoader("documents/sample.txt")
docs = loader.load()

###Splitting the documents into smaller chunks for better processing and embedding generation

text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
docs_split = text_splitter.split_documents(docs)

embeddings = OllamaEmbeddings(model="codellama")


In [5]:
chroma_db = Chroma.from_documents(docs_split, embeddings)
chroma_db

<langchain_chroma.vectorstores.Chroma at 0x10aa13c50>

In [14]:
### query it

query = "Attention mechanisms "
results = chroma_db.similarity_search(query)
results[0].page_content

'attention mechanisms'

In [15]:
### Saving the Chroma database for future use
vectordb = Chroma.from_documents(docs_split, embeddings, persist_directory="./chroma_db")

In [19]:
### Load from disk

db2 = Chroma(persist_directory="./chroma.db", embedding_function=embeddings)
docs = db2.similarity_search(query)
docs

[]

In [21]:
### Similarity Search with score

docs = vectordb.similarity_search_with_score(query)
docs

[(Document(id='0ed3d5c5-f36e-456b-ac87-02b62ae853af', metadata={'source': 'documents/sample.txt'}, page_content='attention mechanisms'),
  4035.410888671875),
 (Document(id='22838115-d035-4660-8045-e2f5b2fe8706', metadata={'source': 'documents/sample.txt'}, page_content='attention mechanisms'),
  4035.410888671875),
 (Document(id='a7bb790c-635f-4115-a126-430b7f6a962a', metadata={'source': 'documents/sample.txt'}, page_content='1 Introduction'),
  5381.8115234375),
 (Document(id='4e29abeb-9379-4412-a6b5-72428c686446', metadata={'source': 'documents/sample.txt'}, page_content='1 Introduction'),
  5381.8115234375)]

### Retriever Option

In [24]:
retriever = vectordb.as_retriever()
result_retrieve = retriever.invoke(query)[0].page_content
result_retrieve

'attention mechanisms'