### Chroma
- Chroma is a AI-native open source vector database focused on developer productivity and happiness. Chroma is licensed under Apache 2.0

In [3]:
# building a sample vectordb
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
## Ingestion
loader = TextLoader("speech.txt")
data = loader.load()
data

[Document(metadata={'source': 'speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\nI have said nothing of the governments allied with the Imperial government of Germany because they have not made war

In [7]:
## Transformation
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 150, chunk_overlap = 30)
splits = text_splitter.split_documents(data)

In [9]:
## Embeddings
embeddings = OllamaEmbeddings(model = "gemma:2b")
vectordb = Chroma.from_documents(documents = splits, embedding=embeddings)
vectordb

<langchain_chroma.vectorstores.Chroma at 0x21a2cc99010>

In [10]:
## querying the chroma vectordb
query = "What is the main topic of the speech?"
results = vectordb.similarity_search(query)


In [11]:
results[0].page_content

'may be for them, for the time being, to believe that this is spoken from our hearts.'

In [12]:
## Saving and loading the chroma vectordb
vectordb = Chroma.from_documents(documents = splits, embedding=embeddings, persist_directory="chroma_db")

In [13]:
## Loading the saved chroma db form the particular directory
vectordb2 = Chroma(persist_directory="chroma_db", embedding_function = embeddings)
docs = vectordb2.similarity_search(query)
print(docs[0].page_content)

may be for them, for the time being, to believe that this is spoken from our hearts.


In [14]:
## Similarity Search with scores(Based on distance) lesser the score, better the result.
docs = vectordb.similarity_search_with_score(query)
docs

[(Document(id='7f074bb0-bd87-428d-87b0-e253fab7646e', metadata={'source': 'speech.txt'}, page_content='may be for them, for the time being, to believe that this is spoken from our hearts.'),
  2110.533203125),
 (Document(id='205b7249-a628-492c-a792-8d87f4c43a9f', metadata={'source': 'speech.txt'}, page_content='and safety to all nations and make the world itself at last free.'),
  2285.273681640625),
 (Document(id='92f38848-89b7-4df5-a621-69b24f2aa014', metadata={'source': 'speech.txt'}, page_content='ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.'),
  2435.908203125),
 (Document(id='36c0a5b1-4980-4fe2-8bff-8795cbcc80aa', metadata={'source': 'speech.txt'}, page_content='that gave her birth and happiness and the peace which she has treasured. God helping her, she can do no other.'),
  2438.9892578125)]

In [15]:
## Calling vector store db as retriever
retriever = vectordb.as_retriever()
retriever.invoke(query)[0].page_content

'may be for them, for the time being, to believe that this is spoken from our hearts.'