<a href="https://colab.research.google.com/github/HayateSato/DS_Practice/blob/main/2024_11_15_ChromaDB_GetStarted.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG Storage: ChromaDB

Databases as backend:
- SQLite
- DuckDB
- ... PostgreSQL

In [4]:
# Dependency install, left disabled; uncomment on a runtime where chromadb is not yet installed.
# pip install chromadb

In [5]:
import chromadb
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [6]:
# Initialize a ChromaDB client with the library's default settings
# (no path or host is passed here, so nothing points at an on-disk or remote store).
chroma_client = chromadb.Client()

In [7]:
# Load a pretrained Hugging Face model for the embeddings.
# In the recorded run this call fetched the model files (see the download progress output of this cell).
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# Create the collection named "my-collection", or reuse it if it already exists
# (get_or_create, as the method name indicates, so re-running this cell does not fail).
collection = chroma_client.get_or_create_collection(name="my-collection")

In [9]:
# Sample corpus: three short fruit-related sentences to embed and index
documents = [
    "This is a document about pineapple",
    "This is a document about oranges",
    "I like apples",
]

In [17]:
# Get the number of documents stored in the collection.
# NOTE(review): this cell sits above the upsert cell but was executed after it
# (execution count 17 vs 12), which is how the recorded output shows 3.
# On a top-to-bottom run nothing has been upserted yet at this point, so the
# count is expected to differ unless the collection already held documents.
# Consider moving this cell below the upsert.
num_documents = collection.count()

# Print the count
print(f"Number of documents in ChromaDB: {num_documents}")

Number of documents in ChromaDB: 3


In [10]:
# Generate embeddings using the Hugging Face model: the whole `documents` list is
# encoded in one call (the recorded output shows one row per document).
embeddings = embedding_model.encode(documents)

In [11]:
# Inspect the embedding matrix (a float32 array in the recorded output)
embeddings

array([[-0.00709368,  0.06554618, -0.01166055, ...,  0.01568031,
         0.08966841,  0.01337308],
       [-0.02646883,  0.06878677, -0.03771435, ...,  0.06535887,
         0.07775757,  0.01558972],
       [-0.0288904 ,  0.00464453, -0.00530246, ...,  0.05547279,
         0.10815743,  0.01234411]], dtype=float32)

In [37]:
# (rows, columns) of the embedding matrix; (3, 384) in the recorded run, i.e. one row per document
embeddings.shape

(3, 384)

In [12]:
# Upsert documents AND their embeddings into the ChromaDB collection.
# IDs are derived from the number of documents ("id1", "id2", ...) so the
# documents, embeddings and ids lists keep matching lengths when `documents`
# is edited; for the three sample documents this yields the same IDs as before.
# Upsert (rather than add) lets the cell be re-run: records whose IDs already
# exist are updated instead of being rejected as duplicates.
collection.upsert(
    documents=documents,
    embeddings=embeddings.tolist(),  # convert the NumPy array to plain nested lists
    ids=[f"id{position}" for position in range(1, len(documents) + 1)],
)

In [13]:
# Show the collection's repr to confirm which collection is in use
collection

Collection(name=my-collection)

In [30]:
# Natural-language query; it is embedded with the same model that encoded the
# documents so that query and document vectors are comparable.
query_text_pineapple = "A document discussing a tropical fruit"
# encode() receives a one-element list, so the result holds a single query vector
query_embedding_pineapple = embedding_model.encode([query_text_pineapple])

In [31]:
# Run a similarity search in ChromaDB using the precomputed query embedding
query_result_pineapple = collection.query(
    n_results=1,  # keep only the single closest match
    query_embeddings=query_embedding_pineapple.tolist(),
)

In [32]:
# Print the documents matched for the (single) query.
# The result's 'documents' entry holds one list per query embedding, so [0]
# selects the matches for our only query.
print("Query results:")
matched_documents_pineapple = query_result_pineapple['documents'][0]
if not matched_documents_pineapple:
    # The emptiness check must happen before the loop: with no matches the
    # loop body never runs, so a check inside it could never report this.
    print("No results found")
else:
    for result in matched_documents_pineapple:
        print(result)

Query results:
This is a document about pineapple


In [34]:
# A negated query. In the recorded run the top match is still the apple
# document, which suggests the search ranks by topical similarity rather than
# by the logical meaning of the sentence.
query_text = "Do not find an apple"
# encode() receives a one-element list, so the result holds a single query vector
query_embedding = embedding_model.encode([query_text])

In [35]:
# Run a similarity search in ChromaDB using the precomputed query embedding
query_result = collection.query(
    n_results=1,  # keep only the single closest match
    query_embeddings=query_embedding.tolist(),
)

In [36]:
# Print the documents matched for the (single) query.
# The result's 'documents' entry holds one list per query embedding, so [0]
# selects the matches for our only query.
print("Query results:")
matched_documents = query_result['documents'][0]
if not matched_documents:
    # The emptiness check must happen before the loop: with no matches the
    # loop body never runs, so a check inside it could never report this.
    print("No results found")
else:
    for result in matched_documents:
        print(result)

Query results:
I like apples
