In [1]:
import chromadb
from nlp_toolkit.embedders import SentenceEmbedder

# Open the persistent Chroma store under ./chroma_storage; every later cell
# works through the collection fetched (or created) from this client.
client = chromadb.PersistentClient(path="./chroma_storage")

# Prepare the collection arguments up front so the call itself stays short.
sentence_embedding_fn = SentenceEmbedder().make_chroma()
collection_settings = {"hnsw:space": "cosine"}  # HNSW index configured with the "cosine" space

collection = client.get_or_create_collection(
    name="app",
    embedding_function=sentence_embedding_fn,
    metadata=collection_settings,
)

*Now that you have a Chroma collection, the next step is to perform CRUD operations on it.*

In [4]:
from chroma_crud import ChromaCrud

# Wrap the Chroma collection in the ChromaCrud helper; the later cells call its methods.
crud = ChromaCrud(collection)

# Each record in the collection is identified by a unique id; keep the first ten
# and leave `ids` as the last expression so the notebook displays them.
ids = crud.ids[:10]
ids

['281656475',
 '281796108',
 '281940292',
 '282614216',
 '282935706',
 '283619399',
 '283646709',
 '284035177',
 '284666222',
 '284736660']

*Get a single random document*

In [3]:
# Fetch one random record, then show its metadata followed by the first
# 300 characters of its text.
doc = crud.random_doc()

print("Document metadata:", doc.metadata, sep="\n")
print("\nDocument text: ", doc.document[:300], sep="\n")


Document metadata:
{'label': 'unknown', 'track_name': 'Google Docs', 'update_time': '2023-11-30 17:38:29'}

Document text: 
Create, edit, and collaborate with others on documents from your iPod, iPhone, or iPad with the free Google Docs app. With Google Docs you can:

* Create new documents or edit any that were created on the web or on another device
* Share documents and work together with others in the same document a


*Get a list of random documents*

In [4]:
# Bare expression: the notebook displays the returned list of Document objects.
crud.random_docs()

[Document(id='773126820', embedding=None, metadata={'label': 'unknown', 'track_name': 'Voyages-sncf.com : book train and bus tickets', 'update_time': '2023-11-30 17:38:29'}, document='VOYAGES-SNCF, THE SIMPLEST WAY TO BOOK YOUR TRAIN TICKETS FOR FRANCE AND EUROPE AT THE BEST PRICE. \nBook, exchange and cancel your journeys from your mobile phone, via our App. Experts in rail services? N° 1 within travel for the past 15 years – providing increasingly fast, practical and personalized services. Find out why 10 million travelers prefer our app.\n\nEFFORTLESS TRAVEL AT THE BEST PRICE\n– Download the app for free\n– No need to sign up to check timetables or buy a train ticket \n– Book, exchange or cancel your trip in seconds\n– We select the best journey for you: quickest, cheapest and in absolute comfort\n– E-ticket, M-ticket... Find everything you need for your trip on your app\n– You can also place an option and pay later. \n– Create a customer account to book in just three clicks!\n- Fin

*Natural language query*

In [5]:
# Free-text query against the collection; keep only the three closest matches.
query_text = "I love cat, dog, monkey, pig, penguine, and elephant."
query_results = crud.query(query_text, n_results=3)

# One line per hit: its distance and a trimmed 100-character preview of the text.
for doc in query_results:
    preview = doc.document[:100].strip()
    print(f"Distance: {doc.distance:.3f}, Texts: {preview}...")

Embedding Batches: 100%|██████████| 1/1 [00:00<00:00, 75.50it/s]

Distance: 0.433, Texts: Do you know what a Malayan tapir is? Let’s go to the zoo to find out! Fix the animal signs, color a...
Distance: 0.471, Texts: **** As seen on Nick Jr. ****

Duggee's back and this time he's awarding the We Love Animals Badge....
Distance: 0.479, Texts: Up, Up and Away!  Join Milo in his hot air balloon as he travels around the globe discovering the wo...





*Do not add a single bare text to the collection, because records also carry metadata; insert it through a DataFrame instead.*

In [5]:
import pandas as pd


# One-row frame: `text` is the document body passed as `doc_col`; `track_name`
# travels with it and shows up in the record's metadata in the log output.
# NOTE(review): the logged id looks auto-generated, so re-running this cell
# presumably inserts another copy of the same text — confirm before re-running.
df = pd.DataFrame(
    {
        "text": ['I love cat, dog, monkey, pig, penguine, and elephant'],
        "track_name": ['fake-app-wanghuan'],
    }
)

crud.insert_dataframe_batch(df, doc_col="text")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['create_time'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['update_time'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


Inserting batch 1/1
{'documents': ['I love cat, dog, monkey, pig, penguine, and elephant'], 'metadatas': [{'track_name': 'fake-app-wanghuan', 'create_time': '2023-12-07 00:33:32', 'update_time': '2023-12-07 00:33:32'}], 'ids': ['b62e112b-65b7-4ad1-9868-6d43d621f55b']}


Embedding Batches: 100%|██████████| 1/1 [00:00<00:00, 73.83it/s]

Successful added to collection
Successful added all data to collection





In [6]:
# Query with the same text that was just inserted (no n_results given, so the
# helper's default applies); the returned QueryResult list is the cell output.
crud.query('I love cat, dog, monkey, pig, penguine, and elephant')

Embedding Batches: 100%|██████████| 1/1 [00:00<00:00, 80.54it/s]


[QueryResult(id='b62e112b-65b7-4ad1-9868-6d43d621f55b', embedding=[-0.019165895879268646, -0.0609896183013916, 0.09754499047994614, 0.03764575719833374, -0.012674485333263874, 0.03202783688902855, 0.018913721665740013, -0.011898900382220745, 0.061161305755376816, 0.07794494181871414, -0.030762644484639168, -0.12869510054588318, -0.040120963007211685, 0.0600183866918087, 0.06347758322954178, 0.01146537996828556, -0.008818129077553749, 0.0230452511459589, -0.005929619073867798, -0.0048666102811694145, -0.03962282836437225, -0.0029168156906962395, -0.0059008514508605, -0.043928369879722595, -0.10732017457485199, -0.004506606608629227, -0.024822592735290527, -0.0622762031853199, -0.035525161772966385, -0.06474976241588593, -0.029002990573644638, -0.013883523643016815, 0.01906241476535797, 0.05979575961828232, 0.001436125603504479, -0.0019121248042210937, 0.03027520701289177, -0.05479218810796738, 0.09656745940446854, 0.06961735337972641, 0.02365010976791382, 0.0015079984441399574, 0.052804

In [51]:
from chroma_crud import Document