In [2]:
import chromadb

# Use a persistent client so collections are written to disk under
# "chroma_db" and survive kernel restarts. chromadb.Client(Settings(
# persist_directory=...)) stays in-memory unless is_persistent=True is also
# set, so the directory setting on its own had no effect.
chroma_client = chromadb.PersistentClient(path="chroma_db")  # or your chosen path

In [3]:
# Fetch the demo collection, creating it on first use.
collection = chroma_client.get_or_create_collection(name="my_collection")

In [4]:
# Seed the demo collection with two short documents; ids and documents are
# paired by position.
collection.add(
    documents=["This is a document about pineapple", "This is a document about oranges"],
    ids=["id1", "id2"],
)

In [5]:
from pprint import pprint
# Similarity search over the demo documents. Chroma embeds the query text
# itself, so only the raw string is supplied.
results = collection.query(
    n_results=2,  # maximum number of matches to return
    query_texts=["This is a query document about hawaii"],
)
pprint(results)


{'data': None,
 'distances': [[1.0404009819030762, 1.2430799007415771]],
 'documents': [['This is a document about pineapple',
                'This is a document about oranges']],
 'embeddings': None,
 'ids': [['id1', 'id2']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[None, None]],
 'uris': None}


In [6]:
from pprint import pprint
# Same query text, now narrowed with a where_document `$contains` filter on
# the string 'oranges'.
results = collection.query(
    where_document={'$contains': 'oranges'},
    n_results=2,  # maximum number of matches to return
    query_texts=["This is a query document about hawaii"],  # embedded by Chroma
)
pprint(results)


{'data': None,
 'distances': [[1.2430799007415771]],
 'documents': [['This is a document about oranges']],
 'embeddings': None,
 'ids': [['id2']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[None]],
 'uris': None}


In [7]:
import polars as pl
# Load the articles CSV (ISO-8859-1 encoded) and prepend a row-number column
# named "index" that starts at 1.
articles = (
    pl.read_csv("../data/articles.csv", encoding="ISO-8859-1")
    .with_row_index(offset=1)
)
articles.head()

index,Article,Date,Heading,NewsType
u32,str,str,str,str
1,"""KARACHI: The Sindh government …","""1/1/2015""","""sindh govt decides to cut publ…","""business"""
2,"""HONG KONG: Asian markets start…","""1/2/2015""","""asia stocks up in new year tra…","""business"""
3,"""HONG KONG: Hong Kong shares o…","""1/5/2015""","""hong kong stocks open 0.66 per…","""business"""
4,"""HONG KONG: Asian markets tumbl…","""1/6/2015""","""asian stocks sink euro near ni…","""business"""
5,"""NEW YORK: US oil prices Monday…","""1/6/2015""","""us oil prices slip below 50 a …","""business"""


In [8]:
import os
from dotenv import load_dotenv

# Read the .env file so its entries become available as environment variables.
load_dotenv()

# API key that is later passed to the Google embedding function;
# this is None when the variable is not set.
KEY = os.environ.get("chroma_key_gemini")


In [9]:
import chromadb.utils.embedding_functions as embedding_functions
# Embedding function backed by Google's Generative AI service, authenticated
# with the key read from the environment.
google_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=KEY,
)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Keep only the first N articles (presumably to limit embedding API calls).
# NOTE: this rebinds `articles`, so later cells see the truncated frame.
N = 50
articles = articles.head(N)

In [11]:
# Smoke-test the embedding function on the first five article bodies.
article_texts = articles.get_column('Article').head(5).to_list()
embeddings = google_ef(article_texts)
print(embeddings)

[array([ 2.28775181e-02, -1.47130406e-02, -8.25118348e-02, -1.90091394e-02,
        6.67086095e-02,  2.35756710e-02, -2.52378862e-02, -7.83969089e-03,
        1.76077615e-02,  4.84106764e-02, -7.13819861e-02,  3.80274244e-02,
       -6.53837901e-03, -4.28667665e-03, -1.49283148e-02, -3.53866257e-02,
       -1.44082727e-02,  2.74097244e-03,  3.66329625e-02, -3.88631262e-02,
        1.41093396e-02,  7.14974152e-03, -3.23501625e-03, -2.57016495e-02,
        2.35526096e-02, -2.53432374e-02, -3.11801909e-03, -3.26811783e-02,
       -4.62496504e-02,  4.07364480e-02, -6.47775754e-02,  6.28354326e-02,
       -3.86065580e-02, -4.47612181e-02, -6.90162629e-02, -8.30812659e-03,
        7.29266647e-03, -2.53183488e-02,  1.15164574e-02,  4.56976593e-02,
       -2.93939320e-06,  2.11690739e-02, -2.01360174e-02,  1.36427861e-02,
        1.06683392e-02, -1.46592082e-03,  2.74357982e-02, -4.74812090e-02,
        3.35330218e-02, -3.42603475e-02,  3.23806442e-02, -2.65496522e-02,
        5.84780164e-02, 

In [12]:
# Show the raw article strings that were passed to the embedding function.
article_texts

['KARACHI: The Sindh government has decided to bring down public transport fares by 7 per cent due to massive reduction in petroleum product prices by the federal government, Geo News reported.Sources said reduction in fares will be applicable on public transport, rickshaw, taxi and other means of traveling.Meanwhile, Karachi Transport Ittehad (KTI) has refused to abide by the government decision.KTI President Irshad Bukhari said the commuters are charged the lowest fares in Karachi as compare to other parts of the country, adding that 80pc vehicles run on Compressed Natural Gas (CNG). Bukhari said Karachi transporters will cut fares when decrease in CNG prices will be made.                        \n\n\n\n\n\n\n\n\n\n\n',
 'HONG KONG: Asian markets started 2015 on an upswing in limited trading on Friday, with mainland Chinese stocks surging in Hong Kong on speculation Beijing may ease monetary policy to boost slowing growth.Hong Kong rose 1.07 percent, closing 252.78 points higher at 2

In [13]:
# Collect every article body in the (already truncated) frame, using each
# row's "index" value, converted to a string, as its record id.
article_texts = articles['Article'].to_list()
article_ids = list(map(str, articles['index'].to_list()))

# From here on, `collection` refers to a second collection whose documents
# are embedded with the Google embedding function.
collection = chroma_client.get_or_create_collection(
    name='my_collection_gemini',
    embedding_function=google_ef,
)
collection.add(documents=article_texts, ids=article_ids)

In [14]:
# Retrieve the three closest articles, asking for their stored embeddings
# and document text.
results = collection.query(
    include=["embeddings", "documents"],
    n_results=3,
    query_texts=["Some example query text"],
)
print(results["embeddings"])
# "ids" is not listed in `include`, yet the result still carries them.
print(results["ids"])

[array([[ 0.02259598, -0.00898135, -0.06737264, ...,  0.02555946,
        -0.02987304, -0.01639686],
       [ 0.01472761, -0.01379868, -0.0624574 , ...,  0.02302082,
        -0.01701631,  0.01186198],
       [-0.00172695, -0.05266044, -0.0444691 , ...,  0.00426582,
        -0.03253415, -0.01364502]], shape=(3, 768))]
[['3', '12', '36']]


In [17]:
# Print the installed chromadb version so the recorded outputs can be tied to a release.
import chromadb  # already imported in the first cell; repeating it is harmless
print(chromadb.__version__)

1.0.15
