In [1]:
import chromadb
import logging
import polars as pl

In [2]:
from smart_procurement.embeeders import MixedbreadEmbedder
from smart_procurement.embeeding_funcs import MixedbreadEmbeddingFunction
from smart_procurement.models.chroma_cc_model import CommodityCodesListChromaDB

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Configure the root logger once for the whole notebook: timestamped INFO-level
# records. Without an explicit `stream`, basicConfig attaches a StreamHandler
# that writes to stderr; pass `stream=sys.stdout` to change that.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s : %(message)s",
    # force=True drops any handlers already attached to the root logger, so
    # re-running this cell does not leave a stale configuration in place.
    force=True,
)

In [4]:
# Location of the mxbai-embed-large-v1 embedding model on disk. The path is
# relative, so it is resolved against the kernel's current working directory.
EMB_MODEL_PATH = "../models/embeddings/mxbai-embed-large-v1"

In [5]:
# Load the embedding model from EMB_MODEL_PATH; the wrapper logs which torch
# device it selected and when loading has finished.
emb_model = MixedbreadEmbedder(EMB_MODEL_PATH)

2024-08-25 01:07:25,279 | INFO : Use pytorch device_name: cuda
2024-08-25 01:07:25,280 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-25 01:07:26,368 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1


# Adding Cached Data

In [6]:
# Async HTTP client for the Chroma server listening on localhost:8011.
client = await chromadb.AsyncHttpClient(host="localhost", port=8011)

2024-08-25 01:07:28,917 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


2024-08-25 01:07:28,955 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-08-25 01:07:29,019 | INFO : HTTP Request: GET http://localhost:8011/api/v1/tenants/default_tenant "HTTP/1.1 200 OK"
2024-08-25 01:07:29,029 | INFO : HTTP Request: GET http://localhost:8011/api/v1/databases/default_database?tenant=default_tenant "HTTP/1.1 200 OK"


In [10]:
# Optional reset: uncomment to drop "test_collection" before re-running the cells below.
# await client.delete_collection("test_collection")

In [8]:
test_collection = await client.get_or_create_collection(
    "test_collection", 
    embedding_function=MixedbreadEmbeddingFunction(
        embedder=MixedbreadEmbedder(EMB_MODEL_PATH)
    ),
    metadata={
    "hnsw:space": "cosine",
    "hnsw:construction_ef": 100,
    }
)

2024-08-25 01:08:18,861 | INFO : Use pytorch device_name: cuda
2024-08-25 01:08:18,862 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-25 01:08:19,644 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1
2024-08-25 01:08:19,682 | INFO : HTTP Request: POST http://localhost:8011/api/v1/collections?tenant=default_tenant&database=default_database "HTTP/1.1 200 OK"


In [9]:
await test_collection.add(
    ids=["1", "2"],
    metadatas=[{"id": 1}, {"id": 2}],
    documents=["A nice Cat", "An amazing Truck"],
)

2024-08-25 01:19:35,463 | INFO : Generating embeddings for 2 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
2024-08-25 01:19:35,826 | INFO : HTTP Request: GET http://localhost:8011/api/v1/pre-flight-checks "HTTP/1.1 200 OK"
2024-08-25 01:19:35,844 | INFO : HTTP Request: POST http://localhost:8011/api/v1/collections/afffdf44-fafc-4807-8eae-b27419cb80da/add "HTTP/1.1 201 Created"


In [None]:
from pydantic import BaseModel

In [None]:
class SearchResult(BaseModel):
    """Placeholder so this cell is valid Python; the complete model with its
    fields and helpers is defined in a later cell of this notebook."""


In [24]:
await test_collection.query(
    query_texts=["A crazy dog."], 
    include=["documents", "metadatas", "distances"], 
    n_results=1,
    )

2024-08-25 04:04:53,896 | INFO : Generating embeddings for 1 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.58it/s]
2024-08-25 04:04:53,938 | INFO : HTTP Request: POST http://localhost:8011/api/v1/collections/afffdf44-fafc-4807-8eae-b27419cb80da/query "HTTP/1.1 200 OK"


{'ids': [['2']],
 'distances': [[0.5768009059142347]],
 'embeddings': None,
 'metadatas': [[{'id': 2}]],
 'documents': [['An amazing Truck']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas', 'distances']}

In [31]:
result = await test_collection.query(
    query_texts=["A Crazy Dog."], 
    include=["documents", "metadatas", "distances"], 
    n_results=1,
    )

2024-08-25 04:09:43,055 | INFO : Generating embeddings for 1 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.82it/s]
2024-08-25 04:09:43,097 | INFO : HTTP Request: POST http://localhost:8011/api/v1/collections/afffdf44-fafc-4807-8eae-b27419cb80da/query "HTTP/1.1 200 OK"


In [32]:
# Display the raw query response stored in `result`.
result

{'ids': [['2']],
 'distances': [[0.5768009059142347]],
 'embeddings': None,
 'metadatas': [[{'id': 2}]],
 'documents': [['An amazing Truck']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas', 'distances']}

In [21]:
from pydantic import BaseModel

class SearchResult(BaseModel):
    """Top hit of a single-query Chroma search response.

    Attributes:
        doc_id: ID of the best-matching document.
        distance: Distance between the query and that document as reported in
            the response; smaller values mean a closer match.
        metadata: Metadata returned for the document (empty dict when the
            response carries ``None`` for it).
    """

    doc_id: str
    distance: float
    metadata: dict

    @classmethod
    def from_chroma_response(cls, response: dict) -> "SearchResult":
        """Build a result from the first hit of the first query in ``response``.

        Args:
            response: Raw dict returned by ``collection.query``. It must contain
                ``ids``, ``distances`` and ``metadatas``, so the query has to
                request ``"metadatas"`` and ``"distances"`` via ``include``.

        Returns:
            A ``SearchResult`` describing the closest document.

        Raises:
            ValueError: If one of the required sections is absent or ``None``
                (it was not requested via ``include``), or if the query
                returned no hits at all.
        """
        top_hit = {}
        for field_name, response_key in (
            ("doc_id", "ids"),
            ("distance", "distances"),
            ("metadata", "metadatas"),
        ):
            per_query = response.get(response_key)
            # Sections that were not requested come back as None; indexing them
            # would only produce an opaque "NoneType is not subscriptable".
            if per_query is None:
                raise ValueError(
                    f"Chroma response has no '{response_key}'; "
                    "request it through the query's `include` argument."
                )
            # An empty outer or inner list means there is nothing to wrap.
            if not per_query or not per_query[0]:
                raise ValueError(
                    f"Chroma response contains no hits in '{response_key}'."
                )
            top_hit[field_name] = per_query[0][0]

        # A hit may carry None instead of a metadata dict; normalise it so the
        # `metadata: dict` field validates.
        if top_hit["metadata"] is None:
            top_hit["metadata"] = {}
        return cls(**top_hit)

    def is_similar(self, threshold: float) -> bool:
        """Return True when the hit's distance is at most ``threshold``."""
        return self.distance <= threshold
    

In [26]:
# Wrap the top hit of the raw query response in the typed SearchResult model.
search_result = SearchResult.from_chroma_response(result)

In [34]:
# Scratch check: dict.update() mutates the dict in place and returns None, so
# this expression evaluates to None and the merged temporary dict is discarded.
{"a": 1}.update({"b": 2})

In [28]:
# Display the parsed top hit.
search_result

SearchResult(doc_id='2', distance=0.5768009059142347, metadata={'id': 2})

In [27]:
# True only when the hit's distance is <= 0.1; the threshold is an ad-hoc
# value picked for this check.
search_result.is_similar(0.1)

False

# Add Data to Vector Database

In [16]:
from pathlib import Path

In [14]:
vector_db_client = await chromadb.AsyncHttpClient(host="localhost", port=8010)
commodity_code_collection = await vector_db_client.get_or_create_collection(
    "commodity_codes", 
    embedding_function=MixedbreadEmbeddingFunction(
        embedder=MixedbreadEmbedder(EMB_MODEL_PATH)
    ),
    metadata={
    "hnsw:space": "cosine",
    "hnsw:construction_ef": 100,
    }
)

2024-08-18 16:25:03,329 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-08-18 16:25:03,332 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-08-18 16:25:03,349 | INFO : HTTP Request: GET http://localhost:8010/api/v1/tenants/default_tenant "HTTP/1.1 200 OK"
2024-08-18 16:25:03,360 | INFO : HTTP Request: GET http://localhost:8010/api/v1/databases/default_database?tenant=default_tenant "HTTP/1.1 200 OK"
2024-08-18 16:25:03,363 | INFO : Use pytorch device_name: mps
2024-08-18 16:25:03,363 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-18 16:25:06,225 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1
2024-08-18 16:25:06,279 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections?tenant=default_tenant&database=default_database "HTTP/1.1 200 OK"


In [22]:
# Data lives in a "data" directory that is a sibling of the directory the
# kernel runs in; the raw commodity-code table sits under data/raw/.
DATA_PATH = Path.cwd().parent / "data"
COMMODITY_CODE_FILE_PATH = DATA_PATH / "raw" / "commodity_codes.csv"

In [23]:
# Load the commodity-code table into a polars DataFrame; columns in the file
# are separated by "|" rather than commas.
cc_df = pl.read_csv(COMMODITY_CODE_FILE_PATH, separator="|")

In [56]:
# Convert the DataFrame rows (one dict per row) into the project's
# commodity-code list model.
test_cc_list= CommodityCodesListChromaDB.from_dict(cc_df.to_dicts())
# Build the payload that is later unpacked as keyword arguments into
# collection.add() — presumably ids/documents/metadatas; confirm against
# get_chroma_input_document.
cc_documents = test_cc_list.get_chroma_input_document()

In [58]:
# Insert the commodity codes; cc_documents is unpacked into add()'s keyword
# arguments and the collection's embedding function embeds the documents.
await commodity_code_collection.add(**cc_documents)

2024-08-18 18:49:04,202 | INFO : Generating embeddings for 66 sentences with 1024 dimensions.


Batches: 100%|██████████| 3/3 [00:02<00:00,  1.02it/s]
2024-08-18 18:49:07,193 | INFO : HTTP Request: GET http://localhost:8010/api/v1/pre-flight-checks "HTTP/1.1 200 OK"
2024-08-18 18:49:07,323 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/c5eb333f-fd6f-40e1-8ab4-6dc873c773da/add "HTTP/1.1 201 Created"


In [59]:
# Sanity check: number of records now stored in the collection.
await commodity_code_collection.count()

2024-08-18 18:49:30,595 | INFO : HTTP Request: GET http://localhost:8010/api/v1/collections/c5eb333f-fd6f-40e1-8ab4-6dc873c773da/count "HTTP/1.1 200 OK"


66

In [64]:
sample_result = await commodity_code_collection.query(
    query_texts=["I want to buy keyboard for computer."], 
    include=["documents"], 
    n_results=2,
    )

2024-08-18 19:11:49,314 | INFO : Generating embeddings for 1 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.12it/s]
2024-08-18 19:11:49,444 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/c5eb333f-fd6f-40e1-8ab4-6dc873c773da/query "HTTP/1.1 200 OK"


In [65]:
# Display the raw response; sections that were not requested via `include`
# show up as None.
sample_result

{'ids': [['201010', '201030']],
 'distances': None,
 'embeddings': None,
 'metadatas': None,
 'documents': [['Desktop Computers for office workstations',
   'Monitors and Display Screens for computing']],
 'uris': None,
 'data': None,
 'included': ['documents']}