In [1]:
import chromadb
import logging
import polars as pl

In [2]:
from smart_procurement.embeeders import MixedbreadEmbedder
from smart_procurement.embeeding_funcs import MixedbreadEmbeddingFunction
from smart_procurement.models.chroma_cc_model import CommodityCodesListChromaDB

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Configure the root logger once for the whole notebook: INFO and above,
# each record prefixed with a timestamp and its level name.
# No `stream` is passed, so the handler's default target is used
# (pass stream=sys.stdout here to redirect the log lines).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s : %(message)s",
    force=True,  # replace handlers that were already attached to the root logger
)

In [4]:
# Location of the embedding model, given relative to the notebook's working
# directory; passed to MixedbreadEmbedder below.
EMB_MODEL_PATH = "../models/embeddings/mxbai-embed-large-v1"

# Adding Cached Data

In [5]:
# Async HTTP client for the Chroma server expected at localhost:8010.
# The top-level `await` relies on the notebook kernel running an event loop.
client = await chromadb.AsyncHttpClient(host="localhost", port=8010)

2024-08-28 19:58:54,075 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-08-28 19:58:54,112 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-08-28 19:58:54,132 | INFO : HTTP Request: GET http://localhost:8010/api/v1/tenants/default_tenant "HTTP/1.1 200 OK"
2024-08-28 19:58:54,136 | INFO : HTTP Request: GET http://localhost:8010/api/v1/databases/default_database?tenant=default_tenant "HTTP/1.1 200 OK"


In [6]:
# Load the embedding model from EMB_MODEL_PATH.
emb_model = MixedbreadEmbedder(EMB_MODEL_PATH)

2024-08-28 19:58:59,724 | INFO : Use pytorch device_name: mps
2024-08-28 19:58:59,725 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-28 19:59:00,432 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1


In [15]:
# Uncomment to drop the "commodity_codes" collection and start from an empty one:
# await client.delete_collection("commodity_codes")

In [7]:
test_collection = await client.get_collection(
    "commodity_codes", 
    embedding_function=MixedbreadEmbeddingFunction(
        embedder=MixedbreadEmbedder(EMB_MODEL_PATH)
    ),
    # metadata={
    # "hnsw:space": "cosine",
    # "hnsw:construction_ef": 100,
    # }
)

2024-08-28 19:59:40,295 | INFO : Use pytorch device_name: mps
2024-08-28 19:59:40,296 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-28 19:59:41,058 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1
2024-08-28 19:59:41,067 | INFO : HTTP Request: GET http://localhost:8010/api/v1/collections/commodity_codes?tenant=default_tenant&database=default_database "HTTP/1.1 200 OK"


In [19]:
await test_collection.add(
    ids=["1", "2"],
    metadatas=[{"id": 1}, {"id": 2}],
    documents=["A nice Cat", "An amazing Truck"],
)

2024-08-25 21:14:35,431 | INFO : Generating embeddings for 2 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.33it/s]
2024-08-25 21:14:35,608 | INFO : HTTP Request: GET http://localhost:8010/api/v1/pre-flight-checks "HTTP/1.1 200 OK"
2024-08-25 21:14:35,637 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/4566882f-e6ee-44d5-bfeb-b8a54c68c877/add "HTTP/1.1 201 Created"


In [20]:
await test_collection.query(
    query_texts=["A crazy dog."], 
    include=["documents", "metadatas", "distances"], 
    n_results=1,
    )

2024-08-25 21:14:37,896 | INFO : Generating embeddings for 1 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.62it/s]
2024-08-25 21:14:38,015 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/4566882f-e6ee-44d5-bfeb-b8a54c68c877/query "HTTP/1.1 200 OK"


{'ids': [['2']],
 'distances': [[0.5768009408184493]],
 'embeddings': None,
 'metadatas': [[{'id': 2}]],
 'documents': [['An amazing Truck']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas', 'distances']}

In [21]:
result = await test_collection.query(
    query_texts=["A Crazy Dog."], 
    include=["documents", "metadatas", "distances"], 
    n_results=1,
    )

2024-08-25 21:14:41,691 | INFO : Generating embeddings for 1 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.31it/s]
2024-08-25 21:14:41,812 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/4566882f-e6ee-44d5-bfeb-b8a54c68c877/query "HTTP/1.1 200 OK"


In [22]:
# Display the response captured in the previous cell.
result

{'ids': [['2']],
 'distances': [[0.5768009408184493]],
 'embeddings': None,
 'metadatas': [[{'id': 2}]],
 'documents': [['An amazing Truck']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas', 'distances']}

# Add Data to Vector Database

In [8]:
from pathlib import Path

In [17]:
vector_db_client = await chromadb.AsyncHttpClient(host="localhost", port=8010)
commodity_code_collection = await vector_db_client.get_collection(
    "commodity_codes", 
    embedding_function=MixedbreadEmbeddingFunction(
        embedder=MixedbreadEmbedder(EMB_MODEL_PATH)
    ),
    # metadata={
    # "hnsw:space": "cosine",
    # "hnsw:construction_ef": 100,
    # }
)

2024-08-28 20:02:22,676 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-08-28 20:02:22,678 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-08-28 20:02:22,687 | INFO : HTTP Request: GET http://localhost:8010/api/v1/tenants/default_tenant "HTTP/1.1 200 OK"
2024-08-28 20:02:22,696 | INFO : HTTP Request: GET http://localhost:8010/api/v1/databases/default_database?tenant=default_tenant "HTTP/1.1 200 OK"
2024-08-28 20:02:22,698 | INFO : Use pytorch device_name: mps
2024-08-28 20:02:22,699 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-28 20:02:24,933 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1
2024-08-28 20:02:24,956 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections?tenant=default_tenant&database=default_database "HTTP/1.1 200 OK"


In [10]:
# Absolute path of the "data" directory that sits beside the notebook's
# working directory (i.e. under its parent).
DATA_PATH = Path.cwd().parent / "data"
# Raw commodity-code table inside that data directory.
COMMODITY_CODE_FILE_PATH = DATA_PATH / "raw" / "commodity_codes.csv"

In [11]:
# Read the commodity-code file with polars; its columns are separated by "|".
cc_df = pl.read_csv(COMMODITY_CODE_FILE_PATH, separator="|")

In [12]:
# Wrap the table rows (one dict per row) in the project's commodity-code list
# model, then ask it for the payload that is later unpacked into
# collection.add().
test_cc_list = CommodityCodesListChromaDB.from_dict(
    cc_df.to_dicts()
)
cc_documents = test_cc_list.get_chroma_input_document()

In [18]:
# Store the commodity-code records in the collection; `cc_documents` is
# unpacked into the keyword arguments of add().
await commodity_code_collection.add(**cc_documents)

2024-08-28 20:02:42,721 | INFO : Generating embeddings for 66 sentences with 1024 dimensions.
Batches: 100%|██████████| 3/3 [00:01<00:00,  2.93it/s]
2024-08-28 20:02:43,777 | INFO : HTTP Request: GET http://localhost:8010/api/v1/pre-flight-checks "HTTP/1.1 200 OK"
2024-08-28 20:02:43,877 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/cbb7e641-d131-4f0d-bba2-8ab2f7e33b00/add "HTTP/1.1 201 Created"


In [19]:
# Sanity check: number of records currently stored in the collection.
await commodity_code_collection.count()

2024-08-28 20:03:00,344 | INFO : HTTP Request: GET http://localhost:8010/api/v1/collections/cbb7e641-d131-4f0d-bba2-8ab2f7e33b00/count "HTTP/1.1 200 OK"


66

In [33]:
sample_result = await commodity_code_collection.query(
    query_texts=["I want to buy keyboard for computer."], 
    include=["documents", "metadatas", "distances"], 
    n_results=3,
    )

2024-08-28 20:29:35,965 | INFO : Generating embeddings for 1 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.78it/s]
2024-08-28 20:29:36,105 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/cbb7e641-d131-4f0d-bba2-8ab2f7e33b00/query "HTTP/1.1 200 OK"


In [34]:
# Display the raw query response.
sample_result

{'ids': [['201010', '201030', '201020']],
 'distances': [[0.41137122165110906, 0.43248707219675575, 0.4375777626139331]],
 'embeddings': None,
 'metadatas': [[{'l1': 20,
    'l1_desc': 'IT Goods and Services',
    'l2': 2010,
    'l2_desc': 'Computers and Peripherals'},
   {'l1': 20,
    'l1_desc': 'IT Goods and Services',
    'l2': 2010,
    'l2_desc': 'Computers and Peripherals'},
   {'l1': 20,
    'l1_desc': 'IT Goods and Services',
    'l2': 2010,
    'l2_desc': 'Computers and Peripherals'}]],
 'documents': [['Desktop Computers for office workstations',
   'Monitors and Display Screens for computing',
   'Laptops for mobile workstations']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas', 'distances']}

In [25]:
from pydantic import BaseModel

In [29]:
class QueryResultMetadata(BaseModel):
    """Metadata attached to one hit of a commodity-code query.

    NOTE(review): l1/l2 look like two levels of a commodity-code hierarchy
    (sample values: 20 / 2010) — confirm against the source data.
    """

    # First-level code and its textual description.
    l1: int
    l1_desc: str
    # Second-level code and its textual description.
    l2: int
    l2_desc: str

In [30]:
class QueryResult(BaseModel):
    """One hit of a vector-database query, as assembled by parse_query_result."""

    # Record identifier. The sample response carries ids as strings ('201010')
    # while this field is an int.
    # NOTE(review): this relies on the model converting numeric strings; an id
    # with leading zeros would not round-trip — confirm none exist.
    id: int
    # Metadata stored with the matched document.
    metadata: QueryResultMetadata
    # Distance between the query and the matched document, as reported in the
    # response.
    distance: float
    # Text of the matched document.
    document: str

In [39]:
def parse_query_result(response: dict) -> list[QueryResult]:
    """Convert a query response into one ``QueryResult`` per hit.

    Only the hits of the first query are read (index ``0`` of each per-query
    list), so the response is expected to come from a single-text query.

    Args:
        response: Mapping with "ids", "metadatas", "distances" and "documents"
            entries, each holding one list of values per query. All four must
            be present and populated (i.e. requested via ``include``).

    Returns:
        The hits as ``QueryResult`` objects, in the order they appear in the
        response. Empty when the first query produced no hits.

    Raises:
        KeyError: If one of the four entries is missing from ``response``.
    """
    first_query_hits = zip(
        response["ids"][0],
        response["metadatas"][0],
        response["distances"][0],
        response["documents"][0],
    )
    return [
        QueryResult(
            id=hit_id,
            metadata=metadata,
            distance=distance,
            document=document,
        )
        for hit_id, metadata, distance, document in first_query_hits
    ]

In [40]:
# Parse the sample response; the bare expression displays the parsed hits.
parse_query_result(sample_result)

[QueryResult(id=201010, metadata=QueryResultMetadata(l1=20, l1_desc='IT Goods and Services', l2=2010, l2_desc='Computers and Peripherals'), distance=0.41137122165110906, document='Desktop Computers for office workstations'),
 QueryResult(id=201030, metadata=QueryResultMetadata(l1=20, l1_desc='IT Goods and Services', l2=2010, l2_desc='Computers and Peripherals'), distance=0.43248707219675575, document='Monitors and Display Screens for computing'),
 QueryResult(id=201020, metadata=QueryResultMetadata(l1=20, l1_desc='IT Goods and Services', l2=2010, l2_desc='Computers and Peripherals'), distance=0.4375777626139331, document='Laptops for mobile workstations')]