In [3]:
import chromadb
import logging
import polars as pl
from uuid import uuid4

In [4]:
from smart_procurement.embeeders import MixedbreadEmbedder
from smart_procurement.embeeding_funcs import MixedbreadEmbeddingFunction
from smart_procurement.models.chroma_cc_model import CommodityCodesListChromaDB

  from tqdm.autonotebook import tqdm, trange


In [5]:
logging.basicConfig(
    format='%(asctime)s | %(levelname)s : %(message)s',
    level=logging.INFO,
    # stream=sys.stdout,
    force=True
)

In [6]:
EMB_MODEL_PATH = "../models/embeddings/mxbai-embed-large-v1"

# Adding Cached Data

In [7]:
client = await chromadb.AsyncHttpClient(host="localhost", port=8010)

2024-08-31 16:40:16,527 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-08-31 16:40:16,566 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-08-31 16:40:16,634 | INFO : HTTP Request: GET http://localhost:8010/api/v1/tenants/default_tenant "HTTP/1.1 200 OK"
2024-08-31 16:40:16,646 | INFO : HTTP Request: GET http://localhost:8010/api/v1/databases/default_database?tenant=default_tenant "HTTP/1.1 200 OK"


In [10]:
emb_model = MixedbreadEmbedder(EMB_MODEL_PATH)
emb_func = MixedbreadEmbeddingFunction(embedder=emb_model)

2024-08-31 16:41:22,913 | INFO : Use pytorch device_name: cuda
2024-08-31 16:41:22,914 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-31 16:41:23,995 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1


In [15]:
# await client.delete_collection("commodity_codes")

In [7]:
test_collection = await client.get_collection(
    "commodity_codes", 
    embedding_function=MixedbreadEmbeddingFunction(
        embedder=MixedbreadEmbedder(EMB_MODEL_PATH)
    ),
    # metadata={
    # "hnsw:space": "cosine",
    # "hnsw:construction_ef": 100,
    # }
)

2024-08-28 19:59:40,295 | INFO : Use pytorch device_name: mps
2024-08-28 19:59:40,296 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-28 19:59:41,058 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1
2024-08-28 19:59:41,067 | INFO : HTTP Request: GET http://localhost:8010/api/v1/collections/commodity_codes?tenant=default_tenant&database=default_database "HTTP/1.1 200 OK"


In [19]:
await test_collection.add(
    ids=["1", "2"],
    metadatas=[{"id": 1}, {"id": 2}],
    documents=["A nice Cat", "An amazing Truck"],
)

2024-08-25 21:14:35,431 | INFO : Generating embeddings for 2 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.33it/s]
2024-08-25 21:14:35,608 | INFO : HTTP Request: GET http://localhost:8010/api/v1/pre-flight-checks "HTTP/1.1 200 OK"
2024-08-25 21:14:35,637 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/4566882f-e6ee-44d5-bfeb-b8a54c68c877/add "HTTP/1.1 201 Created"


In [20]:
await test_collection.query(
    query_texts=["A crazy dog."], 
    include=["documents", "metadatas", "distances"], 
    n_results=1,
    )

2024-08-25 21:14:37,896 | INFO : Generating embeddings for 1 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.62it/s]
2024-08-25 21:14:38,015 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/4566882f-e6ee-44d5-bfeb-b8a54c68c877/query "HTTP/1.1 200 OK"


{'ids': [['2']],
 'distances': [[0.5768009408184493]],
 'embeddings': None,
 'metadatas': [[{'id': 2}]],
 'documents': [['An amazing Truck']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas', 'distances']}

In [21]:
result = await test_collection.query(
    query_texts=["A Crazy Dog."], 
    include=["documents", "metadatas", "distances"], 
    n_results=1,
    )

2024-08-25 21:14:41,691 | INFO : Generating embeddings for 1 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.31it/s]
2024-08-25 21:14:41,812 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/4566882f-e6ee-44d5-bfeb-b8a54c68c877/query "HTTP/1.1 200 OK"


In [22]:
result

{'ids': [['2']],
 'distances': [[0.5768009408184493]],
 'embeddings': None,
 'metadatas': [[{'id': 2}]],
 'documents': [['An amazing Truck']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas', 'distances']}

# Add Data to Vector Database

In [8]:
from pathlib import Path

In [9]:
vector_db_client = await chromadb.AsyncHttpClient(host="localhost", port=8010)

2024-08-31 16:40:32,733 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-08-31 16:40:32,735 | INFO : Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-08-31 16:40:32,744 | INFO : HTTP Request: GET http://localhost:8010/api/v1/tenants/default_tenant "HTTP/1.1 200 OK"
2024-08-31 16:40:32,749 | INFO : HTTP Request: GET http://localhost:8010/api/v1/databases/default_database?tenant=default_tenant "HTTP/1.1 200 OK"


In [11]:
async def get_connection_by_name(name: str = "commodity_codes"):
    try:
        collection = await vector_db_client.get_collection(
            name, 
            embedding_function=emb_func
            )
    except Exception as e:
        logging.info(e)
        collection = await vector_db_client.get_or_create_collection(
            name, 
            embedding_function=emb_func,
            metadata={
            "hnsw:space": "cosine",
            "hnsw:construction_ef": 100,
            }
        )
    return collection

In [25]:
await vector_db_client.delete_collection("commodity_code")

2024-08-31 13:40:04,233 | INFO : HTTP Request: DELETE http://localhost:8010/api/v1/collections/commodity_code?tenant=default_tenant&database=default_database "HTTP/1.1 200 OK"


In [26]:
commodity_code_coll_client = await get_connection_by_name("commodity_codes")

2024-08-31 13:40:12,838 | INFO : Use pytorch device_name: cuda
2024-08-31 13:40:12,839 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-31 13:40:13,669 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1
2024-08-31 13:40:13,684 | INFO : HTTP Request: GET http://localhost:8010/api/v1/collections/commodity_codes?tenant=default_tenant&database=default_database "HTTP/1.1 500 Internal Server Error"
2024-08-31 13:40:13,687 | INFO : {"error":"ValueError('Collection commodity_codes does not exist.')"}
2024-08-31 13:40:13,689 | INFO : Use pytorch device_name: cuda
2024-08-31 13:40:13,689 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-31 13:40:14,777 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1
2024-08-31 13:40:14,799 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections?tenant=default_tenant&database=default_database "HTTP/1.1 200 OK"


In [16]:
DATA_PATH = Path().absolute().parent.joinpath("data")
COMMODITY_CODE_FILE_PATH = DATA_PATH.joinpath("raw", "commodity_codes.csv")

In [17]:
cc_df = pl.read_csv(COMMODITY_CODE_FILE_PATH, separator="|")

In [18]:
test_cc_list= CommodityCodesListChromaDB.from_dict(cc_df.to_dicts())
cc_documents = test_cc_list.get_chroma_input_document()

In [21]:
cc_documents["metadatas"]

[{'l1': 10,
  'l1_desc': 'Office Equipment',
  'l2': 1010,
  'l2_desc': 'Files and Stationery'},
 {'l1': 10,
  'l1_desc': 'Office Equipment',
  'l2': 1010,
  'l2_desc': 'Files and Stationery'},
 {'l1': 10,
  'l1_desc': 'Office Equipment',
  'l2': 1010,
  'l2_desc': 'Files and Stationery'},
 {'l1': 10,
  'l1_desc': 'Office Equipment',
  'l2': 1020,
  'l2_desc': 'Printers and Scanners'},
 {'l1': 10,
  'l1_desc': 'Office Equipment',
  'l2': 1020,
  'l2_desc': 'Printers and Scanners'},
 {'l1': 10,
  'l1_desc': 'Office Equipment',
  'l2': 1020,
  'l2_desc': 'Printers and Scanners'},
 {'l1': 10,
  'l1_desc': 'Office Equipment',
  'l2': 1030,
  'l2_desc': 'Office Furniture'},
 {'l1': 10,
  'l1_desc': 'Office Equipment',
  'l2': 1030,
  'l2_desc': 'Office Furniture'},
 {'l1': 10,
  'l1_desc': 'Office Equipment',
  'l2': 1030,
  'l2_desc': 'Office Furniture'},
 {'l1': 10,
  'l1_desc': 'Office Equipment',
  'l2': 1040,
  'l2_desc': 'Communication Devices'},
 {'l1': 10,
  'l1_desc': 'Office Equip

In [27]:
await commodity_code_coll_client.add(**cc_documents)

2024-08-31 13:40:23,030 | INFO : Generating embeddings for 66 sentences with 1024 dimensions.
Batches: 100%|██████████| 3/3 [00:00<00:00, 25.72it/s]
2024-08-31 13:40:23,233 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/0da93b20-e59e-4cfa-a36c-12fa9c209179/add "HTTP/1.1 201 Created"


In [28]:
await commodity_code_coll_client.count()

2024-08-31 13:40:28,281 | INFO : HTTP Request: GET http://localhost:8010/api/v1/collections/0da93b20-e59e-4cfa-a36c-12fa9c209179/count "HTTP/1.1 200 OK"


66

In [29]:
sample_result = await commodity_code_coll_client.query(
    query_texts=["I want to buy keyboard for computer."], 
    include=["documents", "metadatas", "distances"], 
    n_results=3,
    )

2024-08-31 13:40:31,623 | INFO : Generating embeddings for 1 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.59it/s]
2024-08-31 13:40:31,668 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/0da93b20-e59e-4cfa-a36c-12fa9c209179/query "HTTP/1.1 200 OK"


In [30]:
sample_result

{'ids': [['201010', '201030', '201020']],
 'distances': [[0.4113712253310119, 0.4324870137500332, 0.43757759473900204]],
 'embeddings': None,
 'metadatas': [[{'l1': 20,
    'l1_desc': 'IT Goods and Services',
    'l2': 2010,
    'l2_desc': 'Computers and Peripherals'},
   {'l1': 20,
    'l1_desc': 'IT Goods and Services',
    'l2': 2010,
    'l2_desc': 'Computers and Peripherals'},
   {'l1': 20,
    'l1_desc': 'IT Goods and Services',
    'l2': 2010,
    'l2_desc': 'Computers and Peripherals'}]],
 'documents': [['Desktop Computers for office workstations',
   'Monitors and Display Screens for computing',
   'Laptops for mobile workstations']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas', 'distances']}

In [20]:
from pydantic import BaseModel

In [21]:
class QueryResultMetadata(BaseModel):
    l1: int
    l1_desc: str
    l2: int
    l2_desc: str

In [22]:
class QueryResult(BaseModel):
    id: int
    metadata: QueryResultMetadata
    distance: float
    document: str

In [23]:
def parse_query_result(response: dict) -> QueryResult:
    result = []
    for _id, metadata, distance, document in zip(
        response["ids"][0], 
        response["metadatas"][0], 
        response["distances"][0],
        response["documents"][0],
        ):
        query_result = QueryResult(
            id=_id,
            metadata=metadata,
            distance=distance,
            document=document,
        )
        result.append(query_result)
    return result

In [24]:
parse_query_result(sample_result)

[QueryResult(id=201010, metadata=QueryResultMetadata(l1=20, l1_desc='IT Goods and Services', l2=2010, l2_desc='Computers and Peripherals'), distance=0.4113712253310119, document='Desktop Computers for office workstations'),
 QueryResult(id=201030, metadata=QueryResultMetadata(l1=20, l1_desc='IT Goods and Services', l2=2010, l2_desc='Computers and Peripherals'), distance=0.4324870137500332, document='Monitors and Display Screens for computing'),
 QueryResult(id=201020, metadata=QueryResultMetadata(l1=20, l1_desc='IT Goods and Services', l2=2010, l2_desc='Computers and Peripherals'), distance=0.43757759473900204, document='Laptops for mobile workstations')]

In [35]:
await get_connection_by_name("cc_caching")

2024-08-31 15:20:42,127 | INFO : Use pytorch device_name: cuda
2024-08-31 15:20:42,128 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-31 15:20:43,319 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1
2024-08-31 15:20:43,332 | INFO : HTTP Request: GET http://localhost:8010/api/v1/collections/cc_caching?tenant=default_tenant&database=default_database "HTTP/1.1 500 Internal Server Error"
2024-08-31 15:20:43,334 | INFO : {"error":"ValueError('Collection cc_caching does not exist.')"}
2024-08-31 15:20:43,335 | INFO : Use pytorch device_name: cuda
2024-08-31 15:20:43,336 | INFO : Load pretrained SentenceTransformer: ../models/embeddings/mxbai-embed-large-v1
2024-08-31 15:20:44,544 | INFO : Model loaded from ../models/embeddings/mxbai-embed-large-v1
2024-08-31 15:20:44,564 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections?tenant=default_tenant&database=default_database "HTTP/1.1 200 OK"


Collection(id=de74d45a-d354-4200-bcf5-068f066c952b, name=cc_caching)

# Insert Sample Data to Chromadb

In [12]:
metadata = [{'l1': 10,
  'l1_desc': 'Office Equipment',
  'l2': 1010,
  'l2_desc': 'Files and Stationery',
  'l3': 101010,
  'l3_desc': 'Pencil, Files, Envelopes for office equipment'},
 {'l1': 20,
  'l1_desc': 'IT Goods and Services',
  'l2': 2010,
  'l2_desc': 'Computers and Peripherals',
  'l3': 201010,
  'l3_desc': 'Desktop Computers for office workstations'},
 {'l1': 20,
  'l1_desc': 'IT Goods and Services',
  'l2': 2010,
  'l2_desc': 'Computers and Peripherals',
  'l3': 201030,
  'l3_desc': 'Monitors and Display Screens for computing'}]

query = "I want to buy keyboard and mouse for my computer."
id = str(uuid4())


In [13]:
caching_db_client = await get_connection_by_name("cc_caching")

2024-08-31 16:44:25,037 | INFO : HTTP Request: GET http://localhost:8010/api/v1/collections/cc_caching?tenant=default_tenant&database=default_database "HTTP/1.1 200 OK"


In [22]:
import json

In [23]:
json.dumps(metadata)

'[{"l1": 10, "l1_desc": "Office Equipment", "l2": 1010, "l2_desc": "Files and Stationery", "l3": 101010, "l3_desc": "Pencil, Files, Envelopes for office equipment"}, {"l1": 20, "l1_desc": "IT Goods and Services", "l2": 2010, "l2_desc": "Computers and Peripherals", "l3": 201010, "l3_desc": "Desktop Computers for office workstations"}, {"l1": 20, "l1_desc": "IT Goods and Services", "l2": 2010, "l2_desc": "Computers and Peripherals", "l3": 201030, "l3_desc": "Monitors and Display Screens for computing"}]'

In [24]:
await caching_db_client.add(
        ids=[str(uuid4())],
        metadatas=[{"response_data": json.dumps(metadata)}],
        documents=[query],
    )

2024-08-31 16:51:37,628 | INFO : Generating embeddings for 1 sentences with 1024 dimensions.
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.76it/s]
2024-08-31 16:51:38,004 | INFO : HTTP Request: GET http://localhost:8010/api/v1/pre-flight-checks "HTTP/1.1 200 OK"
2024-08-31 16:51:38,028 | INFO : HTTP Request: POST http://localhost:8010/api/v1/collections/de74d45a-d354-4200-bcf5-068f066c952b/add "HTTP/1.1 201 Created"


In [None]:
caching_db_client.add(
    
)