<a href="https://colab.research.google.com/github/Mishti-05/KGandVectorDBProject/blob/master/Using_ChromaDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install chromadb


Collecting chromadb
  Downloading chromadb-1.0.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.0.1-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.33.0-py3-none-any.whl.metadata (2.5 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)
  Downloading opentelemetry_instrumentation_fastapi-0.54b0-py3-none-any.whl.metadata (2.2 kB)
Collecting pypika>=0.48.9 (from 

In [5]:
import pandas as pd
import json
import numpy as np
from pathlib import Path
from tqdm import tqdm
import os
import chromadb
from chromadb.config import Settings

# Open an on-disk Chroma store under ./chroma_db.
# chromadb.Client(Settings(persist_directory=...)) does not enable persistence
# on its own (Settings.is_persistent defaults to False), so it yields an
# in-memory client and the directory is never written. PersistentClient is the
# documented entry point for a local store that survives kernel restarts.
# Note: anything persisted by an earlier run stays in the store until deleted.
chroma_client = chromadb.PersistentClient(path="chroma_db")

# Reuse the collection if it already exists so re-running this cell does not fail.
collection = chroma_client.get_or_create_collection(name="amazon_entities")

# Input and output locations used by the rest of the notebook.
config = {
    "data_path": "/content/train_40k.csv",
    "kg_output_path": "output/amazon_kg.json",
    "vectdb_output_path": "output/amazon_embeddings.json",
}

# Ensure the parent directory of every output file exists.
# Path.parent is "." for a bare file name, so this also works when an output
# path has no directory component; os.makedirs(os.path.dirname(p)) would be
# called with "" in that case and raise FileNotFoundError.
for output_key in ("kg_output_path", "vectdb_output_path"):
    Path(config[output_key]).parent.mkdir(parents=True, exist_ok=True)

# Load the dataset; `records` holds one dict per CSV row, keyed by column name.
df = pd.read_csv(config["data_path"])
records = df.to_dict(orient="records")

# Build the KG triples and collect every entity that will need an embedding.
# NOTE(review): edges are emitted as product -belongs_to-> Cat1,
# Cat1 -subclass_of-> Cat2 and Cat2 -subclass_of-> Cat3. If Cat1 is the
# broadest level of the hierarchy this direction is inverted — confirm against
# the dataset before relying on it.
kg_columns = ("productId", "Cat1", "Cat2", "Cat3")
unique_triples = {}  # dict keys act as an insertion-ordered set
entities = set()

for row in tqdm(records):
    values = [row[column] for column in kg_columns]
    # Skip incomplete rows: a NaN cell is not a usable entity name and would
    # otherwise end up as a non-string id in `entities`.
    if any(pd.isna(value) for value in values):
        continue
    # Normalise to str so numeric-looking ids and names are handled uniformly.
    pid, cat1, cat2, cat3 = (str(value) for value in values)

    # The category edges repeat on every row that shares the same categories;
    # record each distinct triple once, in first-seen order.
    for triple in (
        (pid, "belongs_to", cat1),
        (cat1, "subclass_of", cat2),
        (cat2, "subclass_of", cat3),
    ):
        unique_triples.setdefault(triple, None)
    entities.update((pid, cat1, cat2, cat3))

triples = list(unique_triples)

# Generate dummy embeddings: uniform random placeholders in [0, 1), one per entity.
# A fixed seed plus a sorted entity order makes every entity receive the same
# vector on each run. Iterating the raw set would not, because string hashing
# (and therefore set order) changes between Python processes.
embedding_dim = 128
embedding_seed = 42
embedding_rng = np.random.default_rng(embedding_seed)

# key=str keeps the ordering well-defined even if a non-string id slipped in.
ids = sorted(entities, key=str)
embedding_matrix = embedding_rng.random((len(ids), embedding_dim))
embeddings = dict(zip(ids, embedding_matrix.tolist()))

# Add to Chroma in chunks; sending every id in a single call would exceed the
# maximum batch size the client allows.
batch_size = 5000  # must stay below that maximum

for start in tqdm(range(0, len(ids), batch_size)):
    batch_ids = ids[start:start + batch_size]
    batch_embeddings = [embeddings[entity_id] for entity_id in batch_ids]
    batch_documents = batch_ids  # the entity name doubles as the document text

    # upsert instead of add: re-running the cell overwrites ids that are
    # already stored rather than leaving stale vectors behind.
    collection.upsert(
        ids=batch_ids,
        embeddings=batch_embeddings,
        documents=batch_documents,
    )


# Write the knowledge graph as a JSON array of {"head", "relation", "tail"} objects.
kg_payload = []
for head, relation, tail in triples:
    kg_payload.append({"head": head, "relation": relation, "tail": tail})

with open(config["kg_output_path"], "w") as kg_file:
    json.dump(kg_payload, kg_file, indent=2)

# Write the entity -> embedding mapping as a single JSON object.
with open(config["vectdb_output_path"], "w") as embeddings_file:
    json.dump(embeddings, embeddings_file, indent=2)

# Smoke test: ask the collection for the 5 stored vectors nearest to a random query.
random_query = np.random.rand(embedding_dim)
query_vector = random_query.tolist()
results = collection.query(
    n_results=5,
    query_embeddings=[query_vector],
)
print("Sample vector DB results:", results)

# Report where the two JSON artifacts were written.
print(f"✅ Knowledge Graph saved to {config['kg_output_path']}")
print(f"✅ Embeddings saved to {config['vectdb_output_path']}")


100%|██████████| 40000/40000 [00:00<00:00, 475018.22it/s]
100%|██████████| 5/5 [00:14<00:00,  2.96s/it]


Sample vector DB results: {'ids': [['B0000ABOHR', 'B00009XO8U', 'B000FJTZLQ', 'B0006MU0PM', 'B00000IZHP']], 'embeddings': None, 'documents': [['B0000ABOHR', 'B00009XO8U', 'B000FJTZLQ', 'B0006MU0PM', 'B00000IZHP']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None, None, None, None]], 'distances': [[13.557466506958008, 14.204630851745605, 14.293858528137207, 14.384225845336914, 14.458332061767578]]}
✅ Knowledge Graph saved to output/amazon_kg.json
✅ Embeddings saved to output/amazon_embeddings.json
