In [7]:
%pip install -q chromadb

import uuid

import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [None]:
# Step 1 - Setup Chroma

def chroma_client_in_memory() -> chromadb.Client:
    """
    Build a Chroma client that runs embedded in the current process.

    Data is kept in memory by default, so it is lost when the process or
    notebook is closed or restarted.
    """
    ephemeral_client = chromadb.Client()
    return ephemeral_client

def chroma_client_persistent(persist_directory: str = "./chroma_db") -> chromadb.PersistentClient:
    """
    Build a Chroma client that runs embedded in the current process and
    stores its data persistently in the given directory.

    Args:
        persist_directory: Directory in which Chroma keeps its files.

    Returns:
        The client object produced by chromadb.PersistentClient(path=...).

    Note:
        No explicit save step is required after adding, deleting or updating
        data: the persistent client writes changes to disk on its own. The
        older client.persist() call is not part of this client's API (see the
        Chroma 0.4 migration notes).
    """
    return chromadb.PersistentClient(path=persist_directory)

def chroma_client_http(host: str = "localhost", port: int = 8000) -> chromadb.HttpClient:
    """
    Connect to a Chroma server that is already running.

    Ways to start the server:
    1. cmd: chroma run
    2. docker compose → compose.yml

    Args:
        host: Hostname or IP address of the Chroma server.
        port: TCP port the server listens on.

    Returns:
        The client object produced by chromadb.HttpClient(host=..., port=...).

    Tip:
        To check that the server is reachable: telnet 127.0.0.1 8000
        (substitute your own host and port when they differ from the defaults).
    """
    return chromadb.HttpClient(host=host, port=port)

# Choose exactly one backend for the rest of the notebook. The two alternatives
# below are kept commented out so they can be switched in quickly.
# client = chroma_client_in_memory()
# client = chroma_client_persistent("./chroma_vecter_text_db")
# Active choice: HTTP client, which expects a Chroma server on localhost:8000
# (see chroma_client_http for how to start one).
client = chroma_client_http()



In [9]:
# Step 2 - Create a collection together with an embedding function for text embedding
# https://docs.trychroma.com/docs/embeddings/embedding-functions#default-all-minilm-l6-v2
# `embedding_functions` is imported explicitly in the import cell rather than
# reached through `chromadb.utils....`, which only resolves when chromadb
# happens to have loaded that submodule internally.
collection = client.get_or_create_collection(
    name="demo_collection",
    embedding_function=embedding_functions.DefaultEmbeddingFunction(),
)

In [10]:
# Step 3 - Add documents to the collection
docs = [
    ("1", "The quick brown fox jumps over the lazy dog"),
    ("2", "A fast auburn fox leaps above a sleepy canine"),
    ("3", "An article about database systems and vector search"),
    ("4", "Deep learning and embeddings for natural language processing"),
]

# Use the stable id declared next to each document. Generating random UUIDs
# here meant every re-run of this cell inserted four brand-new records (the
# preceding delete could never match a freshly generated id), so a persistent
# or HTTP-backed collection accumulated duplicates and queries returned the
# same text several times.
ids = [doc_id for doc_id, _text in docs]
documents = [text for _doc_id, text in docs]

# upsert() inserts ids that are new and overwrites ids that already exist,
# which keeps this cell idempotent without a delete-then-add sequence or a
# swallowed exception.
# NOTE(review): records written by earlier runs under random UUIDs are not
# touched by this cell; clear the collection once if those are present.
collection.upsert(ids=ids, documents=documents)

In [11]:
# Step 4 - Try out a query
def print_result_x(idx, result, queries=None):
    """
    Print the documents and distances returned for one query of a batch.

    Args:
        idx: Zero-based position of the query within the batch.
        result: Mapping returned by collection.query(); the "documents" and
            "distances" entries are read as one list per query.
        queries: The query strings that were sent. When omitted, the
            notebook-level `query_texts` is used so that existing
            two-argument calls keep working.

    A field that is missing, None, or shorter than `idx` is treated as empty,
    so nothing is printed for it instead of raising.
    """
    if queries is None:
        # Backward-compatible fallback to the notebook-level variable.
        queries = query_texts

    print(f"{idx+1}. Query: {queries[idx]!r}")

    def rows_for(field):
        # result is a dict-of-lists; a field not requested via `include`
        # may be absent or None.
        per_query = result.get(field) or []
        if idx >= len(per_query):
            return []
        return per_query[idx] or []

    for doc, dist in zip(rows_for("documents"), rows_for("distances")):
        print(f"- distance={dist:.6f}  content={doc}")

# Two sample queries, sent to the collection as a single batch.
query1 = "fox dog"
query2 = "database vector search"
query_texts = [query1, query2]

# Request the two nearest documents per query, with distances, documents and
# metadatas in the response.
result = collection.query(
    query_texts=query_texts,
    n_results=2,
    include=["distances", "documents", "metadatas"],
)

# Print one report per query, in the order the queries were sent.
for idx, _query_text in enumerate(query_texts):
    print_result_x(idx, result)

1. Query: 'fox dog'
- distance=0.681373  content=The quick brown fox jumps over the lazy dog
- distance=0.841353  content=A fast auburn fox leaps above a sleepy canine
2. Query: 'database vector search'
- distance=0.270411  content=An article about database systems and vector search
- distance=1.577169  content=Deep learning and embeddings for natural language processing


In [None]:
"""
Check cache ได้ที่ path ~/.cache/chroma/onnx_models
"""
# Show the disk usage of ~/.cache/chroma/onnx_models. Errors from `du` are
# discarded, and if it fails (for example because the directory does not
# exist) the fallback message is echoed instead.
!du -h ~/.cache/chroma/onnx_models 2>/dev/null || echo 'No cache directory found'