# ChromaDB Explorer

This notebook helps you explore the ChromaDB collections for the medical instructions RAG system.


In [1]:
import os
from pathlib import Path
import chromadb
from dotenv import load_dotenv
import pandas as pd

load_dotenv()

CHROMA_DIR = Path(os.getenv("CHROMA_DIR", "storage/chroma"))
CHROMA_MEDICINES_COLLECTION = os.getenv("CHROMA_MEDICINES_COLLECTION", "medicines")
CHROMA_RAG_COLLECTION = os.getenv("CHROMA_RAG_COLLECTION", "instruction_chunks")

print(f"ChromaDB location: {CHROMA_DIR}")


ChromaDB location: storage/chroma


## Connect to ChromaDB


In [2]:
client = chromadb.PersistentClient(path=str(CHROMA_DIR))
collections = client.list_collections()

print(f"Available collections: {len(collections)}")
for col in collections:
    print(f"  - {col.name}: {col.count()} documents")


Available collections: 1
  - medicines: 582 documents


## Explore Medicines Collection

In [3]:
try:
    medicines_collection = client.get_collection(CHROMA_MEDICINES_COLLECTION)
    
    # Get all data
    results = medicines_collection.get()
    
    print(f"Total medicines: {len(results['ids'])}")
    print(f"Has embeddings: {bool(results.get('embeddings'))}")
    
    if results.get('embeddings'):
        print(f"Embedding dimensions: {len(results['embeddings'][0]) if results['embeddings'] else 'N/A'}")
except Exception as e:
    print(f"Collection not found: {e}")


Total medicines: 582
Has embeddings: False


### View Sample Documents


In [4]:
if 'results' in locals() and results['ids']:
    # Show first 5 medicines
    for i in range(min(5, len(results['ids']))):
        print(f"\n{'='*60}")
        print(f"Medicine {i+1}: {results['ids'][i]}")
        print(f"{'='*60}")
        
        metadata = results['metadatas'][i] if results['metadatas'] else {}
        document = results['documents'][i] if results['documents'] else ""
        
        print(f"Ukrainian name: {metadata.get('ukrainian_name', 'N/A')}")
        print(f"International name: {metadata.get('international_name', 'N/A')}")
        print(f"Status: {metadata.get('fetch_status', 'N/A')}")
        print(f"HTML file: {metadata.get('html_file_path', 'N/A')}")
        print(f"MHT file: {metadata.get('mht_file_path', 'N/A')}")
        print(f"Instruction URL: {metadata.get('instruction_url', 'N/A')}")
        if document:
            print(f"Document preview: {document[:100]}..." if len(document) > 100 else f"Document: {document}")



Medicine 1: 4B54FAC4830861F2C22584B6002A242C
Ukrainian name: 1-МЕТИЛ-2-((ФЕНІЛТІО)МЕТИЛ)-3-КАРБЕТОКСИ-4-((ДИМЕТИЛАМІНО) МЕТИЛ)-5-ГІДРОКСИ-6-БРОМІНДОЛ ГІДРОХЛОРИД (УМІФЕНОВІРУ ГІДРОХЛОРИД)
International name: Umifenovir
Status: success
HTML file: data/html/4B54FAC4830861F2C22584B6002A242C.html
MHT file: 
Instruction URL: 
Document preview: 1-МЕТИЛ-2-((ФЕНІЛТІО)МЕТИЛ)-3-КАРБЕТОКСИ-4-((ДИМЕТИЛАМІНО) МЕТИЛ)-5-ГІДРОКСИ-6-БРОМІНДОЛ ГІДРОХЛОРИД...

Medicine 2: F7111E2E9C5AA7C2C225872600211DA9
Ukrainian name: 2,3,4,5,6-ПЕНТАГІДРОКСИКАПРОНОВА КИСЛОТА, КАЛІЄВА СІЛЬ
International name: Mono
Status: success
HTML file: data/html/F7111E2E9C5AA7C2C225872600211DA9.html
MHT file: 
Instruction URL: 
Document: 2,3,4,5,6-ПЕНТАГІДРОКСИКАПРОНОВА КИСЛОТА, КАЛІЄВА СІЛЬ

Medicine 3: 76AFFBAF0B19FF39C225872600217C5C
Ukrainian name: 2,3,4,5,6-ПЕНТАГІДРОКСИКАПРОНОВА КИСЛОТА, МАГНІЄВА СІЛЬ
International name: Mono
Status: success
HTML file: data/html/76AFFBAF0B19FF39C225872600217C5C.html
MHT file: 
Instruction UR

### Statistics


In [5]:
if 'results' in locals() and results['metadatas']:
    # Count by status
    statuses = {}
    for metadata in results['metadatas']:
        status = metadata.get('fetch_status', 'unknown')
        statuses[status] = statuses.get(status, 0) + 1
    
    print("Fetch Status Breakdown:")
    for status, count in sorted(statuses.items()):
        print(f"  {status}: {count}")
    
    # Count with MHT files
    with_mht = sum(1 for m in results['metadatas'] if m.get('mht_file_path'))
    print(f"\nMedicines with MHT files: {with_mht} / {len(results['ids'])}")
    
    # Count with HTML files
    with_html = sum(1 for m in results['metadatas'] if m.get('html_file_path'))
    print(f"Medicines with HTML files: {with_html} / {len(results['ids'])}")


Fetch Status Breakdown:
  pending: 482
  success: 100

Medicines with MHT files: 28 / 582
Medicines with HTML files: 100 / 582


### Create DataFrame for Easy Analysis


In [6]:
if 'results' in locals() and results['ids']:
    # Create DataFrame from metadata
    df = pd.DataFrame(results['metadatas'])
    df['id'] = results['ids']
    
    # Reorder columns
    cols = ['id', 'ukrainian_name', 'international_name', 'medicinal_product_name', 
            'fetch_status', 'html_file_path', 'mht_file_path', 'instruction_url']
    available_cols = [c for c in cols if c in df.columns]
    df = df[available_cols + [c for c in df.columns if c not in available_cols]]
    
    print(f"DataFrame shape: {df.shape}")
    print(f"\nFirst few rows:")
    display(df.head(10))


DataFrame shape: (582, 11)

First few rows:


Unnamed: 0,id,ukrainian_name,international_name,medicinal_product_name,fetch_status,html_file_path,mht_file_path,instruction_url,created_at,fetch_timestamp,updated_at
0,4B54FAC4830861F2C22584B6002A242C,1-МЕТИЛ-2-((ФЕНІЛТІО)МЕТИЛ)-3-КАРБЕТОКСИ-4-((Д...,Umifenovir,1-МЕТИЛ-2-((ФЕНІЛТІО)МЕТИЛ)-3-КАРБЕТОКСИ-4-((Д...,success,data/html/4B54FAC4830861F2C22584B6002A242C.html,,,2025-12-15T17:17:25.559977,2025-12-15T17:38:00.357707,2025-12-15T17:38:00.359729
1,F7111E2E9C5AA7C2C225872600211DA9,"2,3,4,5,6-ПЕНТАГІДРОКСИКАПРОНОВА КИСЛОТА, КАЛІ...",Mono,"2,3,4,5,6-ПЕНТАГІДРОКСИКАПРОНОВА КИСЛОТА, КАЛІ...",success,data/html/F7111E2E9C5AA7C2C225872600211DA9.html,,,2025-12-15T17:17:27.846851,2025-12-15T17:38:01.346255,2025-12-15T17:38:01.348416
2,76AFFBAF0B19FF39C225872600217C5C,"2,3,4,5,6-ПЕНТАГІДРОКСИКАПРОНОВА КИСЛОТА, МАГН...",Mono,"2,3,4,5,6-ПЕНТАГІДРОКСИКАПРОНОВА КИСЛОТА, МАГН...",success,data/html/76AFFBAF0B19FF39C225872600217C5C.html,,,2025-12-15T17:17:27.950204,2025-12-15T17:38:02.308094,2025-12-15T17:38:02.309168
3,C9AB892F576B11B6C225860800243A25,"3-(2,2,2-ТРИМЕТИЛГІДРАЗИНІЙ) ПРОПІОНАТУ ДИГІДР...",Meldonium,"3-(2,2,2-ТРИМЕТИЛГІДРАЗИНІЙ) ПРОПІОНАТУ ДИГІДР...",success,data/html/C9AB892F576B11B6C225860800243A25.html,,,2025-12-15T17:17:28.030245,2025-12-15T17:38:03.358330,2025-12-15T17:38:03.359652
4,D9291902511A677EC2258BDA00493595,3-ДІНІР,Cefdinir,3-ДІНІР,success,data/html/D9291902511A677EC2258BDA00493595.html,data/mht/UA178230101_D929.mht,http://www.drlz.com.ua/ibp/lz_www.nsf/id/8B197...,2025-12-15T17:17:28.106155,2025-12-15T17:38:05.226513,2025-12-15T17:38:05.227604
5,7853AE7986C81731C2258BE9002773D7,3-ДІНІР,Cefdinir,3-ДІНІР,success,data/html/7853AE7986C81731C2258BE9002773D7.html,data/mht/UA178230101_7853.mht,http://www.drlz.com.ua/ibp/lz_www.nsf/id/B17E9...,2025-12-15T17:17:28.181587,2025-12-15T17:38:06.959982,2025-12-15T17:38:06.961783
6,FC8276D549F8731CC2258BAF0039D317,3-ДІНІР,Cefdinir,3-ДІНІР,success,data/html/FC8276D549F8731CC2258BAF0039D317.html,,,2025-12-15T17:17:28.257320,2025-12-15T17:38:07.957645,2025-12-15T17:38:07.958937
7,E84CFE02A075231FC2258BF8003FEEAB,3-ДІНІР,Cefdinir,3-ДІНІР,success,data/html/E84CFE02A075231FC2258BF8003FEEAB.html,data/mht/UA178490101_E84C.mht,http://www.drlz.com.ua/ibp/lz_www.nsf/id/52AE1...,2025-12-15T17:17:28.332914,2025-12-15T17:38:09.618749,2025-12-15T17:38:09.619271
8,D4158EF86161F3A6C2258BDA004975B7,3-ДІНІР,Cefdinir,3-ДІНІР,success,data/html/D4158EF86161F3A6C2258BDA004975B7.html,data/mht/UA178490101_D415.mht,http://www.drlz.com.ua/ibp/lz_www.nsf/id/3B4C6...,2025-12-15T17:17:28.422884,2025-12-15T17:38:11.387890,2025-12-15T17:38:11.388895
9,60A8AF295D6D172BC225860000397F65,3-МЕТИЛБУТИРОВА КИСЛОТА,Mono,3-МЕТИЛБУТИРОВА КИСЛОТА,success,data/html/60A8AF295D6D172BC225860000397F65.html,,,2025-12-15T17:17:28.498673,2025-12-15T17:38:12.391401,2025-12-15T17:38:12.392735


### Filter and Search


In [7]:
# Example: Find medicines by status
if 'df' in locals():
    print("Medicines with 'success' status:")
    success_df = df[df['fetch_status'] == 'success']
    print(f"Count: {len(success_df)}")
    display(success_df[['id', 'ukrainian_name', 'fetch_status']].head())
    
    print("\nMedicines with 'failed' status:")
    failed_df = df[df['fetch_status'] == 'failed']
    print(f"Count: {len(failed_df)}")
    if len(failed_df) > 0:
        display(failed_df[['id', 'ukrainian_name', 'fetch_status', 'error_message']].head())


Medicines with 'success' status:
Count: 100


Unnamed: 0,id,ukrainian_name,fetch_status
0,4B54FAC4830861F2C22584B6002A242C,1-МЕТИЛ-2-((ФЕНІЛТІО)МЕТИЛ)-3-КАРБЕТОКСИ-4-((Д...,success
1,F7111E2E9C5AA7C2C225872600211DA9,"2,3,4,5,6-ПЕНТАГІДРОКСИКАПРОНОВА КИСЛОТА, КАЛІ...",success
2,76AFFBAF0B19FF39C225872600217C5C,"2,3,4,5,6-ПЕНТАГІДРОКСИКАПРОНОВА КИСЛОТА, МАГН...",success
3,C9AB892F576B11B6C225860800243A25,"3-(2,2,2-ТРИМЕТИЛГІДРАЗИНІЙ) ПРОПІОНАТУ ДИГІДР...",success
4,D9291902511A677EC2258BDA00493595,3-ДІНІР,success



Medicines with 'failed' status:
Count: 0


In [11]:
# Example: Search by name (case-insensitive)
if 'df' in locals():
    search_term = "ТРИМЕТИЛГІДРАЗИНІЙ"  # Change this to search for different medicines
    
    mask = df['ukrainian_name'].str.contains(search_term, case=False, na=False) | \
           df['international_name'].str.contains(search_term, case=False, na=False)
    
    matches = df[mask]
    print(f"Found {len(matches)} medicines matching '{search_term}':")
    if len(matches) > 0:
        display(matches[['id', 'ukrainian_name', 'international_name', 'fetch_status']])


Found 1 medicines matching 'ТРИМЕТИЛГІДРАЗИНІЙ':


Unnamed: 0,id,ukrainian_name,international_name,fetch_status
3,C9AB892F576B11B6C225860800243A25,"3-(2,2,2-ТРИМЕТИЛГІДРАЗИНІЙ) ПРОПІОНАТУ ДИГІДР...",Meldonium,success


In [12]:
try:
    rag_collection = client.get_collection(CHROMA_RAG_COLLECTION)
    
    rag_results = rag_collection.get(limit=10)
    
    print(f"Total chunks: {rag_collection.count()}")
    print(f"Sample chunks: {len(rag_results['ids'])}")
    
    if rag_results.get('embeddings'):
        print(f"Embedding dimensions: {len(rag_results['embeddings'][0])}")
    
    # Show first chunk
    if rag_results['ids']:
        print(f"\nFirst chunk:")
        print(f"ID: {rag_results['ids'][0]}")
        print(f"Document: {rag_results['documents'][0][:200]}...")
        if rag_results['metadatas']:
            print(f"Metadata: {rag_results['metadatas'][0]}")
except Exception as e:
    print(f"RAG collection not found yet (will be created in Phase 2): {e}")


RAG collection not found yet (will be created in Phase 2): Collection [instruction_chunks] does not exist
