In [13]:
import json
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os
from typing import List
import chromadb

In [14]:
# Load the raw document records that will be indexed.
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform locale, which can mis-decode non-ASCII text in the JSON file.
with open('data_set.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [15]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Embedding function that delegates to Gemini's ``models/embedding-001``.

    The API key is read from the ``GEMINI_API_KEY`` environment variable on
    every call, so no credential is stored in the notebook itself.
    """

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` with Gemini and return the resulting embeddings.

        Args:
            input: The documents to embed.

        Returns:
            The ``"embedding"`` entry of the ``genai.embed_content`` response.

        Raises:
            ValueError: If ``GEMINI_API_KEY`` is unset or empty.
        """
        # Never hardcode the key: a literal committed in a notebook is readable
        # by anyone with access to the file or its history. Any key that was
        # previously embedded here should be treated as compromised and rotated.
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")

        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"

        response = genai.embed_content(model=model,
                                       content=input,
                                       task_type="retrieval_document",
                                       title=title)
        return response["embedding"]

In [16]:
def create_chroma_db(documents: List[dict], path: str, name: str):
    """Index ``documents`` into a persistent Chroma collection.

    Args:
        documents: Records to index. Entries that are not dicts are reported
            and skipped. The keys read from each record are ``id``,
            ``content``, ``title``, ``category`` and ``metadata["keywords"]``.
        path: Directory passed to ``chromadb.PersistentClient``.
        name: Name of the collection to open or create.

    Returns:
        A ``(collection, name)`` tuple.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    # get_or_create (rather than create) so that re-running this cell against
    # an already-populated store does not fail because the collection exists.
    db = chroma_client.get_or_create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for doc in documents:
        # Ensure the document is a dictionary
        if not isinstance(doc, dict):
            print(f"Invalid document format: {doc}")
            continue

        doc_id = doc.get("id", "")
        content = doc.get("content", "")

        # "metadata" may be missing, null, or not a dict; treat all of those
        # as "no keywords" instead of failing on .get().
        doc_metadata = doc.get("metadata")
        keywords = doc_metadata.get("keywords", []) if isinstance(doc_metadata, dict) else []
        if isinstance(keywords, list):
            # Store keywords as one comma-separated string; str() keeps
            # non-string entries (e.g. numbers) from breaking the join.
            keywords = ", ".join(str(keyword) for keyword in keywords)

        metadata = {
            "title": doc.get("title", ""),
            "category": doc.get("category", ""),
            "keywords": keywords  # Store as a string
        }

        if content:
            # upsert instead of add: on a re-run the existing entry with the
            # same id is refreshed rather than colliding with the old one.
            db.upsert(documents=[content], ids=[doc_id], metadatas=[metadata])
        else:
            print(f"Document {doc_id} has no content. Skipping...")

    return db, name

# Index the loaded records into the persistent Chroma store.
# NOTE(review): the store location is an absolute, machine-specific path —
# consider moving it to a configurable constant.
db, name = create_chroma_db(
    documents=data,
    path="D:\\subject projects\\RAG\\database",
    name="hospital_documents_data2",
)
