In [1]:
from dotenv import load_dotenv
import os

# Load API credentials from a .env file. The location can be overridden with the
# HEALTH_NAVIGATOR_ENV environment variable so the notebook is not tied to one
# machine; the original absolute path is kept as the fallback.
load_dotenv(os.getenv("HEALTH_NAVIGATOR_ENV", r'C:\My Projects\Health-Navigator\credentials.env'))

True

In [2]:
import nest_asyncio
# Patch asyncio so an event loop can be re-entered. Presumably required because
# the notebook kernel already runs a loop while retrieve() below builds its
# fusion retriever with use_async=True — TODO confirm.
nest_asyncio.apply()

In [3]:
import chromadb
from llama_index.core import VectorStoreIndex, Document, StorageContext, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
from typing import List, Dict
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.core.vector_stores import MetadataFilters, ExactMatchFilter
from llama_index.core.vector_stores import MetadataFilter, FilterOperator

import os


class HybridVectorDB:
    """Hybrid retrieval (vector similarity + BM25) over a persistent ChromaDB collection.

    Vector search runs against the Chroma-backed ``VectorStoreIndex``. BM25 runs
    over ``self.all_nodes``, an in-memory list of nodes that is loaded once at
    construction time and extended by every successful ``add_text`` call.
    """

    def __init__(self, db_path: str = "./chroma_db", collection_name: str = "documents",
                 google_api_key: str = None, model_name: str = "models/embedding-001"):
        """Connect to an existing ChromaDB directory and set up Google embeddings.

        Side effect: sets the global ``Settings.embed_model`` to this instance's
        embedding model.

        Args:
            db_path: Directory of the persistent ChromaDB store; must already exist.
            collection_name: Collection to open (created if it is missing).
            google_api_key: Google API key; falls back to the GOOGLE_API_KEY
                environment variable when omitted.
            model_name: Embedding model name passed to ``GoogleGenAIEmbedding``.

        Raises:
            ValueError: If no API key is passed and GOOGLE_API_KEY is not set.
            FileNotFoundError: If ``db_path`` does not exist.
        """
        self.all_nodes = []
        self.db_path = db_path
        self.collection_name = collection_name

        # Setup Google embeddings
        api_key = google_api_key or os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("Google API key required. Pass google_api_key or set GOOGLE_API_KEY env variable")

        # LLM handed to QueryFusionRetriever in retrieve().
        self.llm = GoogleGenAI(model="gemini-2.5-flash-lite-preview-09-2025", api_key=api_key)

        self.embed_model = GoogleGenAIEmbedding(
            model_name=model_name,
            api_key=api_key
        )
        Settings.embed_model = self.embed_model

        # Verify DB exists
        if not os.path.exists(db_path):
            raise FileNotFoundError(f"ChromaDB path '{db_path}' does not exist")

        # Connect to ChromaDB (cosine distance for the HNSW index)
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection(
            collection_name,
            configuration={
                "hnsw": {"space": "cosine"},
            }
        )

        # Setup vector store
        self.vector_store = ChromaVectorStore(chroma_collection=self.collection)
        self.storage_context = StorageContext.from_defaults(vector_store=self.vector_store)

        # Load index
        self.index = VectorStoreIndex.from_vector_store(
            self.vector_store,
            embed_model=self.embed_model
        )

        self.parser = SentenceSplitter(
            chunk_size=1024,  # Larger for medical context
            chunk_overlap=200,
            separator="\n\n"  # Split by paragraphs/sections
        )

        # Rebuild the BM25 corpus from what is already persisted.
        self.all_nodes = self._load_existing_nodes()

    def _load_existing_nodes(self) -> List:
        """Return the nodes already stored in the collection, for use by BM25.

        Reads every row of the Chroma collection and rebuilds a node from its
        stored metadata and text. If that fails or yields nothing, falls back to
        the index docstore, and finally to an empty list. Failures are printed,
        never raised, so construction still succeeds with a vector-only setup.
        """
        try:
            # Local imports: both come from llama_index.core, which this file
            # already depends on.
            from llama_index.core.schema import TextNode
            from llama_index.core.vector_stores.utils import metadata_dict_to_node

            stored = self.collection.get(include=["documents", "metadatas"])
            ids = stored.get("ids") or []
            texts = stored.get("documents") or [None] * len(ids)
            metadatas = stored.get("metadatas") or [None] * len(ids)

            nodes = []
            for node_id, text, raw_meta in zip(ids, texts, metadatas):
                meta = dict(raw_meta or {})
                try:
                    # Rows written through ChromaVectorStore carry the serialized
                    # node in their metadata; the text is stored separately.
                    node = metadata_dict_to_node(meta)
                    node.set_content(text or "")
                except Exception:
                    # Row without serialized node info: keep text + raw metadata.
                    node = TextNode(id_=node_id, text=text or "", metadata=meta)
                nodes.append(node)
            if nodes:
                return nodes
        except Exception as e:
            print(f"Could not load existing nodes from ChromaDB: {e}")

        try:
            docstore = self.index.docstore
            if docstore:
                return list(docstore.docs.values())
        except Exception as e:
            print(f"Could not load existing nodes from docstore: {e}")
        return []

    def _date_to_int(self, date_str) -> int:
        """Convert date string 'YYYY-MM-DD' to integer YYYYMMDD (ints pass through)."""
        if isinstance(date_str, int):
            return date_str
        return int(date_str.replace("-", ""))

    def add_text(self, text: str, metadata: Dict = None) -> bool:
        """
        Split ``text`` into nodes and add them to the index and the BM25 corpus.

        Args:
            text: String content to add
            metadata: Optional metadata dictionary. A "date" entry given as
                'YYYY-MM-DD' is stored as the integer YYYYMMDD. The caller's
                dictionary is not modified.

        Returns:
            bool: True if successful, False if insertion failed (the error is printed)
        """
        # Work on a copy so the caller's dict keeps its original date string.
        metadata = dict(metadata) if metadata else {}

        # Before inserting, convert date to int if present
        if "date" in metadata:
            metadata["date"] = self._date_to_int(metadata["date"])

        try:
            # Create document
            doc = Document(text=text, metadata=metadata)

            # Parse into nodes
            nodes = self.parser.get_nodes_from_documents([doc])

            # Insert into index
            self.index.insert_nodes(nodes)

            # Store for BM25
            self.all_nodes.extend(nodes)

            return True
        except Exception as e:
            print(f"Error adding text: {e}")
            return False

    def _build_metadata_filters(self, filters: Dict = None, date: str = None, date_filter: str = None):
        """Build the MetadataFilters for the vector retriever, or None when nothing applies."""
        filter_list = [
            ExactMatchFilter(key=k, value=v)
            for k, v in (filters or {}).items()
        ]

        # The date filter only applies when both the date and its direction are given.
        if date and date_filter:
            operator_map = {
                "before": FilterOperator.LTE,
                "at": FilterOperator.EQ,
                "after": FilterOperator.GTE
            }
            if date_filter not in operator_map:
                raise ValueError("date_filter must be 'before', 'at', or 'after'")

            filter_list.append(
                MetadataFilter(key="date", value=self._date_to_int(date), operator=operator_map[date_filter])
            )

        return MetadataFilters(filters=filter_list) if filter_list else None

    def _filter_nodes_for_bm25(self, nodes: List, filters: Dict = None, date: str = None, date_filter: str = None) -> List:
        """Apply the same exact-match and date constraints to the in-memory BM25 corpus."""
        wanted = (filters or {}).items()
        selected = [
            node for node in nodes
            if all(node.metadata.get(k) == v for k, v in wanted)
        ]

        if date and date_filter:
            cutoff = self._date_to_int(date)
            compare = {
                "before": lambda value: value <= cutoff,
                "at": lambda value: value == cutoff,
                "after": lambda value: value >= cutoff,
            }.get(date_filter)
            if compare is None:
                return []
            # Nodes without a date never match a date-constrained query.
            selected = [
                node for node in selected
                if node.metadata.get("date") and compare(self._date_to_int(node.metadata.get("date")))
            ]

        return selected

    def retrieve(self, query: str, top_k: int = 40, initial_k: int = None,
             similarity_threshold = 0.3, filters: Dict = None,
             date: str = None, date_filter: str = None):
        """
        Retrieve relevant texts using hybrid search (semantic + BM25).

        When no in-memory node matches the filters (or none are loaded), only
        the vector retriever is used.

        Args:
            query: Search query string
            top_k: Maximum number of results to return
            initial_k: Candidates requested from each retriever; defaults to
                max(top_k * 3, 30)
            similarity_threshold: Results with a score below this value are dropped.
                NOTE(review): hybrid results are scored by the "reciprocal_rerank"
                fusion mode, so their scale may differ from vector-only scores —
                confirm that the default suits both paths.
            filters: Exact-match metadata constraints, e.g. {"type": "blood_test"}
            date: Reference date, 'YYYY-MM-DD' or integer YYYYMMDD
            date_filter: 'before' (<=), 'at' (==) or 'after' (>=). Ignored unless
                ``date`` is also given, and vice versa.

        Returns:
            List of dicts with keys "text", "metadata" and "score". Any error
            (including an invalid ``date_filter``) is printed and an empty list
            is returned.
        """
        try:
            initial_k = initial_k or max(top_k * 3, 30)

            metadata_filters = self._build_metadata_filters(filters, date, date_filter)

            # Semantic retriever
            vector_retriever = VectorIndexRetriever(
                index=self.index,
                similarity_top_k=initial_k,
                filters=metadata_filters
            )

            # BM25 has no filter support here, so pre-filter its corpus.
            bm25_nodes = self._filter_nodes_for_bm25(self.all_nodes, filters, date, date_filter)

            if not bm25_nodes:
                # No matching nodes for BM25, use only vector retriever
                retrieved_nodes = vector_retriever.retrieve(query)
            else:
                bm25_retriever = BM25Retriever.from_defaults(
                    nodes=bm25_nodes,
                    similarity_top_k=initial_k,
                )

                # Hybrid retriever
                hybrid_retriever = QueryFusionRetriever(
                    retrievers=[vector_retriever, bm25_retriever],
                    similarity_top_k=initial_k,
                    num_queries=3,
                    mode="reciprocal_rerank",
                    use_async=True,
                    llm=self.llm
                )

                retrieved_nodes = hybrid_retriever.retrieve(query)

            # Filter by similarity threshold
            scored_nodes = [
                node for node in retrieved_nodes
                if getattr(node, "score", None) is not None and node.score >= similarity_threshold
            ]

            return [
                {"text": node.text, "metadata": node.metadata, "score": node.score}
                for node in scored_nodes[:top_k]
            ]

        except Exception as e:
            print(f"Error retrieving: {e}")
            import traceback
            traceback.print_exc()
            return []


resource module not available on Windows


In [8]:
# Open the on-disk collection. No key is passed, so the constructor reads
# GOOGLE_API_KEY from the environment (pass google_api_key=... to override).
db = HybridVectorDB(db_path="./chroma_db", collection_name="documents")

# Smoke test: insert a single short document and report whether it succeeded.
success = db.add_text(
    text="ChromaDB is an open-source vector database designed for AI applications.",
    metadata={"source": "documentation", "topic": "databases"},
)
print(f"Added: {success}")

# Plain-text sample corpus (general health, ML, infrastructure and diagnostics topics).
# NOTE(review): `documents` is reassigned to a list of Document objects before the
# insertion loop further down, so none of these texts are ever added to the
# database — confirm whether they should be inserted or removed.
documents = [
"""Cardiovascular disease remains one of the leading causes of death worldwide. Major risk factors include high blood pressure, elevated cholesterol levels, smoking, physical inactivity, and poor diet. Regular aerobic exercise improves heart function by strengthening the cardiac muscle and improving blood circulation. Diets rich in fruits, vegetables, whole grains, and healthy fats such as olive oil have been shown to reduce inflammation and lower the risk of coronary artery disease. Early screening and consistent lifestyle changes play a crucial role in prevention.""",

"""Machine learning is increasingly used in healthcare to assist in diagnosis, prediction, and treatment planning. Models trained on electronic health records can predict patient readmission, detect anomalies in lab results, and support clinical decision-making. However, challenges remain, including data bias, lack of interpretability, and privacy concerns. Explainable AI methods aim to make predictions more transparent so clinicians can trust and validate model outputs.""",

"""Climate change refers to long-term shifts in temperature and weather patterns, primarily driven by human activities such as burning fossil fuels. Rising global temperatures have led to more frequent heatwaves, melting glaciers, rising sea levels, and extreme weather events. These changes threaten food security, ecosystems, and human health. International cooperation and sustainable energy transitions are essential to mitigate future damage.""",

"""Scalable software systems are designed to handle increasing workloads without performance degradation. Common approaches include microservices architecture, horizontal scaling, load balancing, and caching. Distributed systems must also address challenges such as network latency, fault tolerance, and data consistency. Choosing the right architecture depends on traffic patterns, business requirements, and operational complexity.""",

"""Sleep is essential for cognitive performance, memory consolidation, and emotional regulation. Chronic sleep deprivation negatively affects attention, reaction time, and decision-making. Deep sleep stages play a critical role in learning, while REM sleep is linked to creativity and problem-solving. Maintaining a consistent sleep schedule and limiting screen exposure before bedtime can significantly improve mental focus and productivity."""
]

documents.extend([
    """Artificial intelligence systems rely heavily on high-quality data for training and evaluation. Poorly labeled or imbalanced datasets can lead to biased models that perform well in controlled environments but fail in real-world scenarios. Data preprocessing steps such as normalization, feature selection, and outlier removal are critical for improving model robustness. Continuous monitoring is also required to detect data drift over time.""",

    """The human brain processes information through complex networks of neurons that communicate using electrical and chemical signals. Learning occurs when synaptic connections strengthen or weaken based on experience. Neuroplasticity allows the brain to adapt after injury and during skill acquisition. Understanding these mechanisms has influenced the development of neural networks in artificial intelligence.""",

    """Cybersecurity focuses on protecting systems, networks, and data from digital attacks. Common threats include phishing, ransomware, and distributed denial-of-service attacks. Effective security strategies combine technical controls such as encryption and intrusion detection with user education and regular system updates. As systems become more interconnected, attack surfaces continue to expand.""",

    """Natural language processing enables machines to understand and generate human language. Techniques range from traditional rule-based systems to modern transformer-based models. Applications include machine translation, sentiment analysis, chatbots, and information retrieval. Evaluation remains challenging due to ambiguity, context dependence, and cultural variation in language use.""",

    """Renewable energy sources such as solar, wind, and hydroelectric power are central to reducing greenhouse gas emissions. Advances in battery storage and smart grids have improved reliability and scalability. Despite higher initial costs, renewables offer long-term economic and environmental benefits. Government policies and incentives play a key role in accelerating adoption."""
])

documents.extend([
    """Distributed systems consist of multiple independent computers that work together as a single system. Key challenges include network latency, partial failures, and data consistency. Techniques such as replication, consensus algorithms, and load balancing are used to improve reliability and scalability. Designing distributed systems requires trade-offs between consistency, availability, and partition tolerance.""",

    """Machine learning models must be evaluated carefully to ensure they generalize beyond training data. Overfitting occurs when a model memorizes patterns rather than learning meaningful representations. Cross-validation, regularization, and proper train-test splits help mitigate this risk. Model interpretability is increasingly important in high-stakes domains such as healthcare and finance.""",

    """Databases are optimized for storing, querying, and managing structured information efficiently. Relational databases use tables and schemas, while NoSQL databases prioritize flexibility and horizontal scalability. Indexing strategies and query optimization significantly affect performance. Choosing the right database depends on workload, consistency requirements, and data access patterns.""",

    """Cloud computing provides on-demand access to computing resources over the internet. Services are typically offered as infrastructure, platforms, or software. Benefits include elasticity, reduced upfront costs, and global availability. However, organizations must manage security, vendor lock-in, and cost optimization carefully.""",

    """Reinforcement learning is a paradigm where agents learn by interacting with an environment and receiving feedback in the form of rewards. Policies are improved through exploration and exploitation. Applications include robotics, game playing, and resource optimization. Training can be unstable and sample-inefficient without careful reward design and tuning."""
])

documents.extend([
    """Electronic health records (EHRs) store comprehensive patient information including medical history, diagnoses, medications, lab results, and clinical notes. Proper structuring of EHR data enables clinical decision support systems, population health analytics, and interoperability across healthcare providers. Data quality, privacy, and standardization remain major challenges.""",

    """Medical imaging techniques such as MRI, CT scans, and X-rays are essential for diagnosis and treatment planning. Advances in deep learning have improved image segmentation, anomaly detection, and disease classification. Despite high accuracy, clinical adoption requires rigorous validation, explainability, and regulatory approval.""",

    """Clinical decision support systems assist healthcare professionals by providing evidence-based recommendations at the point of care. These systems integrate patient data with medical knowledge to flag potential risks, drug interactions, or diagnostic suggestions. Poor system design can lead to alert fatigue and reduced clinician trust.""",

    """Public health surveillance focuses on monitoring disease outbreaks, vaccination coverage, and health trends across populations. Timely data collection and analysis enable early intervention and policy decisions. Digital health tools and real-time dashboards have improved responsiveness during global health emergencies.""",

    # Apostrophe below restored from a mis-encoded character sequence.
    """Personalized medicine tailors treatment strategies based on an individual’s genetics, lifestyle, and clinical history. Genomic sequencing and biomarker analysis play a key role in identifying targeted therapies. Ethical concerns include data ownership, equity of access, and long-term storage of sensitive biological data."""
])

documents.extend([
    """Blood tests are among the most common diagnostic tools in medicine, providing insights into organ function, immune response, and metabolic health. Measurements such as complete blood count, liver enzymes, and electrolyte levels help clinicians detect infections, anemia, inflammation, and systemic diseases early.""",

    """Computed Tomography (CT) scans use X-rays and computer processing to create detailed cross-sectional images of the body. They are widely used to detect tumors, internal bleeding, fractures, and lung diseases. CT imaging offers fast results but exposes patients to higher radiation compared to standard X-rays.""",

    """Magnetic Resonance Imaging (MRI) relies on strong magnetic fields and radio waves to generate high-resolution images of soft tissues. MRI is especially useful for brain, spinal cord, joint, and soft tissue evaluation. Unlike CT scans, MRI does not involve ionizing radiation, but scan times are longer.""",

    """X-ray imaging is a fast and widely available diagnostic method commonly used to assess bones, lungs, and the chest cavity. It plays a critical role in detecting fractures, pneumonia, and dental issues. Although radiation exposure is low, repeated imaging should be clinically justified.""",

    """Ultrasound imaging uses high-frequency sound waves to visualize internal organs and blood flow in real time. It is commonly used in obstetrics, cardiology, and abdominal examinations. Ultrasound is safe, portable, and radiation-free, but image quality depends heavily on operator skill.""",

    """Blood chemistry panels measure substances such as glucose, cholesterol, creatinine, and electrolytes. These tests help evaluate kidney function, cardiovascular risk, and metabolic disorders like diabetes. Abnormal values often require follow-up testing and clinical correlation.""",

    """Histopathology involves microscopic examination of tissue samples obtained through biopsy or surgery. It is the gold standard for diagnosing many cancers and inflammatory diseases. Accurate interpretation depends on proper sample preparation and experienced pathologists.""",

    """Nuclear medicine scans use small amounts of radioactive tracers to assess organ function and metabolic activity. Tests such as PET and SPECT scans are valuable in oncology, cardiology, and neurology. These scans provide functional information rather than detailed anatomical structure.""",

    """Serological tests detect antibodies or antigens in the blood to identify infections, autoimmune disorders, or immune responses. They are commonly used for diseases such as hepatitis, HIV, and COVID-19. Timing of testing is critical for accurate interpretation of results.""",

    """Pulmonary function tests evaluate lung capacity, airflow, and gas exchange. These tests are essential for diagnosing and monitoring respiratory conditions like asthma, COPD, and pulmonary fibrosis. Results must be interpreted alongside clinical symptoms and imaging findings."""
])

# Synthetic patient-record fixtures. Each Document carries metadata (type, date,
# category, history, plus body_part or test) that the filtered query cell relies on.
# NOTE: this assignment replaces whatever `documents` held before, so the
# plain-text list built earlier in this cell is discarded at this point.
documents = [
    Document(
        text="""The patient underwent an MRI brain scan on 2023-06-14 due to persistent headaches. Compared to the previous scan from 2022, the imaging showed an increase in the size of the benign tumor located in the frontal lobe, prompting a recommendation for closer follow-up and possible neurosurgical consultation.""",
        metadata={"type": "MRI", "body_part": "brain", "date": "2023-06-14", "category": "imaging", "history": "follow_up"}
    ),
    Document(
        text="""A CT scan of the chest was performed on 2024-01-22 after the patient reported shortness of breath. The scan revealed mild progression of previously noted lung nodules when compared to imaging from the prior year.""",
        metadata={"type": "CT", "body_part": "chest", "date": "2024-01-22", "category": "imaging", "history": "comparison"}
    ),
    Document(
        text="""The patient completed a full blood panel on 2023-11-03. Results indicated elevated cholesterol and slightly increased fasting glucose levels compared to earlier tests.""",
        metadata={"type": "blood_test", "test": "full_panel", "date": "2023-11-03", "category": "lab", "history": "trend"}
    ),
    Document(
        text="""An abdominal ultrasound conducted on 2022-09-18 showed fatty liver changes consistent with earlier findings. No significant progression was observed since the last ultrasound.""",
        metadata={"type": "ultrasound", "body_part": "abdomen", "date": "2022-09-18", "category": "imaging", "history": "stable"}
    ),
    Document(
        text="""The patient had an X-ray of the right knee on 2023-04-10 following chronic pain complaints. Imaging demonstrated increased joint space narrowing compared to previous studies.""",
        metadata={"type": "xray", "body_part": "knee", "date": "2023-04-10", "category": "imaging", "history": "progression"}
    ),
    Document(
        text="""An MRI of the lumbar spine was performed on 2024-02-05 due to lower back pain radiating to the leg. The scan showed a slightly larger disc herniation at L5-S1 compared to the MRI from 2022.""",
        metadata={"type": "MRI", "body_part": "lumbar_spine", "date": "2024-02-05", "category": "imaging", "history": "comparison"}
    ),
    Document(
        text="""Blood test results from 2024-03-01 revealed declining hemoglobin levels relative to tests done six months earlier, suggesting developing anemia.""",
        metadata={"type": "blood_test", "test": "hemoglobin", "date": "2024-03-01", "category": "lab", "history": "trend"}
    ),
    Document(
        text="""A cardiac echocardiogram completed on 2023-08-27 showed stable left ventricular function compared to the prior exam, with mild valve regurgitation.""",
        metadata={"type": "echocardiogram", "body_part": "heart", "date": "2023-08-27", "category": "imaging", "history": "stable"}
    ),
    Document(
        text="""The patient underwent a follow-up mammogram on 2024-01-15. Imaging showed a slight increase in the size of a previously identified mass.""",
        metadata={"type": "mammogram", "body_part": "breast", "date": "2024-01-15", "category": "imaging", "history": "follow_up"}
    ),
    Document(
        text="""A PET scan performed on 2023-12-09 for cancer monitoring demonstrated increased metabolic activity in a known lesion compared to earlier scans.""",
        metadata={"type": "PET", "body_part": "whole_body", "date": "2023-12-09", "category": "imaging", "history": "oncology_follow_up"}
    ),


    # Dated 2025-03-01, i.e. after the 2024-06-01 cutoff used by the
    # date_filter="before" query, so that query should exclude it.
    Document(
        text="""This is a new blood test, Blood test results from 2025-03-01 revealed declining hemoglobin levels relative to tests done six months earlier, suggesting developing anemia.""",
        metadata={"type": "blood_test", "test": "hemoglobin", "date": "2025-03-01", "category": "lab", "history": "trend"}
    ),
]

# Insert each record through add_text (text + metadata); the boolean result is not checked.
for document in documents:
    db.add_text(document.text, document.metadata)

Added: True


In [18]:
# Filtered hybrid query: blood tests dated on or before 2024-06-01.
query = "Blood tests done for the user"
results = db.retrieve(
    query,
    top_k=100,
    similarity_threshold=0.05,
    filters={"type": "blood_test"},
    date="2024-06-01",
    date_filter="before",
)

print(f"\nQuery: {query}")
print(f"Results found: {len(results)}\n")

# One numbered line per hit, each followed by a blank line.
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. {hit}\n")

As bm25s.BM25 requires k less than or equal to number of nodes added. Overriding the value of similarity_top_k to number of nodes added.



Query: Blood tests done for the user
Results found: 2

1. {'text': 'The patient completed a full blood panel on 2023-11-03. Results indicated elevated cholesterol and slightly increased fasting glucose levels compared to earlier tests.', 'metadata': {'type': 'blood_test', 'test': 'full_panel', 'date': 20231103, 'category': 'lab', 'history': 'trend'}, 'score': 0.19444124222184772}

2. {'text': 'Blood test results from 2024-03-01 revealed declining hemoglobin levels relative to tests done six months earlier, suggesting developing anemia.', 'metadata': {'type': 'blood_test', 'test': 'hemoglobin', 'date': 20240301, 'category': 'lab', 'history': 'trend'}, 'score': 0.1929544040629445}



# Testing from the .py script (the one used for production)

In [1]:
from vectordb import HybridVectorDB

resource module not available on Windows


In [2]:
# Instantiate the production class imported from vectordb.py.
# NOTE(review): it is called with `user_id`, which the HybridVectorDB defined
# earlier in this notebook does not accept — the two versions appear to have
# diverged; confirm which one is current.
db = HybridVectorDB(user_id="12345", db_path="./chroma_db")

  import pynvml  # type: ignore[import]


In [5]:
from llama_index.core import VectorStoreIndex, Document, StorageContext, Settings


In [5]:
# Synthetic patient-record fixtures for the production-module test. Each Document
# carries metadata (type, date, category, history, plus body_part or test) that
# the filtered query further down passes as filters.
documents = [
    Document(
        text="""The patient underwent an MRI brain scan on 2023-06-14 due to persistent headaches. Compared to the previous scan from 2022, the imaging showed an increase in the size of the benign tumor located in the frontal lobe, prompting a recommendation for closer follow-up and possible neurosurgical consultation.""",
        metadata={"type": "MRI", "body_part": "brain", "date": "2023-06-14", "category": "imaging", "history": "follow_up"}
    ),
    Document(
        text="""A CT scan of the chest was performed on 2024-01-22 after the patient reported shortness of breath. The scan revealed mild progression of previously noted lung nodules when compared to imaging from the prior year.""",
        metadata={"type": "CT", "body_part": "chest", "date": "2024-01-22", "category": "imaging", "history": "comparison"}
    ),
    Document(
        text="""The patient completed a full blood panel on 2023-11-03. Results indicated elevated cholesterol and slightly increased fasting glucose levels compared to earlier tests.""",
        metadata={"type": "blood_test", "test": "full_panel", "date": "2023-11-03", "category": "lab", "history": "trend"}
    ),
    Document(
        text="""An abdominal ultrasound conducted on 2022-09-18 showed fatty liver changes consistent with earlier findings. No significant progression was observed since the last ultrasound.""",
        metadata={"type": "ultrasound", "body_part": "abdomen", "date": "2022-09-18", "category": "imaging", "history": "stable"}
    ),
    Document(
        text="""The patient had an X-ray of the right knee on 2023-04-10 following chronic pain complaints. Imaging demonstrated increased joint space narrowing compared to previous studies.""",
        metadata={"type": "xray", "body_part": "knee", "date": "2023-04-10", "category": "imaging", "history": "progression"}
    ),
    Document(
        text="""An MRI of the lumbar spine was performed on 2024-02-05 due to lower back pain radiating to the leg. The scan showed a slightly larger disc herniation at L5-S1 compared to the MRI from 2022.""",
        metadata={"type": "MRI", "body_part": "lumbar_spine", "date": "2024-02-05", "category": "imaging", "history": "comparison"}
    ),
    Document(
        text="""Blood test results from 2024-03-01 revealed declining hemoglobin levels relative to tests done six months earlier, suggesting developing anemia.""",
        metadata={"type": "blood_test", "test": "hemoglobin", "date": "2024-03-01", "category": "lab", "history": "trend"}
    ),
    Document(
        text="""A cardiac echocardiogram completed on 2023-08-27 showed stable left ventricular function compared to the prior exam, with mild valve regurgitation.""",
        metadata={"type": "echocardiogram", "body_part": "heart", "date": "2023-08-27", "category": "imaging", "history": "stable"}
    ),
    Document(
        text="""The patient underwent a follow-up mammogram on 2024-01-15. Imaging showed a slight increase in the size of a previously identified mass.""",
        metadata={"type": "mammogram", "body_part": "breast", "date": "2024-01-15", "category": "imaging", "history": "follow_up"}
    ),
    Document(
        text="""A PET scan performed on 2023-12-09 for cancer monitoring demonstrated increased metabolic activity in a known lesion compared to earlier scans.""",
        metadata={"type": "PET", "body_part": "whole_body", "date": "2023-12-09", "category": "imaging", "history": "oncology_follow_up"}
    ),


    # Dated 2025-03-01, i.e. after the 2024-06-01 cutoff used by the
    # date_filter="before" query further down.
    Document(
        text="""This is a new blood test, Blood test results from 2025-03-01 revealed declining hemoglobin levels relative to tests done six months earlier, suggesting developing anemia.""",
        metadata={"type": "blood_test", "test": "hemoglobin", "date": "2025-03-01", "category": "lab", "history": "trend"}
    ),
]

# Insert each record through add_text (text + metadata); the return value is not checked.
for document in documents:
    db.add_text(document.text, document.metadata)

In [8]:
# A single free-text record holding health-indicator key/value pairs. It is
# stored with empty metadata, so it has no type or date for a filter to match.
documents = [
    Document(
        text="""    'HighBP': 0, 'HighChol': 0, 'BMI': 28.5, 'Smoker': 0,
    'Stroke': 0, 'Diabetes': 0, 'PhysActivity': 1, 'Fruits': 1,
    'Veggies': 1, 'HvyAlcoholConsump': 0, 'AnyHealthcare': 1,
    'GenHlth': 3, 'MentHlth': 2, 'PhysHlth': 1, 'DiffWalk': 0,
    'Sex': 1, 'Age': 8, 'Education': 5, 'Income': 7""",
        metadata={}
    )
]

# Insert through add_text; the return value is not checked.
for document in documents:
    db.add_text(document.text, document.metadata)

In [None]:
# Keyword-style query with the same type/date constraints as the blood-test query.
# NOTE(review): the recorded output for this cell lists records whose metadata
# does not match these filters (e.g. empty metadata, type "ultrasound") —
# verify how the production module applies `filters` and `date`.
query = "BMI"
results = db.retrieve(
    query,
    top_k=100,
    similarity_threshold=0.01,
    filters={"type": "blood_test"},
    date="2024-06-01",
    date_filter="before",
)

print(f"\nQuery: {query}")
print(f"Results found: {len(results)}\n")

# One numbered line per hit, each followed by a blank line.
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. {hit}\n")

As bm25s.BM25 requires k less than or equal to number of nodes added. Overriding the value of similarity_top_k to number of nodes added.



Query: BMI
Results found: 12

1. {'text': "'HighBP': 0, 'HighChol': 0, 'BMI': 28.5, 'Smoker': 0,\n    'Stroke': 0, 'Diabetes': 0, 'PhysActivity': 1, 'Fruits': 1,\n    'Veggies': 1, 'HvyAlcoholConsump': 0, 'AnyHealthcare': 1,\n    'GenHlth': 3, 'MentHlth': 2, 'PhysHlth': 1, 'DiffWalk': 0,\n    'Sex': 1, 'Age': 8, 'Education': 5, 'Income': 7", 'metadata': {}, 'score': 0.09871794871794871}

2. {'text': 'An abdominal ultrasound conducted on 2022-09-18 showed fatty liver changes consistent with earlier findings. No significant progression was observed since the last ultrasound.', 'metadata': {'type': 'ultrasound', 'body_part': 'abdomen', 'date': 20220918, 'category': 'imaging', 'history': 'stable'}, 'score': 0.09499807987711213}

3. {'text': 'The patient underwent an MRI brain scan on 2023-06-14 due to persistent headaches. Compared to the previous scan from 2022, the imaging showed an increase in the size of the benign tumor located in the frontal lobe, prompting a recommendation for clos