In [1]:
import os
import json
import chromadb
from ollama import chat
import ollama
import json
from typing import Dict, List, Union, Literal, Tuple, Optional
from pydantic import BaseModel, Field
import re
from langgraph.graph import StateGraph, START, END
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
def open_json_file_and_extract_text_and_source(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    try:
        text = data['content']['markdown']
        source = os.path.basename(file)
        return text, source
    except:
        raise ValueError("Failed")

def load_json_files(json_path: str) -> tuple[List[str], List[str]]:
    """
    Load JSON files from a path (file or directory) and extract markdown content.

    Args:
        json_path (str): Path to JSON file or directory.

    Returns:
        Tuple[List[str], List[str]]: List of markdown texts and their sources.
    """
    texts = []
    sources = []
    json_path = os.path.normpath(json_path)
    print(f"DEBUG: Loading JSON files from '{json_path}'")
    
    # load files
    if os.path.isfile(json_path):
        files = [json_path]
    else:
        files = [os.path.join(json_path, f) for f in os.listdir(json_path) if f.endswith('.json')]
    
    # Loop for files
    for file in files:
        try:
            text, source = open_json_file_and_extract_text_and_source(file)
            texts.append(text)
            sources.append(source)
        except Exception as e:
            print(f"Error processing '{file}' with utf-8: {str(e)}")
            break

    if not texts:
        print("Warning: No valid JSON data loaded")
    else:
        print(f"DEBUG: Loaded {len(texts)} texts from {len(sources)} sources")
    return texts, sources

texts, sources = load_json_files('./json_data')

DEBUG: Loading JSON files from 'json_data'
DEBUG: Loaded 2 texts from 2 sources


In [3]:
def chunk_text(text: str, chunk_size: int = 3000, chunk_overlap: int = 100) -> List[str]:
    """
    Split text into chunks using LangChain's RecursiveCharacterTextSplitter.

    Args:
        text (str): Input text to chunk.
        chunk_size (int): Target size of each chunk in characters.
        chunk_overlap (int): Overlap between chunks in characters.

    Returns:
        List[str]: List of text chunks.
    """
    try:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""],  # Prioritize paragraphs, sentences
            length_function=len
        )
        chunks = splitter.split_text(text)
        print(f"DEBUG: Split text into {len(chunks)} chunks")
        return chunks
    except Exception as e:
        print(f"Error chunking text: {str(e)}")
        return [text[:chunk_size]]  # Fallback to single chunk

def store_embeddings(texts: List[str], sources: List[str]) -> chromadb.Collection:
    """
    Store text embeddings in a persistent ChromaDB collection using mxbai-embed-large.

    Args:
        texts (List[str]): List of text content.
        sources (List[str]): List of source filenames.

    Returns:
        chromadb.Collection: ChromaDB collection with stored embeddings.
    """
    client = chromadb.PersistentClient(path=os.path.normpath("chroma_db"))
    collection = client.get_or_create_collection("medical_notes")
    print(f"DEBUG: Initializing collection 'medical_notes' at 'chroma_db'")

    doc_id = 0
    for text, source in zip(texts, sources):
        try:
            # Chunk the text
            chunks = chunk_text(text)
            for chunk_idx, chunk in enumerate(chunks):
                try:
                    response = ollama.embed(model="mxbai-embed-large", input=chunk)
                    embedding = response["embeddings"]
                    if isinstance(embedding, list) and isinstance(embedding[0], list):
                        embedding = embedding[0]
                    collection.add(
                        documents=[chunk],
                        metadatas=[{"source": source, "chunk_idx": chunk_idx}],
                        ids=[f"doc_{doc_id}"],
                        embeddings=[embedding]
                    )
                    print(f"DEBUG: Added embedding for source '{source}', chunk {chunk_idx} (ID: doc_{doc_id})")
                    doc_id += 1
                except Exception as e:
                    print(f"Error adding embedding for '{source}', chunk {chunk_idx}: {str(e)}")
        except Exception as e:
            print(f"Error processing text for '{source}': {str(e)}")

    print(f"DEBUG: Collection 'medical_notes' has {collection.count()} documents")
    return collection

def load_chroma_collection() -> chromadb.Collection:
    """
    Load the existing ChromaDB collection.

    Returns:
        chromadb.Collection: The 'medical_notes' collection.
    """
    client = chromadb.PersistentClient(path=os.path.normpath("chroma_db"))
    collection = client.get_collection("medical_notes")
    print(f"DEBUG: Loaded collection 'medical_notes' with {collection.count()} documents")
    return collection

In [4]:
collection = store_embeddings(texts, sources)


DEBUG: Initializing collection 'medical_notes' at 'chroma_db'
DEBUG: Split text into 36 chunks
DEBUG: Added embedding for source 'hypertension_in_adults_initial_drug_therapy.json', chunk 0 (ID: doc_0)
DEBUG: Added embedding for source 'hypertension_in_adults_initial_drug_therapy.json', chunk 1 (ID: doc_1)
DEBUG: Added embedding for source 'hypertension_in_adults_initial_drug_therapy.json', chunk 2 (ID: doc_2)
DEBUG: Added embedding for source 'hypertension_in_adults_initial_drug_therapy.json', chunk 3 (ID: doc_3)
DEBUG: Added embedding for source 'hypertension_in_adults_initial_drug_therapy.json', chunk 4 (ID: doc_4)
DEBUG: Added embedding for source 'hypertension_in_adults_initial_drug_therapy.json', chunk 5 (ID: doc_5)
DEBUG: Added embedding for source 'hypertension_in_adults_initial_drug_therapy.json', chunk 6 (ID: doc_6)
DEBUG: Added embedding for source 'hypertension_in_adults_initial_drug_therapy.json', chunk 7 (ID: doc_7)
DEBUG: Added embedding for source 'hypertension_in_adults

In [5]:
collection.count()

200

In [6]:
from pydantic import BaseModel, Field

class Section(BaseModel):
    title: str = Field(description="Name of the section, e.g. Pathophysiology")
    structure: str = Field(description="Structure of the section, either simply nested list, or detailed dive")
    
class StructuredOrchestratorOutput(BaseModel):
    sections: List[Section] = Field(description="List of Section JSON format, pertaining the section name and structure")

StructuredOrchestratorOutput.model_json_schema()

{'$defs': {'Section': {'properties': {'title': {'description': 'Name of the section, e.g. Pathophysiology',
     'title': 'Title',
     'type': 'string'},
    'structure': {'description': 'Structure of the section, either simply nested list, or detailed dive',
     'title': 'Structure',
     'type': 'string'}},
   'required': ['title', 'structure'],
   'title': 'Section',
   'type': 'object'}},
 'properties': {'sections': {'description': 'List of Section JSON format, pertaining the section name and structure',
   'items': {'$ref': '#/$defs/Section'},
   'title': 'Sections',
   'type': 'array'}},
 'required': ['sections'],
 'title': 'StructuredOrchestratorOutput',
 'type': 'object'}

In [7]:
def structured_llm(model: str, schema: dict):
    """
    Create a structured LLM function for JSON output.
    
    Args:
        model (str): LLM model name (e.g., 'llama3.2:3b').
        schema (dict): JSON schema for output validation.

    Returns:
        callable: Function that generates structured output.
    """
    def generate_structured(prompt: str) -> dict:
        try:
            response = ollama.chat(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                format=schema,
                options={"temperature": 0.5}
            )
            content = response["message"]["content"]
            if isinstance(content, str):
                content = json.loads(content)
            return content
        except Exception as e:
            print(f"Error in structured_llm for model {model}: {str(e)}")
            return {"sections": []}
    return generate_structured

In [8]:
ORCHESTRATOR_PROMPT = """You are an Orchestrator for medical note generation in a Zettelkasten format.
Task: Identify sections for a medical note on the topic '{topic}' based on type '{note_type}' (e.g., 'condition' or 'complaint'). Use the provided data to inform section choices:
{data}

Instructions:
- For 'condition', prioritize these sections if relevant to the data: Definition, Epidemiology, Pathophysiology, Clinical Features, Signs, Investigations (Ix), Diagnosis (Dx), Management (Mx), Complications, and topic-specific headers (e.g., 'Risk Factors for {topic}').
- For 'complaint', prioritize these sections if relevant to the data: Definition, Epidemiology, Differential Diagnosis (DDx), Salient Points of History (Hx), Physical Examination (P/E), Investigations (Ix), Management (Mx).
- For Investigations (Ix), structure as a nested list by test categories (e.g., routine bloods, microbiology, endoscopy, imaging).
- For Management (Mx), structure as a nested list by principles, goals, modalities (e.g., dietary, pharmacological), and specific treatments (e.g., medications, mechanism of action, dosing, indications, side effects).
- Only include sections supported by the data or highly relevant to the topic and note type. Add topic-specific sections (e.g., 'Initial Drug Therapy for {topic}') if the data suggests them.
- For each section, determine complexity:
  - 'simple': Sections with concise, flat lists (e.g., Definition, Epidemiology), typically requiring minimal subtopics.
  - 'complex': Sections with detailed, nested lists (e.g., Investigations, Management), typically involving subcategories or hierarchies.
  - Base complexity primarily on structure (e.g., 'nested', 'hierarchy', 'by category' imply complex; 'simple list' implies simple).
  - Use title semantics as a fallback (e.g., titles like 'Definition' or 'Overview' are simple; 'Treatment' or 'Symptoms' are complex).
- Output a JSON object with a 'sections' key containing an array of section objects, each with 'title', 'structure', and 'complexity' fields.
- Ensure the output is valid JSON with the exact structure: {{"sections": [{{"title": "string", "structure": "string", "complexity": "string"}}, ...]}}

{orchestrator_output_schema}

Example output for 'condition' with topic 'Hypertension':
{{
  "sections": [
    {{"title": "Definition", "structure": "Simple list", "complexity": "simple"}},
    {{"title": "Epidemiology", "structure": "Simple list", "complexity": "simple"}},
    {{"title": "Investigations", "structure": "Nested list by test categories: routine bloods, microbiology, endoscopy, imaging", "complexity": "complex"}},
    {{"title": "Management", "structure": "Nested list by principles, goals, modalities, specific treatments", "complexity": "complex"}},
    {{"title": "Initial Drug Therapy", "structure": "Nested list by treatment type: pharmacological, non-pharmacological", "complexity": "complex"}}
  ]
}}
"""

In [9]:
def rag_search_for_orchestrator(topic, collection):
    query = f"Give me an overview of {topic}, important information that every clinical doctor must know."
    try:
        response = ollama.embed(model="mxbai-embed-large", input=query)
        results = collection.query(
            query_embeddings=[response["embeddings"]],
            include=["documents", "metadatas"],
            n_results=15
        )
        data = "".join(results["documents"][0]) if results["documents"] else ""
        print(f"DEBUG: Retrieved {len(results['documents'][0])} documents for query '{query}'")
    except Exception as e:
        print(f"Error in RAG query: {str(e)}")
        data = ""
    return data



class NoteItem(BaseModel):
    text: str = Field(description="Org-mode formatted text, e.g., '- *Dyspnoea* [source]'")
    source: str = Field(default="", description="JSON file name, e.g., 'file.json'")
    quote: str = Field(default="", description="Direct citation from context, if applicable")
    subitems: List[Union[str, "NoteItem"]] = Field(default_factory=list, description="Second-level items or strings")

class NoteSection(BaseModel):
    title: str = Field(description="Section title, e.g., 'Clinical Features'")
    content: List[Union[str, NoteItem]] = Field(default_factory=list, description="List of items or strings (e.g., image links)")
    source: str = Field(default="Unknown", description="Combined sources")

class WorkflowState(BaseModel):
    topic: str
    note_type:str
    output_format: Literal['org', 'md'] = 'org'
    sections: List[NoteSection] = []
    retrieved_docs: Dict[str, List[Dict[str, str]]] = {}
    section_structures: Dict[str, str] = {}
        
class SectionMeta(BaseModel):
    title: str
    structure: str
    complexity: Literal['simple', 'complex']
class OrchestratorOutput(BaseModel):
    sections: List[SectionMeta]
def orchestrator(state: WorkflowState, collection: chromadb.Collection) -> WorkflowState:
    """
    Generate a list of sections for the medical note using RAG to provide context for section selection.

    Args:
        state (WorkflowState): Current workflow state.
        config (dict): Configuration with ChromaDB collection in config["configurable"]["collection"].

    Returns:
        WorkflowState: Updated state with sections and section structures.
    """
    print(f"DEBUG: Entering orchestrator for topic '{state.topic}', note_type '{state.note_type}'")
    
    if not isinstance(collection, chromadb.Collection):
        print(f"Error: Invalid collection type: {type(collection)}")
        raise TypeError("Collection must be a chromadb.Collection")

    # RAG: Retrieve broad context for section generation
    
    data = rag_search_for_orchestrator(state.topic, collection)

    # Generate sections with LLM
    structured_llm_gen = structured_llm(
        model='llama3.2',
        schema=OrchestratorOutput.model_json_schema()
    )
    prompt = ORCHESTRATOR_PROMPT.format(topic=state.topic, note_type=state.note_type, data=data, orchestrator_output_schema=OrchestratorOutput.model_json_schema())
    try:
        result = structured_llm_gen(prompt)
        if not isinstance(result, dict) or "sections" not in result:
            print(f"Error: Invalid orchestrator output: {result}")
            result = {"sections": [
                {"title": "Definition", "structure": "Simple list"},
                {"title": "Epidemiology", "structure": "Simple list"},
                {"title": "Initial Drug Therapy", "structure": "Detailed hierarchy"}
            ]}
    except Exception as e:
        print(f"Error in orchestrator LLM call: {str(e)}")
        result = {"sections": [
            {"title": "Definition", "structure": "Simple list"},
            {"title": "Epidemiology", "structure": "Simple list"},
            {"title": "Initial Drug Therapy", "structure": "Detailed hierarchy"}
        ]}

    # Update state
    state.sections = [NoteSection(title=section["title"], content=[], source="Unknown") for section in result["sections"]]
    state.section_structures = {section["title"]: section["structure"] for section in result["sections"]}
    print(f"DEBUG: Orchestrator completed with {len(state.sections)} sections")
    return state

In [164]:
query = "Overview of Hypertension"
response = ollama.embed(model="mxbai-embed-large", input=query)
results = collection.query(
    query_embeddings=[response["embeddings"][0]],
    include=["documents", "metadatas"],
    n_results=20
    )

data = "".join(results["documents"][0]) if results["documents"] else ""


structured_llm_gen = structured_llm(
    model='llama3.2',
    schema=OrchestratorOutput.model_json_schema()
    )
prompt = ORCHESTRATOR_PROMPT.format(topic="Hypertension", note_type="condition", data=data, orchestrator_output_schema=OrchestratorOutput.model_json_schema())
orchestrator_list = structured_llm_gen(prompt)

print(orchestrator_list)


{'sections': [{'title': 'Definition', 'structure': 'Simple list', 'complexity': 'simple'}, {'title': 'Epidemiology', 'structure': 'Simple list', 'complexity': 'simple'}, {'title': 'Risk Factors for Hypertension', 'structure': 'Nested list by risk factor category: lifestyle, genetic, environmental', 'complexity': 'complex'}, {'title': 'Investigations', 'structure': 'Nested list by test categories: routine bloods, microbiology, endoscopy, imaging', 'complexity': 'complex'}, {'title': 'Diagnosis of Hypertension', 'structure': 'Algorithm with diagnostic criteria', 'complexity': 'complex'}, {'title': 'Management of Hypertension', 'structure': 'Nested list by principles, goals, modalities, specific treatments', 'complexity': 'complex'}, {'title': 'Initial Drug Therapy for Hypertension', 'structure': 'Nested list by treatment type: pharmacological, non-pharmacological', 'complexity': 'complex'}]}


In [None]:
def query_rag(query: str, collection: chromadb.Collection, n_results: int = 10):
    response = ollama.embed(model="mxbai-embed-large", input=query)
    results = collection.query(
        query_embeddings=[response["embeddings"]][0],
        include=["documents", "metadatas"],
        n_results=n_results
    )
    return results


def worker_node(state: WorkflowState) -> WorkflowState:
    
    pass

class NoteStucture(BaseModel):
    format: Literal['flat', 'nested'] = Field(description="Structure of note based on complexity, where flat are list items with no nesting, and nested are those with nesting and with sub-topics.")
    subtopics: str = Field(description="String regarding the subtopics of the notes")

class SectionQueries(BaseModel):
    queries: List[str] = Field(description="List of query strings subsequently passed into RAG model to generate appropriate context.")
    org_structure: NoteStucture = Field(description="The structure that should be generated by the LLM after taking it into context")


def generate_context_for_initial_summarisation(query: str, collection: chromadb.Collection, complexity: Literal['simple', 'complex']) -> Tuple[str, dict]:
    """
    Generates a string context with source annotations and returns raw query_rag results.

    Args:
        query (str): RAG query, e.g., 'Definition of Hypertension'.
        collection (chromadb.Collection): ChromaDB collection.
        complexity (Literal['simple', 'complex']): Section complexity.

    Returns:
        Tuple[str, dict]: Context string with source annotations and raw query_rag results.
    """
    n_results = 10 if complexity == 'complex' else 5
    raw_context = query_rag(query=query, collection=collection, n_results=n_results)
    
    context_parts = []
    if raw_context.get('documents') and raw_context.get('metadatas'):
        for doc, meta in zip(raw_context['documents'][0], raw_context['metadatas'][0]):
            source = meta.get('source', 'Unknown')
            chunk_idx = meta.get('chunk_idx', 'Unknown')
            context_parts.append(f"{doc}\n[Source: {source}, Chunk: {chunk_idx}]")
    
    context_str = "\n\n".join(context_parts) if context_parts else ""
    return context_str, raw_context

def initial_summarisation(context:str, section_meta: SectionMeta, topic: str) -> NoteSection:
    prompt = FIRST_PASS_PROMPT.format(
        topic=topic,
        section_title=section_meta['title'],
        section_complexity=section_meta['complexity'],
        context=context,
        schema=NoteSection.model_json_schema()
    )
    llm = structured_llm(
        model="llama3.2",
        schema=NoteSection.model_json_schema()
    )
    result = llm(prompt)
    
    return result

def review_summary(summary, context):
    pass

def make_refinement_queries(feedback) -> List[str]:
    pass

def refine_summary(summary, context):
    pass



In [13]:
class ListOfQueries(BaseModel):
    queries: List[str] = Field(description="List of question-based RAG queries")

PARSE_STRUCTURE_PROMPT = """
You are an expert note-taker responsible for identifying questions to research the medical topic '{topic}' for the section '{section_title}'. 
You are tasked to generate queries with an eye to the section’s structure: '{section_structure}'.
Your goal is to craft concise, specific questions that will guide the retrieval of relevant information for the section’s focus.
- If the structure is simple (e.g., 'Simple list'), generate a single general question about the section.
- If the structure is complex (e.g., 'Nested list by risk factor category: age, genetics, lifestyle, environment'), identify the subtopics or categories (e.g., 'age', 'genetics') and generate a question for each, plus a general question for the section.
- Ensure questions are natural and focused, like those a diligent researcher would ask (e.g., 'What is the role of genetics in hypertension risk factors?').
- Output a JSON object with a 'queries' field containing an array of question strings.

Output JSON schema:

{schema}

Example for simple section 'Definition' with structure 'Simple list':
{{
  "queries": ["What is the definition of hypertension?"]
}}

Example for complex section 'Risk Factors for Primary (Essential) Hypertension' with structure 'Nested list by risk factor category: age, genetics, lifestyle, environment':
{{
  "queries": [
    "What are the risk factors for primary hypertension?",
    "What is the role of age in hypertension risk factors?",
    "What is the role of genetics in hypertension risk factors?",
    "What is the role of lifestyle in hypertension risk factors?",
    "What is the role of environment in hypertension risk factors?"
  ]
}}
"""


def parse_section_structure(topic: str, section_meta: dict) -> Tuple[List[str], str]:
    """
    Generates question-based RAG queries using an LLM to parse section structure.

    Args:
        topic (str): Medical topic, e.g., 'Hypertension'.
        section_meta (dict): Section configuration with title, structure, and complexity.

    Returns:
        Tuple[List[str], str]: List of question-based RAG queries and complexity ('simple' or 'complex').
    """
    complexity = section_meta['complexity']
    
    # Generate prompt
    prompt = PARSE_STRUCTURE_PROMPT.format(
        topic=topic,
        section_title=section_meta['title'],
        section_structure=section_meta['structure'],
        schema=ListOfQueries.model_json_schema()
    )
    
    # Call LLM
    structured_llm_gen = structured_llm("llama3.2", ListOfQueries.model_json_schema())
    result = structured_llm_gen(prompt)
    
    # Fallback if LLM fails
    if not result or 'queries' not in result:
        queries = [f"What is the {section_meta['title'].lower()} of {topic}?"]
    else:
        queries = result['queries']
    
    return queries, complexity

In [14]:
section_meta = result['sections'][3]
query, complexity = parse_section_structure('Hypertension', section_meta)

In [15]:
section_meta

{'title': 'Risk Factors for Hypertension',
 'structure': 'Simple list',
 'complexity': 'simple'}

In [16]:
query

['What are the general risk factors for hypertension?',
 'What is the impact of age on hypertension risk?',
 'What is the role of genetics in hypertension risk?',
 'How do lifestyle factors contribute to hypertension risk?',
 'What environmental factors increase the risk of hypertension?']

In [29]:
class DocMetaPair(BaseModel):
    doc: str = Field(description="Document text")
    meta: Dict = Field(description="Metadata with source and chunk_idx")

class RelevanceResult(BaseModel):
    is_relevant: bool = Field(description="Whether the chunk is relevant to the section or its subtopics")
    reason: str = Field(description="Brief explanation of relevance judgment")
    
RELEVANCE_AGENT_SINGLE_PROMPT = """
You are an expert note-taker evaluating a text chunk for '{topic}', section '{section_title}' with structure '{section_structure}'.
Based on the query '{query}', decide if the chunk is relevant to '{section_title}' of {topic}.
- Set is_relevant to True only if it directly addresses the query or section focus.
- Provide a brief reason for your judgment.
Output JSON:
- is_relevant: True if relevant, False otherwise.
- reason: Brief explanation.

Example for section 'Clinical Features' and query 'What are the clinical features of hypertension?':
{{
  "is_relevant": true,
  "reason": "Mentions symptoms like headaches."
}}

Chunk:
Text: {chunk}
"""



In [30]:
def query_rag_for_list_of_queries(queries: List[str], collection: chromadb.Collection =collection, n_results: int = 20):
    all_docs_and_metas: List[Dict] = []
    for query in queries:
        raw_context = query_rag(query=query, collection=collection, n_results=20)
        docs = raw_context['documents'][0]
        metas = raw_context['metadatas'][0]
        for doc, meta in zip(docs, metas):
            all_docs_and_metas.append({"doc": doc, "meta": meta, "query": query})
    unique_chunks = []
    seen = set()
    for pair in all_docs_and_metas:
        chunk_id = f"{pair['meta'].get('source', 'Unknown')}_{pair['meta'].get('chunk_idx', 'Unknown')}"
        if chunk_id not in seen:
            seen.add(chunk_id)
            unique_chunks.append(pair)
    return unique_chunks

unique_chunks = query_rag_for_list_of_queries(query)

In [32]:
topic='Hypertension'
section_title=result['sections'][3]['title']
section_structure=result['sections'][3]['structure']
schema=RelevanceResult.model_json_schema()

In [None]:
def extract_relevant_chunks_by_relevance_agent(unique_chunks: List[Dict], topic, section_title, section_structure, max_chunk=5):
    relevant_chunks = []
    for chunk in unique_chunks:
        llm = structured_llm(
            model="llama3.2",
            schema=RelevanceResult.model_json_schema()
        )
        if len(relevant_chunks) >= max_chunk:
            break
        else:
            prompt = RELEVANCE_AGENT_SINGLE_PROMPT.format(
                topic=topic,
                section_title=section_title,
                schema=schema,
                section_structure=section_structure,
                query=chunk['query'],
                chunk=chunk['doc'],
            )

            relavence_results = llm(prompt)
            if relavence_results['is_relevant']:
                relevant_chunks.append(chunk)

    return relevant_chunks

In [36]:
unique_chunks

[{'doc': '●Obesity – Obesity and weight gain are major risk factors for hypertension and are also determinants of the rise in blood pressure that is commonly observed with aging [[16,17](/contents/overview-of-hypertension-in-adults/abstract/16,17)]. (See  ["Overweight, obesity, and weight reduction in hypertension"](/contents/overweight-obesity-and-weight-reduction-in-hypertension?search=hypertension&topicRef=3852&source=see_link).)\n\n●Family history – Hypertension is approximately twice as common in subjects who have one or two hypertensive parents, and multiple epidemiologic studies suggest that genetic factors account for approximately 30 percent of the variation in blood pressure in various populations [[18,19](/contents/overview-of-hypertension-in-adults/abstract/18,19)]. (See  ["Genetic factors in the pathogenesis of hypertension"](/contents/genetic-factors-in-the-pathogenesis-of-hypertension?search=hypertension&topicRef=3852&source=see_link).)\n\n●Race – Hypertension tends to b

In [40]:
relevant_chunks = extract_relevant_chunks_by_relevance_agent(unique_chunks=unique_chunks, topic=topic, section_title=section_title, section_structure=section_structure)

In [42]:
for chunk in relevant_chunks:
    print(chunk['doc'])

●Obesity – Obesity and weight gain are major risk factors for hypertension and are also determinants of the rise in blood pressure that is commonly observed with aging [[16,17](/contents/overview-of-hypertension-in-adults/abstract/16,17)]. (See  ["Overweight, obesity, and weight reduction in hypertension"](/contents/overweight-obesity-and-weight-reduction-in-hypertension?search=hypertension&topicRef=3852&source=see_link).)

●Family history – Hypertension is approximately twice as common in subjects who have one or two hypertensive parents, and multiple epidemiologic studies suggest that genetic factors account for approximately 30 percent of the variation in blood pressure in various populations [[18,19](/contents/overview-of-hypertension-in-adults/abstract/18,19)]. (See  ["Genetic factors in the pathogenesis of hypertension"](/contents/genetic-factors-in-the-pathogenesis-of-hypertension?search=hypertension&topicRef=3852&source=see_link).)

●Race – Hypertension tends to be more common,

In [47]:
SUMMARY_PROMPT = """
You are a medical note-taker creating Org-mode notes for '{topic}', section '{section_title}' with structure '{section_structure}'. You are given some relevant context, with the sources in square brackets ([source name]).
Summarize the context into a concise bullet-point list:
- Follow '{section_structure}' (e.g., 'Simple list' or 'Nested list by category: symptoms, signs').
- Use Org-mode: *bold* or _underscore_ for key terms (e.g., *Hypertension*, _Dyspnoea_).
- Append colon (:) to items with subitems (e.g., '- *Symptoms*:').
- Support up to two nesting levels.
- Include [source] for each point (e.g., [file.json]).
- Use 'quote' for direct citations.
- Ground all points in context; no hallucination.
- If no content, use '- No relevant content [Unknown]'.

Context:
{context}

Output JSON (NoteSection schema):
- title: '{section_title}'.
- content: Array of NoteItem or strings.
- source: Comma-separated sources or 'Unknown'.

Example for 'Clinical Features' (Simple list):
{{
  "title": "Clinical Features",
  "content": [
    {{"text": "- *Hypertension* is _asymptomatic_ [file.json]", "source": "file.json", "quote": "Often without symptoms.", "subitems": []}}
  ],
  "source": "file.json"
}}
"""

def initial_summarisation(
    relevant_chunks: List[Dict],
    section_meta: Dict,
    topic: str
) -> NoteSection:
    """
    Generates an initial Org-mode formatted summary from relevant chunks.

    Args:
        relevant_chunks: List of dictionaries with 'doc', 'meta', and 'query'.
        section_meta: Section configuration with title, structure, and complexity.
        topic: Medical topic, e.g., 'Hypertension'.

    Returns:
        NoteSection: Initial summary in Org-mode format.
    """
    # Build context string
    context_parts = []
    for chunk in relevant_chunks:
        doc = chunk['doc']
        source = chunk['meta'].get('source', 'Unknown')
        chunk_idx = chunk['meta'].get('chunk_idx', 'Unknown')
        context_parts.append(f"{doc}\n[Source: {source}, Chunk: {chunk_idx}]")
    context_str = "\n\n".join(context_parts) if context_parts else ""

    # Validate section_meta
    section_title = section_meta.get('title', 'Unknown')
    section_structure = section_meta.get('structure', 'Simple list')
    section_complexity = section_meta.get('complexity', 'simple')

    # Prepare prompt
    prompt = SUMMARY_PROMPT.format(
        topic=topic,
        section_title=section_title,
        section_structure=section_structure,
        section_complexity=section_complexity,
        context=context_str
    )

    # Call LLM
    llm = structured_llm("llama3.2", NoteSection.model_json_schema())
    result = llm(prompt)
    return result

In [52]:
note_section = initial_summarisation(relevant_chunks=relevant_chunks, section_meta=section_meta, topic=topic)

In [153]:
class ValidationResult(BaseModel):
    is_valid: bool = Field(description="True if no hallucination, correct formatting, and no significant gaps")
    feedback: List[str] = Field(default_factory=list, description="List of issues (e.g., 'Point not grounded', 'Missing subtopic: signs')")
    follow_up_questions: List[str] = Field(default_factory=list, description="Queries to address knowledge gaps")

VALIDATION_PROMPT = """
You are a medical note-taker reviewing a Zettelkasten-style Org-mode summary for '{topic}', section '{section_title}' with structure '{section_structure}' and complexity '{section_complexity}'.
Your task is to:
1. Check for hallucination: Ensure each NoteItem.text and NoteItem.quote is grounded in the context (i.e., appears in a chunk).
2. Verify structure: Confirm the summary follows '{section_structure}' (e.g., includes all subtopics like 'symptoms, signs' for nested lists).
3. Check formatting: Ensure Org-mode syntax with '- ' for bullets, *bold* or _underscore_ for key terms, [source] for each point, colon (:) for items with subitems, and two-space indent for subitems.
4. Clarify ambiguity: raise points which may appear ambiguous. This should influence whether additional follow up questions are required.
4. Identify gaps: List missing subtopics from '{section_structure}' (e.g., 'signs' not covered).
5. Generate follow-up questions: Create concise queries to address gaps.

Context:
{context}

Summary:
{summary}

Output JSON (ValidationResult schema):
- is_valid: Return False if there are any issues with the summary. Only return True if it matches all the criteria.
- feedback: List of issues (e.g., "Point not grounded", "Missing subtopic: signs", "Incorrect formatting", "Point ambiguous").
- follow_up_questions: List of queries to address gaps.

Example for 'Clinical Features' with structure 'Nested list by category: symptoms, signs':
{{
  "is_valid": false,
  "feedback": ["Point '_Retinopathy_ on fundoscopy' not grounded", "Missing subtopic: signs", "Correct Org-mode formatting"],
  "follow_up_questions": ["What are the physical signs of hypertension?"]
}}

Example for valid 'Clinical Features' with structure 'Simple list':
{{
  "is_valid": true,
  "feedback": ["All points grounded", "Correct formatting", "All subtopics covered"],
  "follow_up_questions": []
}}
"""

def validate_summary(
    note_section: NoteSection,
    relevant_chunks: List[Dict],
    section_meta: Dict,
    topic: str
) -> ValidationResult:
    """
    Validates a NoteSection against relevant chunks, providing feedback and follow-up questions.

    Args:
        note_section: NoteSection object from initial_summarisation.
        relevant_chunks: List of dictionaries with 'doc', 'meta', and 'query'.
        section_meta: Section configuration with title, structure, and complexity.
        topic: Medical topic, e.g., 'Hypertension'.

    Returns:
        ValidationResult: Feedback and follow-up questions.
    """
    # Build context string
    context_parts = []
    for chunk in relevant_chunks:
        doc = chunk['doc']
        source = chunk['meta'].get('source', 'Unknown')
        chunk_idx = chunk['meta'].get('chunk_idx', 'Unknown')
        context_parts.append(f"{doc}\n[Source: {source}, Chunk: {chunk_idx}]")
    context_str = "\n\n".join(context_parts) if context_parts else ""

    # Convert note_section to JSON string
    summary_str = json.dumps(note_section, indent=2)

    # Validate section_meta
    section_title = section_meta['title']
    section_structure = section_meta['structure']
    section_complexity = section_meta['complexity']

    # Prepare prompt
    prompt = VALIDATION_PROMPT.format(
        topic=topic,
        section_title=section_title,
        section_structure=section_structure,
        section_complexity=section_complexity,
        context=context_str,
        summary=summary_str
    )

    # Call LLM
    llm = structured_llm("llama3.2", ValidationResult.model_json_schema())
    result = llm(prompt)

    return result

In [88]:
validation_results = validate_summary(
    note_section, relevant_chunks, section_meta, 'Hypertension'
)

In [89]:
validation_results

{'is_valid': False,
 'feedback': ["Point '*Family history*' not grounded in the provided context",
  "Missing subtopic: 'signs' and 'symptoms'",
  "Incorrect Org-mode formatting for '*Obesity*': should be '- *Obesity*: ...'",
  "Point '*Reduced nephron number*' ambiguous without further clarification",
  "Missing subtopic: 'diagnosis' and 'differential diagnosis'",
  "Point '*Excessive alcohol consumption*' not grounded in the provided context"],
 'follow_up_questions': ['What are the physical signs of hypertension?',
  'How does reduced nephron number predispose to hypertension?',
  'Can you provide more information on family history as a risk factor for hypertension?']}

In [49]:
def visualize_note_section(note_section: Dict, output_file: Optional[str] = None) -> None:
    """
    Prints the text content of a NoteSection (in dictionary form) line by line, preserving nested list structure.
    Optionally writes to a file in Org-mode format.

    Args:
        note_section (Dict): Dictionary with 'title' and 'content' (list of NoteItem dicts or strings).
        output_file (Optional[str]): Path to save Org-mode output. If None, prints to console.
    """
    # Validate NoteSection
    try:
        validated_section = NoteSection(**note_section)
    except:
        print("* Invalid NoteSection *")
        print("  - Error: Invalid note_section format")
        if output_file:
            with open(output_file, 'w') as f:
                f.write("* Invalid NoteSection *\n  - Error: Invalid note_section format\n")
        return

    # Initialize output
    output_lines = [f"* {validated_section.title} *"]

    # Recursively print content
    def print_item(item: Union[NoteItem, str], level: int) -> None:
        indent = "  " * level
        if isinstance(item, NoteItem):
            output_lines.append(indent + item.text)
            for subitem in item.subitems:
                print_item(subitem, level + 1)
        elif isinstance(item, str):
            output_lines.append(indent + item)
        else:
            output_lines.append(f"{indent}- Invalid item [Unknown]")

    # Process content
    if not validated_section.content:
        output_lines.append("  - No content available [Unknown]")
    else:
        for item in validated_section.content:
            print_item(item, 1)

    # Output to console and/or file
    for line in output_lines:
        print(line)
    if output_file:
        with open(output_file, 'w') as f:
            f.write("\n".join(output_lines) + "\n")

In [92]:
section_structure

'Simple list'

In [95]:
def get_relevant_chunks(queries, topic, section_title, section_structure):
    unique_chunks = query_rag_for_list_of_queries(queries=queries, collection=collection)
    relevant_chunks = extract_relevant_chunks_by_relevance_agent(unique_chunks=unique_chunks, topic=topic, section_title=section_title, section_structure=section_structure)
    
    return relevant_chunks

relevant_chunks = get_relevant_chunks(validation_results['follow_up_questions'], topic, section_title, section_structure)


In [96]:
relevant_chunks

[{'doc': '●Reduced nephron number – Reduced adult nephron mass may predispose to hypertension, which may be related to genetic factors, intrauterine developmental disturbance (eg, hypoxia, drugs, nutritional deficiency), premature birth, and postnatal environment (eg, malnutrition, infections). (See  ["Possible role of low birth weight in the pathogenesis of primary (essential) hypertension"](/contents/possible-role-of-low-birth-weight-in-the-pathogenesis-of-primary-essential-hypertension?search=hypertension&topicRef=3852&source=see_link).)\n\n●High-sodium diet – Excess dietary sodium intake and accompanying decreased potassium intake increase the risk of hypertension. (See  ["Salt intake and hypertension"](/contents/salt-intake-and-hypertension?search=hypertension&topicRef=3852&source=see_link) and  ["Potassium and hypertension"](/contents/potassium-and-hypertension?search=hypertension&topicRef=3852&source=see_link).)\n\n●Excessive alcohol consumption – Excess alcohol intake is associ

In [107]:
REFINEMENT_PROMPT = """
You are a medical note-taker refining a Zettelkasten-style Org-mode summary for '{topic}', section '{section_title}' with structure '{section_structure}' and complexity '{section_complexity}'.
Your task is to revise the summary based on feedback and new context:
1. Remove hallucinated points listed in feedback (e.g., ungrounded points).
2. Add new points from new context to cover missing subtopics (e.g., 'signs').
3. Clarify ambiguous or incomplete points noted in feedback.
4. Fix formatting issues (e.g., missing colon, incorrect indentation).
5. Merge redundant points into concise entries.
6. No hallucinations, unless explicitly stated that a point is not grounded within feedback, assume that the point is grounded in evidence.
7. Highlight key terms with *bold* or _underscore_ where missing.
- Follow '{section_structure}' (e.g., 'Nested list by category: symptoms, signs').
- Use Org-mode: '- ' for bullets, *bold* or _underscore_, [source], colon (:) for subitems, two-space indent.
- Ground all points in current or new context; no hallucination.
- Preserve valid existing points unless flagged for revision.

Feedback:
{feedback}

New Context:
{new_context}

Current Summary:
{summary}

Output JSON (NoteSection schema):
- title: '{section_title}'.
- content: Array of NoteItem or strings.
- source: Comma-separated sources or 'Unknown'.

{schema}
"""


def refine_summary(
    note_section: NoteSection,
    validation_result: ValidationResult,
    new_relevant_chunks: List[Dict],
    section_meta: Dict,
    topic: str
) -> NoteSection:
    """
        Refines a NoteSection based on validation feedback and new chunks.
        
        Args:
        note_section: Current NoteSection object.
        validation_result: ValidationResult with feedback and follow-up questions.
        current_relevant_chunks: Chunks used in the current summary.
        new_relevant_chunks: New chunks from follow-up queries.
        section_meta: Section configuration with title, structure, and complexity.
        topic: Medical topic, e.g., 'Hypertension'.
        
        Returns:
        NoteSection: Refined summary.
        """

    new_context_parts = []
    for chunk in new_relevant_chunks:
        doc = chunk['doc']
        source = chunk['meta'].get('source', 'Unknown')
        chunk_idx = chunk['meta'].get('chunk_idx', 'Unknown')
        new_context_parts.append(f"{doc}\n[Source: {source}, Chunk: {chunk_idx}]")
    new_context = "\n\n".join(new_context_parts) if new_context_parts else ""

    # Convert inputs to JSON
    summary_str = json.dumps(note_section, indent=2)
    feedback_str = "\n".join([f"- {item}" for item in validation_result['feedback']])

    # Validate section_meta
    section_title = section_meta['title']
    section_structure = section_meta['structure']
    section_complexity = section_meta['complexity']

    # Prepare prompt
    prompt = REFINEMENT_PROMPT.format(
        topic=topic,
        section_title=section_title,
        section_structure=section_structure,
        section_complexity=section_complexity,
        feedback=feedback_str,
        new_context=new_context,
        schema=NoteSection.model_json_schema(),
        summary=summary_str
    )

    # Call LLM
    llm = structured_llm("llama3.2", NoteSection.model_json_schema())
    result = llm(prompt)
    return result

In [108]:
refined_summary = refine_summary(note_section=note_section, validation_result=validation_results, new_relevant_chunks=relevant_chunks, section_meta=section_meta, topic=topic)

In [158]:
def worker_node(
    section_meta: Dict,
    topic: str,
    collection: chromadb.Collection,
    max_iterations: int = 3
) -> Dict:
    """
    Executes the worker node cycle to generate and refine a note section.

    Args:
        section_meta: Section configuration with title, structure, complexity.
        topic: Medical topic, e.g., 'Hypertension'.
        collection: ChromaDB collection.
        max_iterations: Maximum number of refinement cycles.

    Returns:
        Dict: Final refined note section.
    """
    # Step 1: Parse section structure
    initial_queries, _ = parse_section_structure(topic=topic, section_meta=section_meta)
    section_title = section_meta['title']
    section_structure = section_meta['structure']

    # Step 2: Initial retrieval
    initial_chunks = get_relevant_chunks(
        topic=topic,
        section_title=section_title,
        section_structure=section_structure,
        queries=initial_queries,
    )

    # Handle no relevant context
    if not initial_chunks:
        print("No relevant chunks found; returning default summary")
        default_summary = {
            "title": section_title,
            "content": [{
                "text": "- No relevant content found [Unknown]",
                "source": "",
                "quote": "",
                "subitems": []
            }],
            "source": "Unknown"
        }
        visualize_note_section(default_summary, output_file="notes.org")
        return default_summary

    # Step 3: Initial summarization
    summary = initial_summarisation(
        relevant_chunks=initial_chunks,
        section_meta=section_meta,
        topic=topic
    )

    # Step 4: Validation and refinement loop
    iteration = 0
    current_queries = initial_queries.copy()

    while iteration < max_iterations:
        print(f"\nIteration {iteration + 1}/{max_iterations}")

        # Validation
        validation = validate_summary(
            note_section=summary,
            relevant_chunks=initial_chunks,  # Use initial chunks for validation
            section_meta=section_meta,
            topic=topic
        )

        # Check stopping criteria
        if validation['is_valid']:
            print("Summary validated successfully")
            return summary

        print("Validation issues:", validation['feedback'])
        print("Follow up questions:", validation['follow_up_questions'])
        if not validation['follow_up_questions']:
            print("No follow-up queries generated; stopping cycle")
            return summary

        # Refinement
        # Retrieve new chunks for follow-up questions
        new_relevant_chunks = get_relevant_chunks(
            topic=topic,
            section_title=section_title,
            section_structure=section_structure,
            queries=validation['follow_up_questions'],
        )

        summary = refine_summary(
            note_section=summary,
            validation_result=validation,
            new_relevant_chunks=new_relevant_chunks,  # New context for gaps
            section_meta=section_meta,
            topic=topic
        )

        # Update queries
        current_queries = list(set(validation['follow_up_questions']))  # Deduplicate
        print("Follow-up queries:", current_queries)

        iteration += 1

    print("Max iterations reached; using last summary")
    return summary

In [159]:
summary = worker_node(topic='Hypertension', section_meta=section_meta, collection=collection)


Iteration 1/3
Validation issues: ["Point '*Family history*, genetic factors account for approximately 30 percent of the variation in blood pressure' not grounded", "Missing subtopic: 'signs'", 'Incorrect Org-mode formatting (missing colon between items)', "Point '*Excessive alcohol consumption*- Excess alcohol intake is associated with the development of hypertension, and alcohol restriction lowers blood pressure in those with increased intake' ambiguous"]
Follow up questions: ['What are the physical signs of hypertension?', 'Can you provide more information on the genetic factors contributing to hypertension?']
Follow-up queries: ['Can you provide more information on the genetic factors contributing to hypertension?', 'What are the physical signs of hypertension?']

Iteration 2/3
Validation issues: ["Point '[Source: overview_of_hypertension_in_adults.json, Chunk: 21]' ambiguous", "Missing subtopic: 'Family history' and 'Signs'", "Incorrect formatting for '- *Major causes of secondary

In [167]:
for section_meta in orchestrator_list['sections']:
    summary = worker_node(topic='Hypertension', section_meta=section_meta, collection=collection)
    visualize_note_section(summary)


Iteration 1/3
Validation issues: ["Point 'Stage 1 hypertension' not grounded", 'Missing subtopic: signs', 'Incorrect formatting']
Follow up questions: ['What are the physical signs of hypertension?']
Follow-up queries: ['What are the physical signs of hypertension?']

Iteration 2/3
Validation issues: ["Point '*Hypertension*' not grounded", 'Missing subtopic: signs', "Incorrect Org-mode syntax for bullet points (should be '- ')", "Ambiguous point '*Stage 2 hypertension*: Systolic ≥140 mmHg or diastolic ≥90 mmHg.' (should specify exactly what values are included in this range)"]
Follow up questions: ['What are the physical signs of hypertension?', "Can you clarify the definition of '*Stage 2 hypertension*'?"]
Follow-up queries: ['What are the physical signs of hypertension?', "Can you clarify the definition of '*Stage 2 hypertension*'?"]

Iteration 3/3
Validation issues: ["Point '- *Definition*: The following definitions and staging system, which are based upon appropriately measured bl

In [166]:
orchestrator_list

{'sections': [{'title': 'Definition',
   'structure': 'Simple list',
   'complexity': 'simple'},
  {'title': 'Epidemiology',
   'structure': 'Simple list',
   'complexity': 'simple'},
  {'title': 'Risk Factors for Hypertension',
   'structure': 'Nested list by risk factor category: lifestyle, genetic, environmental',
   'complexity': 'complex'},
  {'title': 'Investigations',
   'structure': 'Nested list by test categories: routine bloods, microbiology, endoscopy, imaging',
   'complexity': 'complex'},
  {'title': 'Diagnosis of Hypertension',
   'structure': 'Algorithm with diagnostic criteria',
   'complexity': 'complex'},
  {'title': 'Management of Hypertension',
   'structure': 'Nested list by principles, goals, modalities, specific treatments',
   'complexity': 'complex'},
  {'title': 'Initial Drug Therapy for Hypertension',
   'structure': 'Nested list by treatment type: pharmacological, non-pharmacological',
   'complexity': 'complex'}]}

In [162]:
visualize_note_section(summary)

* *Risk Factors for Hypertension* *
  - *Obesity*- Obesity and weight gain are major risk factors for hypertension [16,17](/contents/overview-of-hypertension-in-adults/abstract/16,17).
  - *_Family history_*: Hypertension is approximately twice as common in subjects who have one or two hypertensive parents, with genetic factors accounting for approximately 30 percent of the variation in blood pressure [18,19](/contents/genetic-factors-in-the-pathogenesis-of-hypertension?search=hypertension&topicRef=3852&source=see_link).
  - *Race*: Hypertension tends to be more common and severe in Black patients, with an earlier onset and greater target-organ damage [20].
  - *_Signs_*: Hypertensive signs include hypertension, orthostatic hypotension, and target organ damage [22].
  - *High-sodium diet*: Excess dietary sodium intake increases the risk of hypertension. A low-potassium diet can also contribute to high blood pressure [21].
  - *_Treatment of resistant hypertension_*: Novel therapies, su