In [None]:
import re
from typing import List
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

In [35]:
import json
# Load the article JSON; the path is relative to the notebook's working directory.
file = "json_data/hypertension_in_adults_initial_drug_therapy.json"
with open(file, 'r', encoding='utf-8') as f:
    # `data` is the parsed JSON; later cells read data['content']['markdown'].
    data = json.load(f)




In [3]:
# Display the full parsed JSON object (the output can be very long).
data

{'metadata': {'title': 'Overview of hypertension in adults',
  'url': 'https://www.uptodate.com.eproxy.lib.hku.hk/contents/overview-of-hypertension-in-adults?search=hypertension&source=search_result&selectedTitle=1~150&usage_type=default&display_rank=1#topicGraphics',
  'lastUpdated': None,
  'authors': [],
  'rank': 1,
  'extractedAt': '2025-07-19T06:47:46.076Z',
  'savedAt': '2025-07-19T06:47:55.449Z',
  'searchPattern': 'hypertension',
  'rankInResults': 1},
 'content': {'markdown': '- \n - Find in topic\n  - Formulary\n - [Print](/contents/overview-of-hypertension-in-adults/print?search=hypertension&source=search_result&selectedTitle=1~150&usage_type=default&display_rank=1)\n -   \n -    Feedback\n - [](javascript:void(0)) \n\n    ## GRAPHICS [View All](/contents/overview-of-hypertension-in-adults?search=hypertension&source=search_result&selectedTitle=1~150&usage_type=default&display_rank=1#)\n\n## CALCULATORS\n\n## RELATED TOPICS\n\n Overview of hypertension in adults  Authors:[Ja

In [36]:
# Pull the article's markdown out of the loaded JSON ('content' -> 'markdown').
md_text = data['content']['markdown']

In [None]:
# Print the complete markdown text for inspection (the output can be very long).
print(md_text)

'- \n - Find in topic\n  - Formulary\n - [Print](/contents/overview-of-hypertension-in-adults/print?search=hypertension&source=search_result&selectedTitle=1~150&usage_type=default&display_rank=1)\n -   \n -    Feedback\n - [](javascript:void(0)) \n\n    ## GRAPHICS [View All](/contents/overview-of-hypertension-in-adults?search=hypertension&source=search_result&selectedTitle=1~150&usage_type=default&display_rank=1#)\n\n## CALCULATORS\n\n## RELATED TOPICS\n\n Overview of hypertension in adults  Authors:[Jan Neil Basile, MD](/contents/overview-of-hypertension-in-adults/contributors)[Michael J Bloch, MD, FACP, FASH, FSVM, FNLA](/contents/overview-of-hypertension-in-adults/contributors) Section Editor:[William B White, MD](/contents/overview-of-hypertension-in-adults/contributors) Deputy Editors:[Karen Law, MD, FACP](/contents/overview-of-hypertension-in-adults/contributors)[John P Forman, MD, MSc](/contents/overview-of-hypertension-in-adults/contributors)[Contributor Disclosures](/contents

In [None]:
def split_markdown_by_full_caps(markdown_content: str, source: str = "") -> List[Document]:
    """
    Split article markdown into one Document per full-caps section heading.

    A heading is a run of uppercase ASCII letters and whitespace that begins at
    the start of the text or right after a blank line, optionally followed by
    " — ". Everything up to the next such heading (or the end of the text)
    becomes that section's body.

    Args:
        markdown_content: Markdown text of the article.
        source: Optional identifier of the originating file. When non-empty it
            is stored under metadata["source"], the key summarize_document
            reads when citing a section. Left out of the metadata when empty.

    Returns:
        List of Document objects whose page_content is the heading, a newline,
        and the stripped section body, and whose metadata holds "heading"
        (plus "source" when one was given). Empty list for empty input.
    """
    if not markdown_content:
        return []

    # Regex to match headings: optional \n\n or start, full caps text, optional " —", optional \n\n, content until next heading or end
    # re.DOTALL lets the lazy body group run across newlines.
    pattern = r'(?:(?:\n\n|^)([A-Z\s]+)(?: — )?\n\n?(.*?)(?=\n\n[A-Z\s]+(?: — )?\n\n?|$))'

    documents = []
    for match in re.finditer(pattern, markdown_content, re.DOTALL):
        heading = match.group(1).strip()
        if not heading:
            # [A-Z\s]+ can match whitespace only; such matches are not sections.
            continue
        content = match.group(2).strip()
        metadata = {"heading": heading}
        if source:
            metadata["source"] = source
        documents.append(Document(
            page_content=f"{heading}\n{content}",
            metadata=metadata
        ))

    return documents

In [37]:
# Split the article markdown into per-heading Document objects and show how many were found.
documents = split_markdown_by_full_caps(md_text)
len(documents)

9

In [38]:
# Display every parsed Document (the output can be very long).
documents

[Document(metadata={'heading': 'DEFINITIONS', 'source': 'uptodate_data/hypertension.json'}, page_content='DEFINITIONS\nDiagnosing hypertension requires a series of repeated blood pressure measurements either in the office or using ambulatory blood pressure monitoring or self-measured blood pressure  ([table 1](/contents/image?imageKey=PC%2F147001&topicKey=PC%2F3869&search=hypertension&rank=2%7E150&source=see_link)). (See  ["Ambulatory blood pressure monitoring: Indications and procedure"](/contents/ambulatory-blood-pressure-monitoring-indications-and-procedure?search=hypertension&topicRef=3869&source=see_link) and  ["Hypertension in adults: Blood pressure measurement and diagnosis"](/contents/hypertension-in-adults-blood-pressure-measurement-and-diagnosis?search=hypertension&topicRef=3869&source=see_link).)\n\nWe use the American College of Cardiology/American Heart Association (ACC/AHA) definitions and staging system for hypertension [[4](/contents/hypertension-in-adults-initial-drug-

In [39]:
def filter_documents_before_guidelines(documents: List[Document]) -> List[Document]:
    """
    Keep only the sections that come before 'SOCIETY GUIDELINE LINKS'.

    The first Document whose metadata 'heading' equals 'SOCIETY GUIDELINE LINKS'
    acts as the cut-off: that Document and everything after it are dropped.

    Args:
        documents: List of Document objects; each one's metadata is looked up
            for the 'heading' key.

    Returns:
        A new list holding the Documents ahead of the cut-off, or the input
        list itself (unchanged) when no Document carries that heading.
    """
    # Index of the first matching section, or None when there is none.
    cutoff = next(
        (
            position
            for position, section in enumerate(documents)
            if section.metadata.get("heading") == "SOCIETY GUIDELINE LINKS"
        ),
        None,
    )
    if cutoff is None:
        return documents
    return documents[:cutoff]

# Rebinds `documents` to the filtered list, so the unfiltered list is no longer reachable under this name.
documents = filter_documents_before_guidelines(documents=documents)

# Number of sections kept after filtering.
len(documents)

5

In [None]:
# Define prompts for iterative refinement
# SYSTEM_PROMPT is used as the system message of both chat templates below.
# It contains the placeholders {heading} and {questions}, so both must be
# supplied whenever either template is formatted.
SYSTEM_PROMPT = """
You are an expert medical note-taker summarizing UpToDate articles. Produce concise, accurate summaries in markdown format, with each section under a level-3 heading (### {heading}) followed by a nested bullet-point list. Organize content hierarchically where level of nesting represents increasing detail, as in:
- **Subtopic** - Detail...:
  - *Sub-detail 1* - Further detail...
  - *Sub-detail 2* - Further detail....
Preserve the article’s full caps headings (e.g., DEFINITIONS). Only use provided text, cite the source (filename) in each section’s summary, and ignore image links (e.g., [[file:img/...]]). Focus on clinical insights like etiology, treatments, and diagnostics. Ensure completeness by addressing these questions if relevant: {questions}.
"""

# First-pass prompt. Placeholders: {heading}, {source}, {page_content}
# (plus {questions} through the system message).
INITIAL_PROMPT = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT),
    ("human", """
You are summarizing the {heading} section of an UpToDate article. Summarize the following content in markdown format under a level-3 heading (### {heading}), using a nested bullet-point list to organize key points hierarchically (e.g., - *Subtopic* - Detail...; - _Sub-detail_ - Further detail...). Provide a concise overview, citing the source ({source}) at the end of the top-level bullet list. Ignore image links (e.g., [[file:img/...]]). Only use the provided text:
{page_content}
""")
])

# Refinement prompt. Placeholders: {heading}, {existing_summary}, {page_content},
# {source}, {questions}.
# NOTE(review): whatever chain consumes these templates has to feed the chunk
# text as `page_content` and the running summary as `existing_summary` —
# confirm against the chain configuration.
REFINE_PROMPT = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT),
    ("human", """
You are summarizing the {heading} section of an UpToDate article. Here’s the current summary in markdown format:
{existing_summary}

Incorporate this new content:
{page_content}

Update the summary under the level-3 heading (### {heading}), maintaining a nested bullet-point list with hierarchical organization (e.g., - *Subtopic* - Detail...; - _Sub-detail_ - Further detail...). Cite the source ({source}) at the end of the top-level bullet list. Ignore image links (e.g., [[file:img/...]]). Does this new content answer any of these questions: {questions}? Include relevant answers in the summary. Only use provided text.
""")
])


In [None]:
def summarize_document(doc: Document, llm, questions: "List[str] | None" = None) -> str:
    """
    Summarize one article section with a refine-style summarization chain.

    The section text is cut into overlapping chunks; the first chunk is
    summarized with INITIAL_PROMPT and each following chunk is folded into the
    running summary with REFINE_PROMPT.

    Args:
        doc: Section to summarize. metadata["heading"] and metadata["source"]
            are used in the prompts and the returned line; they fall back to
            "UNKNOWN" and "unknown" when missing.
        llm: Chat model handed to load_summarize_chain.
        questions: Questions the summary should try to answer. When omitted,
            the module-level RUNNING_QUESTIONS is used if it exists, otherwise
            no questions are passed.

    Returns:
        A single string of the form "- *<heading>* - <summary> [<source>]".
    """
    heading = doc.metadata.get("heading", "UNKNOWN")
    source = doc.metadata.get("source", "unknown")
    if questions is None:
        # RUNNING_QUESTIONS is not defined in the cells shown here; look it up
        # defensively so a missing definition does not raise NameError.
        questions = globals().get("RUNNING_QUESTIONS", [])

    # Split document content into chunks, prioritizing paragraphs
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""]
    )
    content_chunks = text_splitter.split_text(doc.page_content)
    chunk_docs = [Document(page_content=chunk, metadata=doc.metadata) for chunk in content_chunks]

    # Set up refine chain.
    # The prompts name their variables {page_content} and {existing_summary},
    # while the refine chain defaults to "text" and "existing_answer"; the
    # names must be passed explicitly or chain construction fails validation.
    chain = load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=INITIAL_PROMPT,
        refine_prompt=REFINE_PROMPT,
        document_variable_name="page_content",
        initial_response_name="existing_summary",
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text"
    )

    # Run the chain with heading and questions
    summary = chain.invoke({
        "input_documents": chunk_docs,
        "heading": heading,
        "source": source,
        "questions": ", ".join(questions)
    })["output_text"]

    # NOTE(review): the prompts ask the model for a "### heading" block, which
    # this prefix places mid-line — confirm the intended final format.
    return f"- *{heading}* - {summary} [{source}]"

In [None]:
def summarize_json(
    markdown_content: str,
    source: str = "",
    model: str = "llama3.2",
    base_url: str = "http://localhost:11434",
) -> str:
    """
    Summarize an article's markdown section by section.

    The markdown is split on full-caps headings, sections from
    'SOCIETY GUIDELINE LINKS' onward are dropped, and each remaining section
    is summarized with a local Ollama chat model.

    Args:
        markdown_content: Markdown text of the article.
        source: Optional identifier of the originating file. When non-empty it
            is attached to every section that has no metadata["source"] yet,
            so the summaries can cite it.
        model: Name of the Ollama model to use.
        base_url: URL of the Ollama server.

    Returns:
        The per-section summaries joined with newlines, or a string starting
        with "Error:" when there is no content or no section survives
        filtering.
    """
    if not markdown_content:
        return "Error: No content loaded"

    # Split and filter documents
    documents = split_markdown_by_full_caps(markdown_content)
    filtered_documents = filter_documents_before_guidelines(documents)
    if not filtered_documents:
        return "Error: No sections parsed or all sections after SOCIETY GUIDELINE LINKS"

    if source:
        for doc in filtered_documents:
            # setdefault keeps any source already recorded on the section.
            doc.metadata.setdefault("source", source)

    # Initialize LLM
    llm = ChatOllama(model=model, base_url=base_url)

    # Summarize each document
    summaries = [summarize_document(doc, llm) for doc in filtered_documents]

    # Combine summaries into a single markdown string
    return "\n".join(summaries)