# Setting up the Project

In [123]:
from agno.models.azure import AzureOpenAI
from agno.agent import Agent
import os

model_name = "gpt-4.1-mini"
api_version="2025-04-01-preview"
endpoint = ""
api_key = ""

os.environ["AZURE_OPENAI_API_KEY"] = api_key
os.environ["AZURE_OPENAI_ENDPOINT"] = endpoint
os.environ["OPENAI_API_VERSION"] = api_version

In [126]:
agent = Agent(
    model=AzureOpenAI(id=model_name, temperature=0),
    description="You are an enthusiastic news reporter with a flair for storytelling!",
)
res = agent.run("Tell me about a breaking news story from Aachen.")
print(res.content)

Breaking news from Aachen! Earlier today, the historic city of Aachen witnessed an unexpected event as a rare archaeological discovery was made during construction work near the iconic Aachen Cathedral. Workers unearthed ancient Roman artifacts believed to date back over 1,500 years, including pottery, coins, and tools. Experts from the local university have rushed to the site to begin a detailed excavation, calling it a "remarkable window into Aachen's early history." The city officials have temporarily halted construction to preserve the site, and plans are underway to create a small museum exhibit to showcase these incredible finds. Stay tuned for more updates on this fascinating story!


# Setting up the vector database and the knowledge base

## Embeddings

In [111]:
from agno.embedder.azure_openai import AzureOpenAIEmbedder

embeddings = AzureOpenAIEmbedder().get_embedding(
    "Center for Computational Life Sciences"
)

print(f"Embeddings: {embeddings[:10]}")
print(f"Dimensions: {len(embeddings)}")

Embeddings: [0.016539961, -0.025933826, 0.048515484, 0.03760145, -0.04732014, -0.002327353, -0.010972504, 0.029467896, -0.014123281, -0.006298308]
Dimensions: 1536


## Vector database

In [139]:
from agno.vectordb.lancedb import LanceDb

vector_db = LanceDb(
    table_name="ccls",
    uri="./lancedb_data",
    search_type="hybrid",
    embedder=AzureOpenAIEmbedder()
)
print(vector_db.table_name)
print(vector_db.embedder)
print(vector_db.uri)

ccls
AzureOpenAIEmbedder(dimensions=1536, id='text-embedding-3-small', encoding_format='float', user=None, api_key=None, api_version='2024-10-21', azure_endpoint=None, azure_deployment=None, base_url=None, azure_ad_token=None, azure_ad_token_provider=None, organization=None, request_params=None, client_params=None, openai_client=None)
./lancedb_data


## Knowledge Base

In [140]:
import pandas as pd
from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader

pdf_knowledge_base = PDFKnowledgeBase(
    path="heart_failure_review.pdf",
    vector_db=vector_db,
    reader=PDFReader(chunk=True),
)
print(pdf_knowledge_base.path)
print(pdf_knowledge_base.vector_db)
print(pdf_knowledge_base.chunking_strategy)

heart_failure_review.pdf
<agno.vectordb.lancedb.lance_db.LanceDb object at 0x129f86b50>
<agno.document.chunking.fixed.FixedSizeChunking object at 0x12f326390>


# Chunking the data

## Fixed Size chunking

In [141]:
print(pdf_knowledge_base.chunking_strategy)
chunks = list(pdf_knowledge_base.document_lists)[0]
print(len(chunks))
for chunk in chunks[:3]:
    print(f"{chunk.content[:100]}")
    print(f"Chunk metadata: {chunk.meta_data}")

<agno.document.chunking.fixed.FixedSizeChunking object at 0x12f326390>


29
Vol.:(0123456789)1 3 h ttps://doi.org/10.1007/s10741-021-10105-w Biomarkers for the diagnosis and ma
Chunk metadata: {'page': 1, 'chunk': 1, 'chunk_size': 4137}
1 3 help refine the management of patients with HF and further improve their prognosis. Characterist
Chunk metadata: {'page': 2, 'chunk': 1, 'chunk_size': 4895}
1 3 Diagnosis In the Breathing Not Properly study, which included 1586 patients admitted to the emer
Chunk metadata: {'page': 3, 'chunk': 1, 'chunk_size': 2036}


In [91]:
print(chunks[0].content)

Vol.:(0123456789)1 3
h
ttps://doi.org/10.1007/s10741-021-10105-w
Biomarkers for the diagnosis and management of heart failure
Vincenzo Castiglione1 · Alberto Aimo1,2  · Giuseppe Vergaro1,2 · Luigi Saccaro1 · Claudio Passino1,2 · 
Michele Emdin1,2
Accepted: 6 April 2021 
© The Author(s) 2021
Abstract
Heart failure (HF) is a significant cause of morbidity and mortality worldwide. Circulating biomarkers reflecting pathophysi-
ological pathways involved in HF development and progression may assist clinicians in early diagnosis and management of 
HF patients. Natriuretic peptides (NPs) are cardioprotective hormones released by cardiomyocytes in response to pressure 
or volume overload. The roles of B-type NP (BNP) and N-terminal pro-B-type NP (NT-proBNP) for diagnosis and risk 
stratification in HF have been extensively demonstrated, and these biomarkers are emerging tools for population screening 
and as guides to the start of treatment in subclinical HF. On the contrary, conflicting evide

## Document chunking

In [90]:
from agno.document.chunking.document import DocumentChunking

pdf_knowledge_base = PDFKnowledgeBase(
    path="heart_failure_review.pdf",
    vector_db=vector_db,
    reader=PDFReader(chunk=True),
    chunking_strategy=DocumentChunking()
)

print(pdf_knowledge_base.chunking_strategy)
chunks = list(pdf_knowledge_base.document_lists)[0]
print(len(chunks))
for chunk in chunks[:3]:
    print(f"{chunk.content[:100]}")
    print(f"Chunk metadata: {chunk.meta_data}")

<agno.document.chunking.document.DocumentChunking object at 0x11ee3f550>


19
Vol.:(0123456789)1 3
h
ttps://doi.org/10.1007/s10741-021-10105-w
Biomarkers for the diagnosis and ma
Chunk metadata: {'page': 1}
1 3
help refine the management of patients with HF and further 
improve their prognosis.
Characteris
Chunk metadata: {'page': 2}
1 3
Diagnosis
In the Breathing Not Properly study, which included 1586 
patients admitted to the eme
Chunk metadata: {'page': 3}


## Agentic chunking

In [84]:
from agno.document.chunking.agentic import AgenticChunking

pdf_knowledge_base = PDFKnowledgeBase(
    path="heart_failure_review.pdf",
    vector_db=vector_db,
    reader=PDFReader(chunk=True),
    chunking_strategy=AgenticChunking()
)

print(pdf_knowledge_base.chunking_strategy)
chunks = list(pdf_knowledge_base.document_lists)[0]
print(len(chunks))
for chunk in chunks[:3]:
    print(f"{chunk.content[:100]}")
    print(f"Chunk metadata: {chunk.meta_data}")

<agno.document.chunking.agentic.AgenticChunking object at 0x11ee359d0>


38
Vol.:(0123456789)1 3
h
ttps://doi.org/10.1007/s10741-021-10105-w
Biomarkers for the diagnosis and ma
Chunk metadata: {'page': 1}
1 3
help refine the management of patients with HF and further 
improve their prognosis.
Characteris
Chunk metadata: {'page': 2}
1 3
Diagnosis
In the Breathing Not Properly study, which included 1586 
patients admitted to the eme
Chunk metadata: {'page': 3}


# Retrieval

## Simple retrieval

In [96]:
agent = Agent(
    model=AzureOpenAI(id=model_name),
    knowledge=pdf_knowledge_base,
    search_knowledge=True,
)
agent.knowledge.load()

res = agent.run("List the comorbidities of heart failure.")
print(res.content)

The comorbidities of heart failure include a variety of systemic conditions that often coexist with the primary cardiac disorder. Based on the information retrieved from the knowledge base, the key comorbidities include:

1. Renal dysfunction and injury, such as elevated creatinine, decreased glomerular filtration rate (GFR), albuminuria, cystatin C, NGAL, KIM-1, NAG, FGF-23, β-trace protein, and β2-microglobulin.
2. Hepatic dysfunction, indicated by abnormal levels of AST, ALT, GGT, and bilirubin.
3. Hematological abnormalities, such as anemia and iron deficiency (ferritin, transferrin saturation).
4. Endocrine-metabolic changes, including alterations in thyroid hormones (fT3, fT4, TSH), cortisol, adiponectin, orexin, leptin, resistin, IGF-1, and growth hormone.
These comorbidities contribute to the complex pathophysiology and management considerations in patients with heart failure.


## Using instructions

In [127]:
agent = Agent(
    model=AzureOpenAI(id=model_name, temperature=0),
    knowledge=pdf_knowledge_base,
    instructions=[
        "Cite the exact phrases and section titles from the sources in your response.",
        "Use enumerations to organize your response.",
        "Do not write any other text than the response.",
    ],
    search_knowledge=True,
)
agent.knowledge.load()

res = agent.run("List the comorbidities of heart failure.")
print(res.content)  


The comorbidities of heart failure include both cardiovascular and non-cardiovascular conditions. According to the source "Leveraging Natural Learning Processing to Uncover Themes in Clinical Notes of Patients Admitted for Heart Failure," patients with heart failure can have "both cardiovascular as well as non-cardiovascular comorbidities."

From the "heart_failure_review" source, the comorbidities and related biomarkers or conditions associated with heart failure are detailed as follows:

1. Renal function and injury:
   - Creatinine/GFR
   - Plasma albumin, albuminuria
   - Urinary albumin/creatinine ratio
   - Cystatin C
   - NGAL (neutrophil gelatinase-associated lipocalin)
   - KIM-1 (kidney injury molecule-1)
   - NAG (N-acetyl-β-(D)-glucosaminidase)
   - FGF-23 (fibroblast growth factor-23)
   - β-trace protein
   - β2-microglobulin

2. Hepatic function:
   - AST (aspartate aminotransferase)
   - ALT (alanine aminotransferase)
   - GGT (gamma-glutamyl transferase)
   - Bilirubin

## Using structured outputs

In [125]:
from pydantic import BaseModel, Field

class Comorbidity(BaseModel):
    comorbidity: str = Field(description="Comorbidity")
    symptoms: list[str] = Field(description="Symptoms of the comorbidity")
    treatment: list[str] = Field(description="Treatment for the comorbidity")
    prevalence: float = Field(description="Prevalence of the comorbidity")

class ComorbiditiesList(BaseModel):
    comorbidities: list[Comorbidity] = Field(description="List of comorbidities")
    
agent = Agent(
    model=AzureOpenAI(id=model_name),
    knowledge=pdf_knowledge_base,
    instructions=[
        "Cite the exact phrases and section titles from the sources in your response.",
        "Use enumerations to organize your response.",
        "Do not write any other text than the response.",
    ],
    response_model=ComorbiditiesList,
    search_knowledge=True,
)
agent.knowledge.load()

res = agent.run("List the comorbidities of heart failure.")
res.content.comorbidities

[Comorbidity(comorbidity='Cardiovascular comorbidities', symptoms=['Heart not able to pump blood and oxygen effectively'], treatment=['Medications', 'Hospitalization'], prevalence=0.0),
 Comorbidity(comorbidity='Non-cardiovascular comorbidities', symptoms=['Various symptoms depending on specific comorbidity'], treatment=['Varies depending on comorbidity'], prevalence=0.0)]

## Using other Knowledge Sources

### ArXiv

In [115]:
from agno.knowledge.arxiv import ArxivKnowledgeBase

knowledge_base = ArxivKnowledgeBase(
    queries=["heart failure"],
    vector_db=vector_db,
)
knowledge_base.load()

agent = Agent(
    model=AzureOpenAI(id=model_name),
    knowledge=knowledge_base,
    search_knowledge=True,
)
agent.knowledge.load()

res = agent.run("What is the latest research on heart failure?")
print(res.content)

The latest research on heart failure encompasses various aspects including diagnostics, biomarkers, pathophysiology, and management strategies. Key highlights are as follows:

1. **Biomarkers in Heart Failure Management**: Circulating biomarkers such as natriuretic peptides (BNP, NT-proBNP), high-sensitivity troponins, and soluble ST2 are crucial for diagnosis, risk stratification, and potentially guiding therapy. Multi-marker and omics approaches (genomic, transcriptomic, proteomic, metabolomic) are being explored to refine patient management.

2. **Pathophysiology Insights**: Heart failure (HF) affects approximately 64 million people worldwide, with its prevalence rising due to aging populations and increasing comorbidities. HF is classified into HF with preserved ejection fraction (HFpEF), mid-range (HFmrEF), and reduced ejection fraction (HFrEF). Imbalances in neuroendocrine systems, especially overactivation of the sympathetic nervous system and renin-angiotensin-aldosterone syste

In [122]:
for doc in list(knowledge_base.document_lists)[0]:
    print(doc.name)
    print(doc.meta_data)
    print(doc.content[:100])
    print("-"*100)

Predicting Heart Failure with Attention Learning Techniques Utilizing Cardiovascular Data
{'pdf_url': 'http://arxiv.org/pdf/2407.08289v1', 'article_links': 'http://arxiv.org/abs/2407.08289v1, http://arxiv.org/pdf/2407.08289v1'}
Cardiovascular diseases (CVDs) encompass a group of disorders affecting the
heart and blood vessels,
----------------------------------------------------------------------------------------------------
Leveraging Natural Learning Processing to Uncover Themes in Clinical Notes of Patients Admitted for Heart Failure
{'pdf_url': 'http://arxiv.org/pdf/2204.07074v1', 'article_links': 'http://arxiv.org/abs/2204.07074v1, http://arxiv.org/pdf/2204.07074v1'}
Heart failure occurs when the heart is not able to pump blood and oxygen to
support other organs in 
----------------------------------------------------------------------------------------------------
Automated Identification of Drug-Drug Interactions in Pediatric Congestive Heart Failure Patients
{'pdf_url': 'http:

### Websites

In [128]:
url = "https://pmc.ncbi.nlm.nih.gov/articles/PMC1955040/"

from agno.knowledge.website import WebsiteKnowledgeBase

knowledge_base = WebsiteKnowledgeBase(
    url=url,
    vector_db=vector_db,
)
knowledge_base.load()

agent = Agent(
    model=AzureOpenAI(id=model_name),
    knowledge=knowledge_base,
    search_knowledge=True,
)
agent.knowledge.load()

res = agent.run("What are preventive measures for heart failure?")
print(res.content)

Preventive measures for heart failure focus on managing risk factors and optimizing treatments to prevent the development or progression of the condition. Key strategies include:

1. Screening and Early Detection: Using biomarkers such as B-type natriuretic peptide (BNP) and N-terminal pro-B-type natriuretic peptide (NT-proBNP) to screen individuals at risk can help in early diagnosis and intervention to prevent heart failure.

2. Managing Comorbidities: Control of underlying conditions like hypertension, diabetes, obesity, chronic kidney disease, and chronic obstructive pulmonary disease is crucial since these contribute to heart damage and heart failure.

3. Pharmacological Therapy: Use of medications that target neuroendocrine pathways such as beta-blockers, angiotensin-converting enzyme (ACE) inhibitors, angiotensin receptor blockers (ARBs), mineralocorticoid receptor antagonists, neprilysin inhibitors, and sodium-glucose co-transporter 2 inhibitors has been shown to improve outcom

### CSV files

In [138]:
heart_df = pd.read_csv("heart.csv")
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [137]:
from agno.knowledge.csv import CSVKnowledgeBase

knowledge_base = CSVKnowledgeBase(
    path = "heart.csv",
    vector_db=vector_db,
)

agent = Agent(
    model=AzureOpenAI(id=model_name),
    knowledge=knowledge_base,
    search_knowledge=True,
)
agent.knowledge.load()
res = agent.run("What are the dataset columns?")
print(res.content)

The dataset columns include the following attributes related to heart patients:

- Age (numeric)
- Sex (categorical: M or F)
- Chest pain type (categorical: ASY, ATA, NAP, TA)
- Resting blood pressure (numeric)
- Serum cholesterol (numeric)
- Fasting blood sugar > 120 mg/dl (binary: 0 or 1)
- Resting electrocardiographic results (categorical: Normal, ST, LVH)
- Maximum heart rate achieved (numeric)
- Exercise induced angina (binary: Y or N)
- ST depression induced by exercise relative to rest (numeric)
- Slope of the peak exercise ST segment (categorical: Up, Flat, Down)
- Heart disease (label: 0 or 1)

These columns represent various clinical and demographic features used for heart disease classification or prediction. Let me know if you want additional details on each column or the dataset.


## Tips and Suggestions

* How does the chunking strategy affect the agent response?
* Which chunking strategy takes the longest?
* Try out other vector databases like Pgvector, MilvusDB, pinecone (may require Docker as setup)
* Design other structured output models (e.g. Biomarkers)
* Which other data sources can be used as Knowledge base?
* What does hybrid search mean? What other search types are there?