In [None]:
import os
from getpass import getpass

# Never write a real key into the notebook. Use the OPENAI_API_KEY that is
# already exported in the environment; only when it is missing, prompt for it
# (input is not echoed). Unconditionally assigning a placeholder here would
# overwrite a valid key that was already set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [3]:
# LlamaIndex Imports
from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import ServiceContext
from llama_index.core import Settings

In [6]:
# Show which llama-index-core release this notebook runs against.
import llama_index.core

core_version = llama_index.core.__version__
print(core_version)

0.12.43


### Download the Data from Wikipedia

In [None]:
import os
import requests

# Make sure the folder exists
os.makedirs("./data", exist_ok=True)

# (printable Wikipedia URL, local filename) pairs
articles = [
    ("https://en.wikipedia.org/w/index.php?title=Ablepharon_macrostomia_syndrome&printable=yes", "ams.txt"),
    ("https://en.wikipedia.org/w/index.php?title=Spinal_muscular_atrophy&printable=yes", "sma.txt"),
    ("https://en.wikipedia.org/w/index.php?title=Retinoschisis&printable=yes", "retinoschisis.txt"),
    ("https://en.wikipedia.org/w/index.php?title=Monosomy_8p&printable=yes", "monosomy8p.txt")
]

# Download each page and store the response body under ./data
for url, filename in articles:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently saving an error page as an article.
    response.raise_for_status()
    filepath = os.path.join("./data", filename)
    with open(filepath, "w", encoding="utf-8") as f:
        # NOTE(review): the body is written as-is; the printable page is presumably
        # HTML rather than plain text, so page chrome may end up in the index — confirm.
        f.write(response.text)
    print(f"âœ… Saved {filename}")

print("ðŸŽ‰ All articles downloaded!")


âœ… Saved ams.txt
âœ… Saved sma.txt
âœ… Saved retinoschisis.txt
âœ… Saved monosomy8p.txt
ðŸŽ‰ All articles downloaded!


### Load your .txt files

In [4]:
# Read everything under ./data into `documents`.
reader = SimpleDirectoryReader(input_dir="./data")
documents = reader.load_data()

# Report how many characters each loaded document contains.
for doc in documents:
    char_count = len(doc.text)
    print(f"Document length: {char_count} characters")


Document length: 121216 characters
Document length: 49541 characters
Document length: 128415 characters
Document length: 470007 characters


### Initialize LLM + ServiceContext

In [None]:
# Initialize the chat model and the embedding model
llm = OpenAI(model="gpt-4o", temperature=0)
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# ServiceContext is no longer usable in current llama-index-core releases:
# ServiceContext.from_defaults(...) raises a ValueError that points to the
# Settings migration guide. The global Settings object replaces it, so the
# models are registered there.
Settings.llm = llm
Settings.embed_model = embed_model

# Kept only so that cells still referencing `service_context` do not fail with
# a NameError; it carries no configuration any more.
service_context = None


### Build the Knowledge Graph Index

In [None]:
# Build the Knowledge Graph Index.
# The models are handed over explicitly instead of through `service_context=`:
# that keyword belongs to the retired ServiceContext API, so the LLM and the
# embedding model would not be taken from it.
kg_index = KnowledgeGraphIndex.from_documents(
    documents,
    llm=llm,
    embed_model=embed_model,
)

# Success
print("âœ… Knowledge Graph built!")


### Turn into Query Engine

In [None]:
# Turn the knowledge-graph index into a query engine.
SIMILARITY_TOP_K = 10  # passed through as `similarity_top_k`
query_engine = kg_index.as_query_engine(similarity_top_k=SIMILARITY_TOP_K)


### Ask questions (GraphRAG in action)

In [None]:
# The two demo questions
question_1 = "What are the symptoms related to Retinoschisis?"
question_2 = "What causes Chromosome 8, Monosomy 8p?"

# Example 1: query, then print the question/answer pair
response_1 = query_engine.query(question_1)
print(f"Q: {question_1}")
print("A:", response_1)

# Example 2: query, then print the question/answer pair
response_2 = query_engine.query(question_2)
print(f"Q: {question_2}")
print("A:", response_2)


### View extracted Knowledge Graph

In [None]:
# Export the extracted graph as a NetworkX object.
graph_nx = kg_index.get_networkx_graph()

# Print every edge together with its attribute dict.
print("Extracted Knowledge Graph edges:")
all_edges = list(graph_nx.edges(data=True))
print(all_edges)

# The graph can also be drawn with networkx + matplotlib (if needed)


# llamaindex_graphrag_raredisease

In [7]:
import os
import requests

# Create data folder
os.makedirs("./data", exist_ok=True)

# (printable Wikipedia URL, local filename) pairs for the rare-disease articles
articles = [
    ("https://en.wikipedia.org/w/index.php?title=Ablepharon_macrostomia_syndrome&printable=yes", "ams.txt"),
    ("https://en.wikipedia.org/w/index.php?title=Spinal_muscular_atrophy&printable=yes", "sma.txt"),
    ("https://en.wikipedia.org/w/index.php?title=Retinoschisis&printable=yes", "retinoschisis.txt"),
    ("https://en.wikipedia.org/w/index.php?title=Monosomy_8p&printable=yes", "monosomy8p.txt"),
    ("https://en.wikipedia.org/w/index.php?title=DiGeorge_syndrome&printable=yes", "digeorge.txt"),
    ("https://en.wikipedia.org/w/index.php?title=Niemann%E2%80%93Pick_disease&printable=yes", "niemannpick.txt")
]

# Download each page and store the response body under ./data
for url, filename in articles:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently saving an error page as an article.
    response.raise_for_status()
    filepath = os.path.join("./data", filename)
    with open(filepath, "w", encoding="utf-8") as f:
        # NOTE(review): the body is written as-is; the printable page is presumably
        # HTML rather than plain text, so page chrome may end up in the index — confirm.
        f.write(response.text)
    print(f"âœ… Saved {filename}")

print("ðŸŽ‰ All articles downloaded!")

âœ… Saved ams.txt
âœ… Saved sma.txt
âœ… Saved retinoschisis.txt
âœ… Saved monosomy8p.txt
âœ… Saved digeorge.txt
âœ… Saved niemannpick.txt
ðŸŽ‰ All articles downloaded!


In [None]:
from getpass import getpass

# Assigning an empty string here would wipe out a key that is already exported
# in the environment. Only prompt (input is not echoed) when no key is set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [9]:
from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex, ServiceContext, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

In [10]:
# Read everything under ./data into `documents`.
reader = SimpleDirectoryReader(input_dir="./data")
documents = reader.load_data()

document_count = len(documents)
print(f"Loaded {document_count} documents.")

# Report how many characters each loaded document contains.
for doc in documents:
    char_count = len(doc.text)
    print(f"Doc length: {char_count} characters")

Loaded 6 documents.
Doc length: 121216 characters
Doc length: 344482 characters
Doc length: 49541 characters
Doc length: 198389 characters
Doc length: 128415 characters
Doc length: 470007 characters


#### LLM + Settings

In [13]:
# Register the chat model on the global Settings object.
chat_model = OpenAI(model="gpt-4o", temperature=0)
Settings.llm = chat_model

# Register the embedding model on the global Settings object.
embedding_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.embed_model = embedding_model

# Build the knowledge-graph index; no models are passed here, so the call
# relies on the Settings configured above.
kg_index = KnowledgeGraphIndex.from_documents(documents)

print("âœ… Knowledge Graph built!")


âœ… Knowledge Graph built!


#### KnowledgeGraphIndex

In [None]:
# Legacy variant kept for reference only: the same build call, but passing a
# `service_context` object. It is disabled; in this part of the notebook the
# index is built with the models registered on the global Settings instead.
#kg_index = KnowledgeGraphIndex.from_documents(
    #documents,
    #service_context=service_context
#)

#print("âœ… Knowledge Graph built!")

In [33]:
# Export the extracted graph as a NetworkX object.
graph_nx = kg_index.get_networkx_graph()

# Summarise the graph size.
node_count = graph_nx.number_of_nodes()
edge_count = graph_nx.number_of_edges()
print(f"Graph has {node_count} nodes and {edge_count} edges.")

# Print the first ten edges (with their attribute dicts) as a sample.
all_edges = list(graph_nx.edges(data=True))
sample_edges = all_edges[:10]
for edge in sample_edges:
    print(edge)


Graph has 33 nodes and 31 edges.
('Ablepharon macrostomia syndrome', 'Rare genetic syndrome', {'label': 'Is', 'title': 'Is'})
('Ablepharon macrostomia syndrome', 'Autosomal dominant disorder', {'label': 'Is', 'title': 'Is'})
('Ablepharon macrostomia syndrome', 'Intersex variation', {'label': 'Is', 'title': 'Is'})
('Ablepharon macrostomia syndrome', 'Short description', {'label': 'Has', 'title': 'Has'})
('Ablepharon macrostomia syndrome', 'Creative commons', {'label': 'Is licensed under', 'title': 'Is licensed under'})
('Ablepharon macrostomia syndrome', 'Article', {'label': 'Is', 'title': 'Is'})
('Ablepharon macrostomia syndrome', 'Turkish', {'label': 'Has language', 'title': 'Has language'})
('Ablepharon macrostomia syndrome', 'English', {'label': 'Has language', 'title': 'Has language'})
('Ablepharon macrostomia syndrome', 'Talk', {'label': 'Has page', 'title': 'Has page'})
('Ablepharon macrostomia syndrome', 'Edit', {'label': 'Has action', 'title': 'Has action'})


In [34]:
# Turn the knowledge-graph index into a query engine.
SIMILARITY_TOP_K = 10  # passed through as `similarity_top_k`
query_engine = kg_index.as_query_engine(similarity_top_k=SIMILARITY_TOP_K)

In [35]:
# The two demo questions
q1 = "What are the symptoms of Spinal Muscular Atrophy?"
q2 = "What genetic abnormalities lead to symptoms in Monosomy 8p?"

# Example 1: query, then print the question/answer pair
r1 = query_engine.query(q1)
print(f"Q: {q1}")
print("A:", r1)

# Example 2: query, then print the question/answer pair
r2 = query_engine.query(q2)
print(f"Q: {q2}")
print("A:", r2)


Q: What are the symptoms of Spinal Muscular Atrophy?
A: The symptoms of Spinal Muscular Atrophy (SMA) include decreased impulse transmission through motor neurons, leading to decreased contractile activity and progressive atrophy of denervated muscles. Muscles of the lower extremities are usually affected first, followed by those of the upper extremities, spine, and neck. In more severe cases, pulmonary and mastication muscles may also be affected. Proximal muscles are typically affected earlier and more severely than distal muscles. The severity of symptoms is related to the number of copies of the SMN2 gene, with more copies generally resulting in milder symptoms.
Q: What genetic abnormalities lead to symptoms in Monosomy 8p?
A: The specific genetic abnormalities that lead to symptoms in Monosomy 8p are not detailed in the provided information. Monosomy 8p generally involves the deletion of a portion of the short arm of chromosome 8, which can result in various developmental and phys

#### Triple Demo

In [36]:
# Insert a manual (subject, predicate, object) triple into the NetworkX graph.
# NOTE(review): this edits the `graph_nx` object only; presumably `kg_index`
# itself is not changed by it — confirm, and use the index's own triplet
# insertion API if the triple should take part in queries.
subject, predicate, obj = "DiGeorge Syndrome", "CAUSES", "T-cell deficiency"

# The extracted edges carry their predicate under `label` and `title`, so the
# manual edge sets those too; `relation` is kept for backward compatibility.
graph_nx.add_edge(subject, obj, relation=predicate, label=predicate, title=predicate)

# Verify against the graph itself rather than echoing a hard-coded tuple.
assert graph_nx.has_edge(subject, obj), "manual triple was not added to the graph"
print("Manual triple inserted:")
print((subject, predicate, obj))


Manual triple inserted:
('DiGeorge Syndrome', 'CAUSES', 'T-cell deficiency')


In [None]:
# Prompt (Entity / Relation Prompt)
# KG_TRIPLE_EXTRACTION_PROMPT.template = "Your new prompt here ..."

In [None]:
# Future: replace NetworkX with Neo4j backend
# from llama_index.graph_stores.neo4j import Neo4jGraphStore
# graph_store = Neo4jGraphStore(...)

#### Prompt

In [None]:
# Draft of a domain-specific triple-extraction prompt (not wired in yet).
# Both the import and the assignment below are commented out, so this cell only
# evaluates the bare string that follows; no prompt object is modified.
# NOTE(review): before enabling, confirm the import path / prompt object name
# and which template variable the library expects (written here as
# {context_str}).
#from llama_index.core.indices.knowledge_graph import KG_TRIPLE_EXTRACTION_PROMPT

#KG_TRIPLE_EXTRACTION_PROMPT.template =
"""
Extract entities and relationships from the following text.
Output format: (subject, predicate, object)

Focus on:
- Rare diseases
- Genes
- Symptoms
- Causal relationships
- Treatments
- Multi-hop relations

Text:
{context_str}
"""

#### Neo4j backend demo

In [37]:
import os

from llama_index.core import StorageContext
from llama_index.graph_stores.neo4j import Neo4jGraphStore

# Connection settings can be overridden through environment variables so real
# credentials never have to be written into the notebook; the fallbacks are the
# local demo values.
graph_store = Neo4jGraphStore(
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "llamaone"),
    url=os.environ.get("NEO4J_URL", "bolt://localhost:7687"),
    database=os.environ.get("NEO4J_DATABASE", "neo4j"),
)

# The graph store is handed to the index through a StorageContext, which is the
# documented route. A bare `graph_store=` keyword is not a documented
# from_documents parameter, so the triples would presumably land in the default
# graph store instead of Neo4j.
storage_context = StorageContext.from_defaults(graph_store=graph_store)

kg_index_neo4j = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
)

print("âœ… Neo4j Knowledge Graph built!")


âœ… Neo4j Knowledge Graph built!


In [38]:
# Recap: number of loaded documents and the character count of each.
document_count = len(documents)
print(f"Loaded {document_count} documents.")
for doc in documents:
    char_count = len(doc.text)
    print(f"Length: {char_count}")

Loaded 6 documents.
Length: 121216
Length: 344482
Length: 49541
Length: 198389
Length: 128415
Length: 470007
