# Neo4j GraphRAG test

https://github.com/neo4j-product-examples/graphrag-python-examples/blob/main/end-to-end-lupus.ipynb

In [1]:
from dotenv import load_dotenv
import os

# load neo4j credentials (and openai api key in background).
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

#uncomment this line if you aren't using a .env file
# os.environ['OPENAI_API_KEY'] = 'copy_paste_the_openai_key_here'

In [2]:
import openai
import neo4j
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
from neo4j_graphrag.llm import AzureOpenAILLM
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv('.env')

# from neo4j_graphrag.llm import OpenAILLM
# os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
# driver = neo4j.GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
# ex_llm=OpenAILLM(
#     model_name=os.getenv("GRAPHRAG_LLM_MODEL"),
#     model_params={
#         "response_format": {"type": "json_object"}, # use json_object formatting for best results
#         "temperature": 0 # turning temperature down for more deterministic results
#     }
# )

driver = neo4j.GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

ex_llm = AzureOpenAILLM(
    model_name=os.getenv("GRAPHRAG_LLM_MODEL"),
    azure_endpoint=os.getenv("GRAPHRAG_API_BASE"),  # update with your endpoint
    api_version=os.getenv("GRAPHRAG_API_VERSION"),  # update appropriate version
    api_key=os.getenv("GRAPHRAG_API_KEY"),  # api_key is optional and can also be set with OPENAI_API_KEY env var
)

#create text embedder
embedder = OpenAIEmbeddings()

In [3]:
#define node labels
basic_node_labels = ["Object", "Entity", "Group", "Person", "Organization", "Place"]

academic_node_labels = ["ArticleOrPaper", "PublicationOrJournal"]

medical_node_labels = ["Anatomy", "BiologicalProcess", "Cell", "CellularComponent", 
                       "CellType", "Condition", "Disease", "Drug",
                       "EffectOrPhenotype", "Exposure", "GeneOrProtein", "Molecule",
                       "MolecularFunction", "Pathway"]

node_labels = basic_node_labels + academic_node_labels + medical_node_labels

# define relationship types
rel_types = ["ACTIVATES", "AFFECTS", "ASSESSES", "ASSOCIATED_WITH", "AUTHORED",
    "BIOMARKER_FOR", "CAUSES", "CITES", "CONTRIBUTES_TO", "DESCRIBES", "EXPRESSES",
    "HAS_REACTION", "HAS_SYMPTOM", "INCLUDES", "INTERACTS_WITH", "PRESCRIBED",
    "PRODUCES", "RECEIVED", "RESULTS_IN", "TREATS", "USED_FOR"]

In [4]:
prompt_template = '''
You are a medical researcher tasks with extracting information from papers 
and structuring it in a property graph to inform further medical and research Q&A.

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. the relationship direction goes from the start node to the end node. 


Return result as JSON using the following format:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity" }} }}],
  "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "Description of the relationship"}} }}] }}

- Use only the information from the Input text.  Do not add any additional information.  
- If the input text is empty, return empty Json. 
- Make sure to create as many nodes and relationships as needed to offer rich medical context for further research.
- An AI knowledge assistant must be able to read this graph and immediately understand the context to inform detailed research questions. 
- Multiple documents will be ingested from different sources and we are using this property graph to connect information, so make sure entity types are fairly general. 

Use only fhe following nodes and relationships (if provided):
{schema}

Assign a unique ID (string) to each node, and reuse it to define relationships.
Do respect the source and target node types for relationship and
the relationship direction.

Do not return any additional information other than the JSON in it.

Examples:
{examples}

Input text:

{text}
'''

In [5]:
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

kg_builder_pdf = SimpleKGPipeline(
    llm=ex_llm,
    driver=driver,
    text_splitter=FixedSizeSplitter(chunk_size=500, chunk_overlap=100),
    embedder=embedder,
    entities=node_labels,
    relations=rel_types,
    prompt_template=prompt_template,
    from_pdf=True
)

In [6]:
pdf_file_paths = ['/Users/bingranyou/Documents/GitHub_Mac_mini/KnoWhizTutor/input_files/PRXQuantum.5.020308.pdf', 
             '/Users/bingranyou/Documents/GitHub_Mac_mini/KnoWhizTutor/input_files/Multiplexed_single_photon_source_arXiv__resubmit_.pdf']

for path in pdf_file_paths:
    print(f"Processing : {path}")
    pdf_result = await kg_builder_pdf.run_async(file_path=path)
    print(f"Result: {pdf_result}")

Processing : /Users/bingranyou/Documents/GitHub_Mac_mini/KnoWhizTutor/input_files/PRXQuantum.5.020308.pdf


LLM response is not valid JSON ```json
{"nodes": [], "relationships": []}
``` for chunk_index=6
LLM response is not valid JSON ```json
{}
``` for chunk_index=7
LLM response is not valid JSON ```json
{"nodes": [], "relationships": []}
``` for chunk_index=9
LLM response is not valid JSON {"nodes":[],"relationships":[]}` for chunk_index=12
LLM response is not valid JSON ```json
{
  "nodes": [],
  "relationships": []
}
``` for chunk_index=17
LLM response is not valid JSON ```json
{}
``` for chunk_index=18
LLM response is not valid JSON ```json
{}
``` for chunk_index=19
LLM response is not valid JSON ```json
{"nodes":[{"id":"0","label":"Place","properties":{"name":"Innsbruck"}},{"id":"1","label":"Place","properties":{"name":"Austria"}},{"id":"2","label":"ArticleOrPaper","properties":{"name":"A three-qubit quantum network node based on trapped atomic ions"}},{"id":"3","label":"Date","properties":{"name":"7 August 2023"}},{"id":"4","label":"Date","properties":{"name":"23 December 2023"}},{"id

Result: run_id='1a53f1f2-4831-4aeb-81b1-aac074e84f19' result={'resolver': {'number_of_nodes_to_resolve': 572, 'number_of_created_nodes': 533}}
Processing : /Users/bingranyou/Documents/GitHub_Mac_mini/KnoWhizTutor/input_files/Multiplexed_single_photon_source_arXiv__resubmit_.pdf


LLM response is not valid JSON ```json
{}
``` for chunk_index=2
LLM response is not valid JSON ```json
{"nodes":[],"relationships":[]}
``` for chunk_index=4
LLM response is not valid JSON ```json
{}
``` for chunk_index=9
LLM response is not valid JSON ```json
{
  "nodes": [],
  "relationships": []
}
``` for chunk_index=15
LLM response is not valid JSON ```json
{"nodes":[{"id":"0","label":"Object","properties":{"name":"single photons"}},{"id":"1","label":"Object","properties":{"name":"nine-ion chain"}},{"id":"2","label":"Object","properties":{"name":"single-ion addressing beam"}},{"id":"3","label":"Object","properties":{"name":"entanglement generation"}},{"id":"4","label":"Object","properties":{"name":"quantum repeater nodes"}},{"id":"5","label":"Object","properties":{"name":"photon trains"}},{"id":"6","label":"Entity","properties":{"name":"second-order time correlation"}},{"id":"7","label":"Entity","properties":{"name":"motional excitation"}}],"relationships":[{"type":"ACTIVATES","star

Result: run_id='5285e35b-bf19-4cdb-aea2-b45dd4d1f151' result={'resolver': {'number_of_nodes_to_resolve': 692, 'number_of_created_nodes': 563}}


## Knowledge Graph Retrieval
We will leverage Neo4j's vector search capabilities here. To do this, we need to begin by creating a vector index on the text chunks from the PDFs, which are stored on Chunk nodes in our knowledge graph.

In [7]:
from neo4j_graphrag.indexes import create_vector_index

create_vector_index(driver, name="text_embeddings", label="Chunk",
                    embedding_property="embedding", dimensions=1536, similarity_fn="cosine")

In [8]:
from neo4j_graphrag.retrievers import VectorRetriever

vector_retriever = VectorRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    return_properties=["text"],
)

In [9]:
import json

vector_res = vector_retriever.get_search_results(query_text = "What is quantum network?", 
                                                 top_k=5)
for i in vector_res.records: print("====\n" + json.dumps(i.data(), indent=4))

====
{
    "node": {
        "text": "lementary net-\nworks consisting of two [ 3\u201311] and three [ 12] remote\nmatter qubits, distributed over distances up to 1.5 km [ 13].\nRecently, two atoms 400 m apart have been entangled over\na spooled 33 km-long \ufb01ber channel [ 14].\nA key requirement for long-distance quantum network-\ning is the ability to entangle a matter qubit with a photon\nand to distribute that photon over many tens of kilometers.\nThat ability has been demonstrated using a range of di\ufb00er-\nent systems including trapped ions [ 1"
    },
    "nodeLabels": [
        "__KGBuilder__",
        "Chunk"
    ],
    "elementId": "4:af306d37-25ed-46d8-8e80-660b7ce1d0d3:448",
    "id": "4:af306d37-25ed-46d8-8e80-660b7ce1d0d3:448",
    "score": 0.922149658203125
}
====
{
    "node": {
        "text": "orks consist of matter-based\nnodes for information processing and storage, which are\ninterconnected with photonic links for the establishment\nof entanglement between th

In [10]:
from neo4j_graphrag.retrievers import VectorCypherRetriever

vc_retriever = VectorCypherRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    retrieval_query="""
//1) Go out 2-3 hops in the entity graph and get relationships
WITH node AS chunk
MATCH (chunk)<-[:FROM_CHUNK]-()-[relList:!FROM_CHUNK]-{1,2}()
UNWIND relList AS rel

//2) collect relationships and text chunks
WITH collect(DISTINCT chunk) AS chunks, 
  collect(DISTINCT rel) AS rels

//3) format and return context
RETURN '=== text ===\n' + apoc.text.join([c in chunks | c.text], '\n---\n') + '\n\n=== kg_rels ===\n' +
  apoc.text.join([r in rels | startNode(r).name + ' - ' + type(r) + '(' + coalesce(r.details, '') + ')' +  ' -> ' + endNode(r).name ], '\n---\n') AS info
"""
)

In [11]:
vc_res = vc_retriever.get_search_results(query_text = "How is precision medicine applied to Lupus?", top_k=3)

# print output
kg_rel_pos = vc_res.records[0]['info'].find('\n\n=== kg_rels ===\n')
print("# Text Chunk Context:")
print(vc_res.records[0]['info'][:kg_rel_pos])
print("# KG Context From Relationships:")
print(vc_res.records[0]['info'][kg_rel_pos:])

# Text Chunk Context:
=== text ===
301 (2009)
9. D. Porras, J.I. Cirac, Phys. Rev. Lett. 92, 207901 (2004)
10. H. Schmitz, A. Friedenauer, C. Schneider, R. Matjeschk, M. En-
derlein, T. Huber, J. Glueckert, D. Porras, T. Schaetz, Appl. Phys.
B, Lasers Opt. 95, 195 (2009)
11. M. Yu, V . Dokas, in Proceedings of 34th European Microwave
Conference , vol. 2 (2004), pp. 989–992
12. J.C. Collingwood, J.W. White, J. Sci. Instrum. 44, 509 (1967)
13. W. Meyer, IEEE Trans. Microw. Theory Tech. 29, 240 (1981)
14. W.W. Macalpine, R.O. Schild
# KG Context From Relationships:


=== kg_rels ===
D. Porras - AUTHORED() -> Phys. Rev. Lett. 92, 207901 (2004)
---
D. Porras - AUTHORED() -> Appl. Phys. B, Lasers Opt. 95, 195 (2009)
---
T. Schaetz - AUTHORED() -> Appl. Phys. B, Lasers Opt. 95, 195 (2009)
---
J. Glueckert - AUTHORED() -> Appl. Phys. B, Lasers Opt. 95, 195 (2009)
---
T. Huber - AUTHORED() -> Appl. Phys. B, Lasers Opt. 95, 195 (2009)
---
M. Enderlein - AUTHORED() -> Appl. Phys. B, Lasers Opt. 9

## GraphRAG

In [None]:
import openai
import neo4j
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv('.env')

driver = neo4j.GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

ex_llm = AzureOpenAILLM(
    model_name=os.getenv("GRAPHRAG_LLM_MODEL"),
    azure_endpoint=os.getenv("GRAPHRAG_API_BASE"),  # update with your endpoint
    api_version=os.getenv("GRAPHRAG_API_VERSION"),  # update appropriate version
    api_key=os.getenv("GRAPHRAG_API_KEY"),  # api_key is optional and can also be set with OPENAI_API_KEY env var
)

#create text embedder
embedder = OpenAIEmbeddings()

In [12]:
from neo4j_graphrag.llm import AzureOpenAILLM
from neo4j_graphrag.generation import RagTemplate
from neo4j_graphrag.generation.graphrag import GraphRAG

llm = AzureOpenAILLM(
    model_name=os.getenv("GRAPHRAG_LLM_MODEL"),
    azure_endpoint=os.getenv("GRAPHRAG_API_BASE"),  # update with your endpoint
    api_version=os.getenv("GRAPHRAG_API_VERSION"),  # update appropriate version
    api_key=os.getenv("GRAPHRAG_API_KEY"),  # api_key is optional and can also be set with OPENAI_API_KEY env var
)

rag_template = RagTemplate(template='''Answer the Question using the following Context. Only respond with information mentioned in the Context. Do not inject any speculative information not mentioned. 

# Question:
{query_text}
 
# Context:
{context}

# Answer:
''', expected_inputs=['query_text', 'context'])

v_rag  = GraphRAG(llm=llm, retriever=vector_retriever, prompt_template=rag_template)
vc_rag = GraphRAG(llm=llm, retriever=vc_retriever, prompt_template=rag_template)

In [14]:
q = "What is multiplexing."
print(f"Vector Response: \n{v_rag.search(q, retriever_config={'top_k':5}).answer}")
print("\n===========================\n")
print(f"Vector + Cypher Response: \n{vc_rag.search(q, retriever_config={'top_k':5}).answer}")

Vector Response: 
Multiplexing is the process of combining multiple signals into a single channel to increase the attempt rate. It has become a mature technology for ensemble-based quantum interfaces in various systems, such as atomic gases and rare-earth ion-doped solid-state systems.


Vector + Cypher Response: 
Multiplexing, in the context provided, refers to a process that includes a temporal multiplexing scheme based on the transport of an ion-chain, which aims to improve the rate of ion-photon entanglement over long distances. The multiplexing process also involves ion-transport through a specific spatial location to maximize photon coupling efficiency.


In [16]:
q = "What is quatnum network"

v_rag_result = v_rag.search(q, retriever_config={'top_k': 5}, return_context=True)
vc_rag_result = vc_rag.search(q, retriever_config={'top_k': 5}, return_context=True)

print(f"Vector Response: \n{v_rag_result.answer}")
print("\n===========================\n")
print(f"Vector + Cypher Response: \n{vc_rag_result.answer}")

Vector Response: 
A quantum network is a system that facilitates the entanglement and communication between quantum bits (qubits) over distances. In the context provided, two main results were discussed regarding quantum networks: the achievement of entanglement between a matter qubit and a photonic qubit over a distance of 101 kilometers, and the demonstration of a multimoding enhancement using three trapped matter qubits in a network node. A key requirement for long-distance quantum networking is the ability to entangle a matter qubit with a photon and distribute that photon over long distances, which has been shown using various systems, including trapped ions.


Vector + Cypher Response: 
A quantum network is a system that utilizes quantum bits (qubits) and entanglement to facilitate communication and information processing across multiple nodes. In the context provided, it involves achieving entanglement between a matter qubit and a photonic qubit over a significant distance, spec

In [17]:
for i in v_rag_result.retriever_result.items: print(json.dumps(eval(i.content), indent=1))

{
 "text": "ned for the most advanced forms of a quantum\nnetwork [ 2].\nIn this paper, we present two main results. First, entan-\nglement between a matter qubit and a photonic qubit is\nachieved over a spooled 101-km-long \ufb01ber channel: twice\nthe distance of previous works (see, e.g., Refs. [ 15\u201318])\nand requiring a matter-qubit coherence time on the order of\nthe photon travel time (494 \u00b5s) to achieve. Second, using\nthree cotrapped matter qubits in the node, we demonstrate\na multimoding enhancement for the rat"
}
{
 "text": "lementary net-\nworks consisting of two [ 3\u201311] and three [ 12] remote\nmatter qubits, distributed over distances up to 1.5 km [ 13].\nRecently, two atoms 400 m apart have been entangled over\na spooled 33 km-long \ufb01ber channel [ 14].\nA key requirement for long-distance quantum network-\ning is the ability to entangle a matter qubit with a photon\nand to distribute that photon over many tens of kilometers.\nThat ability has been demon

In [18]:
vc_ls = vc_rag_result.retriever_result.items[0].content.split('\\n---\\n')
for i in vc_ls:
    if "biomarker" in i: print(i)

In [19]:
vc_ls = vc_rag_result.retriever_result.items[0].content.split('\\n---\\n')
for i in vc_ls:
    if "treat" in i: print(i)

In [21]:
q = "What is quantum network benefit from multiplexing?"
print(f"Vector Response: \n{v_rag.search(q, retriever_config={'top_k': 5}).answer}")
print("\n===========================\n")
print(f"Vector + Cypher Response: \n{vc_rag.search(q, retriever_config={'top_k': 5}).answer}")

Vector Response: 
The benefit of multiplexing in a quantum network is that it allows for the combination of multiple signals into a single channel, thereby increasing the attempt rate. This is particularly useful in enhancing the modest entangling rates of existing long-distance quantum networking approaches.


Vector + Cypher Response: 
The context does not provide specific details about the benefits of multiplexing in quantum networks. However, it does mention multiplexed telecommunication-band quantum networking with atom arrays in optical cavities and multiplexed quantum repeaters based on dual-species trapped-ion systems. For the specific benefits of multiplexing in quantum networks, further information from additional sources would be required.
