# GraphRAG test

### API Overview

This notebook provides a demonstration of how to interact with graphrag as a library using the API as opposed to the CLI. Note that graphrag's CLI actually connects to the library through this API for all operations.

In [None]:
import os
import yaml
import graphrag.api as api

from graphrag.index.typing import PipelineRunResult
from pathlib import Path
from graphrag.cli.initialize import initialize_project_at
from pipeline.api_handler import create_env_file

path = "./embedded_content/2ae2c8677b8a152a9c6ffff5beb7beda/" + "graphrag/"
# Create the folder if it does not exist
os.makedirs(path, exist_ok=True)

create_env_file(path)
try:
    # Call the function with the desired path
    initialize_project_at(Path(path))
except Exception as e:
    print(e)

settings = yaml.safe_load(open("./pipeline/graphrag_settings.yaml"))




Output()

In [None]:
from graphrag.config.create_graphrag_config import create_graphrag_config

graphrag_config = create_graphrag_config(
    values=settings, root_dir=path
)

In [None]:
import os
import fitz  # PyMuPDF

# Create output directory if it doesn't exist
os.makedirs(path+'input', exist_ok=True)

# Get all PDF files in the input_files directory
pdf_files = [f for f in os.listdir('./input_files') if f.endswith('.pdf')]

# Convert each PDF to a text file
for i, pdf_file in enumerate(pdf_files, start=1):
    pdf_path = os.path.join('./input_files', pdf_file)
    txt_path = os.path.join(path+'input', f'book {i}.txt')
    
    # Open the PDF file
    with fitz.open(pdf_path) as pdf_document:
        text = ""
        for page in pdf_document:
            text += page.get_text()
    
    # Save the text to a file
    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

In [None]:
index_result: list[PipelineRunResult] = await api.build_index(config=graphrag_config)

# index_result is a list of workflows that make up the indexing pipeline that was run
for workflow_result in index_result:
    status = f"error\n{workflow_result.errors}" if workflow_result.errors else "success"
    print(f"Workflow Name: {workflow_result.workflow}\tStatus: {status}")

Output()

In [None]:
import pandas as pd

final_nodes = pd.read_parquet("./ragtest/output/create_final_nodes.parquet")
final_entities = pd.read_parquet(
    "./ragtest/output/create_final_entities.parquet"
)
final_communities = pd.read_parquet(
    "./ragtest/output/create_final_communities.parquet"
)
final_community_reports = pd.read_parquet(
    "./ragtest/output/create_final_community_reports.parquet"
)

response, context = await api.global_search(
    config=graphrag_config,
    nodes=final_nodes,
    entities=final_entities,
    communities=final_communities,
    community_reports=final_community_reports,
    community_level=10,
    dynamic_community_selection=False,
    response_type="Multiple Paragraphs",
    query="what does the term 'multiplexing' mean in this quantum network experiment?",
)

print(response)

creating llm client with {'api_key': 'REDACTED,len=32', 'type': "azure_openai_chat", 'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 1.0, 'n': 1, 'request_timeout': 180.0, 'api_base': 'https://knowhiz-service-openai.openai.azure.com/', 'api_version': '2024-02-15-preview', 'organization': None, 'proxy': None, 'audience': None, 'deployment_name': 'gpt-4o', 'model_supports_json': True, 'tokens_per_minute': 0, 'requests_per_minute': 0, 'max_retries': 10, 'max_retry_wait': 10.0, 'sleep_on_rate_limit_recommendation': True, 'concurrent_requests': 25}
### Understanding 'Multiplexing' in the Quantum Network Experiment

The term 'multiplexing' in the context of the quantum network experiment refers to the demonstration of a **temporally multiplexed ion-photon interface**. This involves the rapid transport of a chain of nine calcium ions, showcasing the university's capabilities in cutting-edge quantum research [Data: Reports (9)].

### Key Aspects of the Experiment

#### Temp

### [Global]

In [83]:
import os
from dotenv import load_dotenv
load_dotenv(".env")
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType

from graphrag.query.indexer_adapters import (
    read_indexer_communities,
    read_indexer_entities,
    read_indexer_reports,
)

from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)

from graphrag.query.structured_search.global_search.search import GlobalSearch

# llm = get_llm('advance', para)

api_key = os.getenv("GRAPHRAG_API_KEY")
llm_model = os.getenv("GRAPHRAG_LLM_MODEL")
api_base = os.getenv("GRAPHRAG_API_BASE")
api_version = os.getenv("GRAPHRAG_API_VERSION")

llm = ChatOpenAI(
    api_key=api_key,
    api_base=api_base,
    api_version=api_version,
    model=llm_model,
    api_type=OpenaiApiType.AzureOpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.encoding_for_model(llm_model)

#### Load community reports as context for global search

- Load all community reports in the `create_final_community_reports` table from the GraphRAG, to be used as context data for global search.
- Load entities from the `create_final_nodes` and `create_final_entities` tables from the GraphRAG, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)
- Load all communities in the `create_final_communites` table from the GraphRAG, to be used to reconstruct the community graph hierarchy for dynamic community selection.

In [84]:
# parquet files generated from indexing pipeline
INPUT_DIR = "./ragtest/output/"
COMMUNITY_TABLE = "create_final_communities"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"

# community level in the Leiden community hierarchy from which we will load the community reports
# higher value means we use reports from more fine-grained communities (at the cost of higher computation cost)
COMMUNITY_LEVEL = 2

community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

communities = read_indexer_communities(community_df, entity_df, report_df)
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

print(f"Total report count: {len(report_df)}")
print(
    f"Report count after filtering by community level {COMMUNITY_LEVEL}: {len(reports)}"
)

report_df.head()

Total report count: 11
Report count after filtering by community level 2: 9


Unnamed: 0,id,human_readable_id,community,level,title,summary,full_content,rank,rank_explanation,findings,full_content_json,period,size
0,763396c0-1c96-48cf-a090-def7b3666a86,6,6,1,Quantum Research Community: Q.W and Collaborators,The Quantum Research Community is centered aro...,# Quantum Research Community: Q.W and Collabor...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'Q.W is a central figure in t...,"{\n ""title"": ""Quantum Research Community: Q...",2024-12-14,35.0
1,c5fd86a2-988d-40ff-bd60-5196fe0f7d48,7,7,1,Research Contributors at Berkeley,The community revolves around the city of Berk...,# Research Contributors at Berkeley\n\nThe com...,7.5,The impact severity rating is high due to the ...,[{'explanation': 'Berkeley is a central entity...,"{\n ""title"": ""Research Contributors at Berk...",2024-12-14,6.0
2,a9926229-092c-4676-81d2-e007d749d2d2,8,8,1,Lawrence Berkeley National Laboratory and Affi...,The community is centered around the Lawrence ...,# Lawrence Berkeley National Laboratory and Af...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'Lawrence Berkeley National L...,"{\n ""title"": ""Lawrence Berkeley National La...",2024-12-14,5.0
3,3ed9a6a9-2478-44e1-b2ac-cdc8fa6ccd12,9,9,1,"University of California, Berkeley and Tempora...",The community centers around the University of...,"# University of California, Berkeley and Tempo...",8.5,The impact severity rating is high due to the ...,[{'explanation': 'The University of California...,"{\n ""title"": ""University of California, Ber...",2024-12-14,7.0
4,cfd10249-ee8e-4035-a52b-b7e99902d214,-1,-1,0,Quantum Network Research Community,The community is centered around a group of re...,# Quantum Network Research Community\n\nThe co...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'The community includes sever...,"{\n ""title"": ""Quantum Network Research Comm...",,


In [85]:
context_builder = GlobalCommunityContext(
    community_reports=reports,
    communities=communities,
    entities=entities,  # default to None if you don't want to use community weights for ranking
    token_encoder=token_encoder,
)

In [86]:
context_builder_params = {
    "use_community_summary": False,  # False means using full community reports. True means using community short summaries.
    "shuffle_data": True,
    "include_community_rank": True,
    "min_community_rank": 0,
    "community_rank_name": "rank",
    "include_community_weight": True,
    "community_weight_name": "occurrence weight",
    "normalize_community_weight": True,
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
    "context_name": "Reports",
}

map_llm_params = {
    "max_tokens": 1000,
    "temperature": 0.0,
    "response_format": {"type": "json_object"},
}

reduce_llm_params = {
    "max_tokens": 2000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
    "temperature": 0.0,
}

In [87]:
search_engine = GlobalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    max_data_tokens=12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    allow_general_knowledge=False,  # set this to True will add instruction to encourage the LLM to incorporate general knowledge in the response, which may increase hallucinations, but could be useful in some use cases.
    json_mode=True,  # set this to False if your LLM model does not support JSON mode.
    context_builder_params=context_builder_params,
    concurrent_coroutines=32,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [91]:
result = await search_engine.asearch(
    "How is multiplexing used in quantum network?"
)

print(result.response)

### Multiplexing in Quantum Networks

Multiplexing in quantum networks represents a significant technological advancement, particularly in the context of the temporally multiplexed ion-photon interface. This innovative interface involves the rapid transport of a chain of nine calcium ions, showcasing the capabilities in cutting-edge quantum research at the University of California, Berkeley, and the Lawrence Berkeley National Laboratory [Data: Reports (9, 12, 15)].

### Technological Advancements

The temporally multiplexed ion-photon interface is a breakthrough that allows for the efficient and rapid transport of quantum information. By using a chain of nine calcium ions, researchers can achieve high-speed data transmission and processing, which is crucial for the development of scalable quantum networks. This advancement highlights the potential for significant improvements in the speed and reliability of quantum communication systems.

### Collaborative Efforts

The demonstration of

In [92]:
# inspect the data used to build the context for the LLM responses
result.context_data["reports"]

Unnamed: 0,id,title,occurrence weight,content,rank
0,-1,Quantum Network Research Community,1.0,# Quantum Network Research Community\n\nThe co...,8.5
1,6,Quantum Research Community: Q.W and Collaborators,0.571429,# Quantum Research Community: Q.W and Collabor...,8.5
2,1,Quantum Networking and Key Researchers,0.428571,# Quantum Networking and Key Researchers\n\nTh...,8.5
3,2,Quantum Networking Research Community,0.428571,# Quantum Networking Research Community\n\nThe...,8.5
4,0,Quantum Information Science Community: Key Con...,0.428571,# Quantum Information Science Community: Key C...,8.5
5,4,Quantum Network of Clocks Research Community,0.285714,# Quantum Network of Clocks Research Community...,8.5
6,7,Research Contributors at Berkeley,0.285714,# Research Contributors at Berkeley\n\nThe com...,7.5
7,9,"University of California, Berkeley and Tempora...",0.142857,"# University of California, Berkeley and Tempo...",8.5
8,8,Lawrence Berkeley National Laboratory and Affi...,0.142857,# Lawrence Berkeley National Laboratory and Af...,8.5


In [93]:
# inspect number of LLM calls and tokens
print(
    f"LLM calls: {result.llm_calls}. Prompt tokens: {result.prompt_tokens}. Output tokens: {result.output_tokens}."
)

LLM calls: 2. Prompt tokens: 8332. Output tokens: 542.


### [Local]

In [94]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

In [98]:
# parquet files generated from indexing pipeline
INPUT_DIR = "./ragtest/output"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 457


Unnamed: 0,id,human_readable_id,title,community,level,degree,x,y
0,2fe265960def4a6ab3170469170b2e3d,0,BINGRAN YOU,5,0,2,0,0
1,741883dcdf4f4b79897b9badfd4d06a6,1,QIMING WU,5,0,2,0,0
2,64852639734847b6b64a19b40df52c22,2,DAVID MIRON,5,0,2,0,0
3,c9f484bb4200494c85c171b13f67d99d,3,WENJUN KE,5,0,1,0,0
4,493528c7977a46f6b0a34d5f2e1fec96,4,INDER MONGA,5,0,1,0,0


In [99]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 194


Unnamed: 0,id,human_readable_id,source,target,description,weight,combined_degree,text_unit_ids
0,2c2bd3b716f14f0182c52fa7aeb7d845,0,BINGRAN YOU,"UNIVERSITY OF CALIFORNIA, BERKELEY",Bingran You is affiliated with the University ...,9.0,11,[eff0b84128ab194a0f130b1e42161d03]
1,774b58624a5e46299f39b169a806d960,1,BINGRAN YOU,LAWRENCE BERKELEY NATIONAL LABORATORY,Bingran You is affiliated with the Lawrence Be...,9.0,11,[eff0b84128ab194a0f130b1e42161d03]
2,371f541012d2480bbb44920a10827fe6,2,QIMING WU,"UNIVERSITY OF CALIFORNIA, BERKELEY",Qiming Wu is affiliated with the University of...,9.0,11,[eff0b84128ab194a0f130b1e42161d03]
3,c50700aad88245dc923aa428b5c0ea93,3,QIMING WU,LAWRENCE BERKELEY NATIONAL LABORATORY,Qiming Wu is affiliated with the Lawrence Berk...,9.0,11,[eff0b84128ab194a0f130b1e42161d03]
4,dd060b313ed54cdc8fc18fcb185314a1,4,DAVID MIRON,"UNIVERSITY OF CALIFORNIA, BERKELEY",David Miron is affiliated with the University ...,9.0,11,[eff0b84128ab194a0f130b1e42161d03]


In [100]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

claims = read_indexer_covariates(covariate_df)

print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}

FileNotFoundError: [Errno 2] No such file or directory: './ragtest/output/create_final_covariates.parquet'

### CLI Overview

In [2]:
# !pip install graphrag

In [13]:
!mkdir -p ./ragtest/input

In [14]:
!curl https://www.gutenberg.org/cache/epub/24022/pg24022.txt -o ./ragtest/input/book.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  184k  100  184k    0     0   403k      0 --:--:-- --:--:-- --:--:--  404k


In [15]:
!graphrag init --root ./ragtest

[2KInitializing project at 
[35m/Users/bingranyou/Documents/GitHub_Mac_mini/DeepTutor/[0m[95mragtest[0m
⠋ GraphRAG Indexer 

In [16]:
!graphrag index --root ./ragtest


[2KLogging enabled at r 
[35m/Users/bingranyou/Documents/GitHub_Mac_mini/DeepTutor/ragtest/logs/[0m[95mindexing-e[0m
[95mngine.log[0m
[2K⠋ GraphRAG Indexer 
[2K[1A[2K⠋ GraphRAG Indexer e.text) - 1 files loaded (0 filtered) [90m━[0m [35m100%[0m [36m…[0m [33m0…[0m
├── Loading Input (InputFileType.text) - 1 files loaded (0 filtered) [90m━[0m [35m100%[0m [36m…[0m [33m0…[0m
[2K[1A[2K[1A[2K⠹ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 1 files loaded (0 filtered) [90m━[0m [35m100%[0m [36m…[0m [33m0…[0m
[2K[1A[2K[1A[2K⠇ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 1 files loaded (0 filtered) [90m━[0m [35m100%[0m [36m…[0m [33m0…[0m
[2K[1A[2K[1A[2K🚀 [32mcreate_base_text_units[0m
⠇ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 1 files loaded (0 filtered) [90m━[0m [35m100%[0m [36m…[0m [33m0…[0m
[2K[1A[2K[1A[2KEmpty DataFrame
Columns: [1m[[0m[1m][0m
Index: [1m[[0m[1m][0m
⠇ Gra

In [17]:
!graphrag query \
--root ./ragtest \
--method global \
--query "What are the top themes in this story?"



creating llm client with {'api_key': 'REDACTED,len=32', 'type': "azure_openai_chat", 'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 1.0, 'n': 1, 'request_timeout': 180.0, 'api_base': 'https://knowhiz-service-openai.openai.azure.com/', 'api_version': '2024-02-15-preview', 'organization': None, 'proxy': None, 'audience': None, 'deployment_name': 'gpt-4o', 'model_supports_json': True, 'tokens_per_minute': 0, 'requests_per_minute': 0, 'max_retries': 10, 'max_retry_wait': 10.0, 'sleep_on_rate_limit_recommendation': True, 'concurrent_requests': 25}

SUCCESS: Global Search Response:
### Top Themes in the Story

#### Transformation and Redemption
The central theme of the story is the transformation and redemption of Ebenezer Scrooge. Initially depicted as a miserly, uncharitable, and solitary man, Scrooge undergoes a significant transformation to become generous and kind-hearted. This change is catalyzed by his interactions with various supernatural entities, including t

# Neo4j GraphRAG test

In [19]:
from dotenv import load_dotenv
import os

# load neo4j credentials (and openai api key in background).
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

#uncomment this line if you aren't using a .env file
# os.environ['OPENAI_API_KEY'] = 'copy_paste_the_openai_key_here'

In [25]:
import neo4j
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv('.env')

# Set the OpenAI API key
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

driver = neo4j.GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

ex_llm=OpenAILLM(
    model_name="gpt-4o-mini",
    model_params={
        "response_format": {"type": "json_object"}, # use json_object formatting for best results
        "temperature": 0 # turning temperature down for more deterministic results
    }
)

#create text embedder
embedder = OpenAIEmbeddings()

In [26]:
#define node labels
basic_node_labels = ["Object", "Entity", "Group", "Person", "Organization", "Place"]

academic_node_labels = ["ArticleOrPaper", "PublicationOrJournal"]

medical_node_labels = ["Anatomy", "BiologicalProcess", "Cell", "CellularComponent", 
                       "CellType", "Condition", "Disease", "Drug",
                       "EffectOrPhenotype", "Exposure", "GeneOrProtein", "Molecule",
                       "MolecularFunction", "Pathway"]

node_labels = basic_node_labels + academic_node_labels + medical_node_labels

# define relationship types
rel_types = ["ACTIVATES", "AFFECTS", "ASSESSES", "ASSOCIATED_WITH", "AUTHORED",
    "BIOMARKER_FOR", "CAUSES", "CITES", "CONTRIBUTES_TO", "DESCRIBES", "EXPRESSES",
    "HAS_REACTION", "HAS_SYMPTOM", "INCLUDES", "INTERACTS_WITH", "PRESCRIBED",
    "PRODUCES", "RECEIVED", "RESULTS_IN", "TREATS", "USED_FOR"]

In [22]:
prompt_template = '''
You are a medical researcher tasks with extracting information from papers 
and structuring it in a property graph to inform further medical and research Q&A.

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. the relationship direction goes from the start node to the end node. 


Return result as JSON using the following format:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity" }} }}],
  "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "Description of the relationship"}} }}] }}

- Use only the information from the Input text.  Do not add any additional information.  
- If the input text is empty, return empty Json. 
- Make sure to create as many nodes and relationships as needed to offer rich medical context for further research.
- An AI knowledge assistant must be able to read this graph and immediately understand the context to inform detailed research questions. 
- Multiple documents will be ingested from different sources and we are using this property graph to connect information, so make sure entity types are fairly general. 

Use only fhe following nodes and relationships (if provided):
{schema}

Assign a unique ID (string) to each node, and reuse it to define relationships.
Do respect the source and target node types for relationship and
the relationship direction.

Do not return any additional information other than the JSON in it.

Examples:
{examples}

Input text:

{text}
'''

In [24]:
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

kg_builder_pdf = SimpleKGPipeline(
    llm=ex_llm,
    driver=driver,
    text_splitter=FixedSizeSplitter(chunk_size=500, chunk_overlap=100),
    embedder=embedder,
    entities=node_labels,
    relations=rel_types,
    prompt_template=prompt_template,
    from_pdf=True
)

In [None]:
pdf_file_paths = ['/Users/bingranyou/Documents/GitHub_Mac_mini/DeepTutor/input_files/test_1.pdf', 
             '/Users/bingranyou/Documents/GitHub_Mac_mini/DeepTutor/input_files/test.pdf']

for path in pdf_file_paths:
    print(f"Processing : {path}")
    pdf_result = await kg_builder_pdf.run_async(file_path=path)
    print(f"Result: {pdf_result}")

Processing : /Users/bingranyou/Documents/GitHub_Mac_mini/DeepTutor/test_inputs/test_1.pdf
Result: run_id='e3c69b8d-0315-4b93-ba6f-a0ee00874338' result={'resolver': {'number_of_nodes_to_resolve': 135, 'number_of_created_nodes': 123}}
Processing : /Users/bingranyou/Documents/GitHub_Mac_mini/DeepTutor/test_inputs/test.pdf
Result: run_id='bc8594ed-dd22-4bca-8b3c-d7e96a89526a' result={'resolver': {'number_of_nodes_to_resolve': 484, 'number_of_created_nodes': 426}}


## Knowledge Graph Retrieval
We will leverage Neo4j's vector search capabilities here. To do this, we need to begin by creating a vector index on the text chunks from the PDFs, which are stored on Chunk nodes in our knowledge graph.

In [27]:
from neo4j_graphrag.indexes import create_vector_index

create_vector_index(driver, name="text_embeddings", label="Chunk",
                    embedding_property="embedding", dimensions=1536, similarity_fn="cosine")

In [28]:
from neo4j_graphrag.retrievers import VectorRetriever

vector_retriever = VectorRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    return_properties=["text"],
)

In [29]:
import json

vector_res = vector_retriever.get_search_results(query_text = "What is quantum network?", 
                                                 top_k=5)
for i in vector_res.records: print("====\n" + json.dumps(i.data(), indent=4))

====
{
    "node": {
        "text": ").\n[6] J. P. Covey, H. Weinfurter, and H. Bernien, Quantum\nnetworks with neutral atom processing nodes, npj Quan-\ntum Information 9, 90 (2023).\n[7] P. Komar, E. M. Kessler, M. Bishof, L. Jiang, A. S.\nS\u00f8rensen, J. Ye, and M. D. Lukin, A quantum network\nof clocks, Nature Physics 10, 582 (2014).\n[8] B.Nichol, R.Srinivas, D.Nadlinger, P.Drmota, D.Main,\nG. Araneda, C. Ballance, and D. Lucas, An elementary\nquantum network of entangled optical atomic clocks, Na-\nture609, 689 (2022).\n[9] X. Guo, C. R. B"
    },
    "nodeLabels": [
        "__KGBuilder__",
        "Chunk"
    ],
    "elementId": "4:39b1bc3a-95cb-4e15-9a04-c279d2af3dad:302",
    "id": "4:39b1bc3a-95cb-4e15-9a04-c279d2af3dad:302",
    "score": 0.9178924560546875
}
====
{
    "node": {
        "text": "J. P. Covey,\nMultiplexed telecommunication-band quantum network-\ning with atom arrays in optical cavities, Physical Review\nResearch 3, 043154 (2021).\n[28] Y. Li and J. Thomps

In [30]:
from neo4j_graphrag.retrievers import VectorCypherRetriever

vc_retriever = VectorCypherRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    retrieval_query="""
//1) Go out 2-3 hops in the entity graph and get relationships
WITH node AS chunk
MATCH (chunk)<-[:FROM_CHUNK]-()-[relList:!FROM_CHUNK]-{1,2}()
UNWIND relList AS rel

//2) collect relationships and text chunks
WITH collect(DISTINCT chunk) AS chunks, 
  collect(DISTINCT rel) AS rels

//3) format and return context
RETURN '=== text ===\n' + apoc.text.join([c in chunks | c.text], '\n---\n') + '\n\n=== kg_rels ===\n' +
  apoc.text.join([r in rels | startNode(r).name + ' - ' + type(r) + '(' + coalesce(r.details, '') + ')' +  ' -> ' + endNode(r).name ], '\n---\n') AS info
"""
)