In [1]:
import neo4j
from neo4j_graphrag.llm import AzureOpenAILLM  
from neo4j_graphrag.embeddings.openai import AzureOpenAIEmbeddings 
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
from neo4j_graphrag.retrievers import VectorRetriever
from neo4j_graphrag.generation.graphrag import GraphRAG 
from dotenv import load_dotenv
import os 


# Load variables from a .env file into the process environment so the
# os.getenv(...) lookups in later cells can find the Neo4j / Azure OpenAI settings.
load_dotenv()

True

In [2]:
# Open the Neo4j driver; the URI and the (username, password) pair are read from the environment.
neo4j_driver = neo4j.GraphDatabase.driver(
    os.getenv('NEO4J_URI_ONLINE'),
    auth=(
        os.getenv('NEO4J_USERNAME_ONLINE'),
        os.getenv('NEO4J_PASSWORD_ONLINE'),
    ),
)

In [3]:
# Azure OpenAI chat model shared by the KG builder and the GraphRAG objects below.
# Every connection setting is read from the environment (populated by load_dotenv()).
llm = AzureOpenAILLM(
    api_key=os.getenv('AZURE_OPENAI_API_KEY_2'),
    api_version=os.getenv('AZURE_OpenAI_API_VERSION_2'),
    azure_endpoint=os.getenv('AZURE_OpenAI_ENDPOINT_2'),
    model_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_MODEL_2'),
)

# Smoke test of the credentials; as the last expression its result becomes the cell output.
llm.invoke('say something sarcastic')

LLMGenerationError: Error code: 401 - {'statusCode': 401, 'message': 'Unauthorized. Access token is missing, invalid, audience is incorrect (https://cognitiveservices.azure.com), or have expired.'}

In [4]:
# Call the underlying Azure OpenAI client directly, asking for a JSON-style reply.
# The deployment name is taken from the configured `llm` rather than hard-coded, so this
# request targets the same Azure deployment that `llm.invoke(...)` uses.
response = llm.client.chat.completions.create(
    model=llm.model_name,
    # response_format={'type': 'json_object'},  # enable only if the deployment supports JSON mode
    messages=[
        {'role': 'system', 'content': 'you are sarcastic batman designed to give response in JSON '},
        {'role': 'user', 'content': 'who is joker'},
    ],
)
print(response.choices[0].message.content)

AuthenticationError: Error code: 401 - {'statusCode': 401, 'message': 'Unauthorized. Access token is missing, invalid, audience is incorrect (https://cognitiveservices.azure.com), or have expired.'}

In [5]:
# Azure OpenAI embedding client; it reuses the API key and API version variables of the
# chat model but reads its own endpoint variable.
embeddings = AzureOpenAIEmbeddings(
    api_key=os.getenv('AZURE_OPENAI_API_KEY_2'),
    api_version=os.getenv('AZURE_OpenAI_API_VERSION_2'),
    azure_endpoint=os.getenv('AZURE_OPENAI_EMBEDDINGS_ENDPOINT_2'),
)

In [6]:
# Graph schema passed to the KG builder: the allowed node labels and relationship types.
#
# Alternative label sets kept for reference (not active):
#   basic    : "Object", "Entity", "Group", "Person", "Organization", "Place"
#   academic : "ArticleOrPaper", "PublicationOrJournal"
#   medical  : "Anatomy", "BiologicalProcess", "Cell", "CellularComponent",
#              "CellType", "Condition", "Disease", "Drug",
#              "EffectOrPhenotype", "Exposure", "GeneOrProtein", "Molecule",
#              "MolecularFunction", "Pathway"
#
# Alternative relationship types kept for reference (not active):
#   "ACTIVATES", "AFFECTS", "ASSESSES", "ASSOCIATED_WITH", "AUTHORED",
#   "BIOMARKER_FOR", "CAUSES", "CITES", "CONSTRIBUTES_TO", "DESCRIBES", "EXPRESSES",
#   "HAS_REACTION", "HAS_SYMPTOM", "INCLUDES", "INTERACTS_WITH",
#   "PRESCRIBED", "PRODUCES", "RECEIVED", "RESULTS_IN", "TREATS", "USED_FOR"

# `nodes` and `node_labels` are two names bound to the same list object.
node_labels = nodes = [
    'Person',
    'Achievement',
    'Organization',
    'Event',
    'Relationship',
]

# Relationship types the extraction is restricted to.
rel_types = [
    'Collaborated with',
    'Founded',
    'won',
    'achieved',
    'contributed',
    'studied',
    'worked',
    'invested',
    'graduated',
]

In [7]:
# prompt template
# Extraction prompt handed to the KG builder. The {schema}, {examples} and {text}
# placeholders are filled in later; literal JSON braces are doubled so they survive
# str.format-style substitution.
# Fixes relative to the previous wording: typos ("tasks with", "fhe", "Json") and a
# leftover "medical" instruction that contradicted the Person/Organization schema.
prompt_template = '''
You are a network builder tasked with extracting information from papers
and structuring it in a property graph to inform further networking Q&A.

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. The relationship direction goes from the start node to the end node.


Return result as JSON using the following format:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity" }} }}],
  "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "Description of the relationship"}} }}] }}

- Use only the information from the Input text. Do not add any additional information.
- If the input text is empty, return empty JSON.
- Make sure to create as many nodes and relationships as needed to offer rich context for further research.
- An AI knowledge assistant must be able to read this graph and immediately understand the context to inform detailed research questions.
- Multiple documents will be ingested from different sources and we are using this property graph to connect information, so make sure entity types are fairly general.

Use only the following nodes and relationships (if provided):
{schema}

Assign a unique ID (string) to each node, and reuse it to define relationships.
Do respect the source and target node types for relationship and
the relationship direction.

Do not return any additional information other than the JSON in it.

Examples:
{examples}

Input text:

{text}
'''

# KG builder pipeline

In [8]:
# 1. Build KG and store it in the Neo4j database
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

# Knowledge-graph pipeline wired to the driver, LLM, embedder, schema lists and prompt
# defined in the earlier cells. from_pdf=False because plain text (not a PDF path) is
# what gets passed to run_async(text=...) later on.
kg_builder_pdf = SimpleKGPipeline(
    driver=neo4j_driver,
    llm=llm,
    embedder=embeddings,
    # text_splitter=FixedSizeSplitter(chunk_size=1000, chunk_overlap=250),
    entities=node_labels,
    relations=rel_types,
    prompt_template=prompt_template,
    from_pdf=False,
)


In [9]:
import os 

In [10]:
# Move the working directory up one level; the relative 'input' folder is listed from
# there in a later cell.
# NOTE(review): not idempotent — every re-run of this cell climbs one more directory level.
os.chdir('../')
%pwd 

'd:\\pythonProjects\\KAG_Testing'

In [11]:
path = 'input'

# Read every regular file in `path` and merge the contents into one string for the KG builder.
#  - sorted(): os.listdir() order is arbitrary; sorting makes text_data reproducible.
#  - os.path.join(path, ...): the directory used to be hard-coded as 'input' here, so
#    changing `path` altered the listing but not the files actually opened.
#  - encoding='utf-8': without it open() uses the platform default, which can fail on or
#    garble non-ASCII text. Assumes the inputs are UTF-8 — adjust if they are not.
#  - '\n'.join(): stops the last word of one file fusing with the first word of the next.
files = sorted(os.listdir(path))
documents = []
for file in files:
    file_path = os.path.join(path, file)
    if not os.path.isfile(file_path):
        continue  # sub-directories cannot be read with open()
    with open(file_path, 'r', encoding='utf-8') as f:
        documents.append(f.read())
text_data = '\n'.join(documents)
print(type(text_data))


<class 'str'>


In [12]:
# path to files 
import time 
try:
    print(f'Processing: {path}')
    pdf_results=await kg_builder_pdf.run_async(text=text_data)
    print(f'Result: {pdf_results}')
except Exception as e:
    print(e)
    errordata=e.args[0] 
    if 'rate limit' in errordata:
        time.sleep(60)

    

Processing: input


LLM response has improper format for chunk_index=7
LLM response has improper format for chunk_index=32
LLM response has improper format for chunk_index=36
LLM response has improper format for chunk_index=33
LLM response is not valid JSON for chunk_index=63
LLM response has improper format for chunk_index=119
LLM response has improper format for chunk_index=109


neo4j_graphrag.experimental.components.types.Neo4jGraph() argument after ** must be a mapping, not list


LLM response has improper format for chunk_index=144
LLM response has improper format for chunk_index=160
LLM response has improper format for chunk_index=209
LLM response has improper format for chunk_index=228
LLM response has improper format for chunk_index=266
LLM response has improper format for chunk_index=276


In [13]:
# Show the result object assigned by the KG pipeline run in the previous cell.
print(pdf_results)

NameError: name 'pdf_results' is not defined

In [None]:
import time
import asyncio

async def process_pdf(text_data, max_retries=3, retry_delay=60):
    """Run the KG builder on ``text_data``, retrying a bounded number of times on rate limits.

    Args:
        text_data: Text passed to ``kg_builder_pdf.run_async(text=...)``.
        max_retries: How many additional attempts are made after a rate-limit error
            before giving up.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The value returned by ``run_async`` on success. ``None`` when the run fails with
        a non rate-limit error or the retries are exhausted (the error is printed).
    """
    retries_used = 0
    while True:
        try:
            print('Processing...')
            pdf_results = await kg_builder_pdf.run_async(text=text_data)
            print(f'Result: {pdf_results}')
            return pdf_results
        except Exception as e:
            print(f'Error: {e}')
            # Case-insensitive so "Rate limit ..." style messages are recognised too.
            errordata = str(e).lower()
            if 'rate limit' not in errordata or retries_used >= max_retries:
                return None
            retries_used += 1
            print(f"Rate limit hit, retrying after {retry_delay} seconds...")
            # asyncio.sleep yields to the event loop; time.sleep would block it for the
            # whole delay. A loop replaces the unbounded recursive retry.
            await asyncio.sleep(retry_delay)

# Jupyter already runs an event loop, so the coroutine is awaited at top level;
# asyncio.run() cannot be called from inside a running loop.
# text_data = "Your text data here"  # Replace with actual data
await process_pdf(text_data)  # Directly await instead of using asyncio.run()


Processing...
Error: Connection error.


# Retrieve data from the knowledge graph
                

# Vector retriever

ANN (Approximate nearest neighbor)

In [17]:
from neo4j_graphrag.indexes import create_vector_index

# Create the 'text_embeddings' vector index on the `embedding` property of `Chunk` nodes;
# the retrievers below refer to it by name.
create_vector_index(
    neo4j_driver,
    name='text_embeddings',
    label='Chunk',
    embedding_property='embedding',
    dimensions=1536,
    similarity_fn='cosine',
)



In [18]:
from neo4j_graphrag.retrievers import VectorRetriever

# Retriever over the 'text_embeddings' index; queries are embedded with `embeddings`
# and only the `text` property is requested for each hit.
vector_retriever = VectorRetriever(
    neo4j_driver,
    embedder=embeddings,
    index_name='text_embeddings',
    return_properties=['text'],
)

In [19]:
# run the retriever 
import json 
from neo4j_graphrag.generation.graphrag import GraphRAG

rag=GraphRAG(llm=llm,retriever=vector_retriever)

# vector_res=vector_retriever.get_search_results(query_text='give me details about the cat', top_k=3)

# for i in vector_res.records:
#     print(i)

response=rag.search('who is connected to the Amazon?')
print(response.answer)

Several countries in South America are connected to the Amazon River, including Brazil, Peru, Colombia, Venezuela, Ecuador, Bolivia, Guyana, and Suriname.


In [20]:
import json

# Fetch the top 3 hits for the query and print each record as pretty-printed JSON
# under a "====" separator line.
vector_res = vector_retriever.get_search_results(
    query_text='how is elon related to Jeff bezos',
    top_k=3,
)

for record in vector_res.records:
    print("====", json.dumps(record.data(), indent=4), sep="\n")

In [21]:
# Vector + Cypher retriever.
# Uses the same 'text_embeddings' index and embedder as the plain vector retriever, then
# runs `retrieval_query` on the matched nodes (exposed to the query as `node`):
#   1) from each matched chunk, follow FROM_CHUNK back to the attached nodes and walk
#      1 to 2 further relationships of any type other than FROM_CHUNK;
#   2) de-duplicate the chunks and the relationships that were traversed;
#   3) return a single `info` string: the chunk texts followed by one
#      "start - TYPE(details) -> end" line per relationship.
# The query calls apoc.text.join, so the APOC procedures must be available on the server.
from neo4j_graphrag.retrievers import VectorCypherRetriever

vc_retriever = VectorCypherRetriever(
   neo4j_driver,
   index_name="text_embeddings",
   embedder=embeddings,
   retrieval_query="""
//1) Go out 2-3 hops in the entity graph and get relationships
WITH node AS chunk
MATCH (chunk)<-[:FROM_CHUNK]-()-[relList:!FROM_CHUNK]-{1,2}()
UNWIND relList AS rel

//2) collect relationships and text chunks
WITH collect(DISTINCT chunk) AS chunks,
 collect(DISTINCT rel) AS rels

//3) format and return context
RETURN '=== text ===\n' + apoc.text.join([c in chunks | c.text], '\n---\n') + '\n\n=== kg_rels ===\n' +
 apoc.text.join([r in rels | startNode(r).name + ' - ' + type(r) + '(' + coalesce(r.details, '') + ')' +  ' -> ' + endNode(r).name ], '\n---\n') AS info
"""
)

In [None]:

from neo4j_graphrag.generation import RagTemplate

# Prompt that restricts the model to the retrieved context; {query_text} and {context}
# are the two declared inputs.
_RAG_TEMPLATE_TEXT = '''Answer the Question using the following Context. Only respond with information mentioned in the Context. Do not inject any speculative information not mentioned.

# Question:
{query_text}

# Context:
{context}

# Answer:
'''

rag_template = RagTemplate(
    template=_RAG_TEMPLATE_TEXT,
    expected_inputs=['query_text', 'context'],
)

# Two pipelines sharing the LLM and prompt, differing only in the retriever:
# plain vector search versus vector search followed by the Cypher traversal.
v_rag = GraphRAG(retriever=vector_retriever, llm=llm, prompt_template=rag_template)
vc_rag = GraphRAG(retriever=vc_retriever, llm=llm, prompt_template=rag_template)

In [None]:
# Query the vector-only pipeline, retrieving the 10 closest chunks as context.
q = 'who are some people elon musk is related to?'
v_answer = v_rag.search(q, retriever_config={'top_k': 10}).answer
print(f"Vector response: {v_answer}")

Vector response: Elon Musk is related to Kimbal Musk, his brother, and Tosca Musk, his sister.


In [None]:
# Query the vector + Cypher pipeline (vc_rag), retrieving the 5 closest chunks as seeds.
# The output label now names the pipeline that produced the answer (it previously said
# "Vector response", copied from the vector-only cell), and "Ukraine" is spelled correctly.
q = 'how is Ukraine and spaceX related?'
print(f"Vector + Cypher response: {vc_rag.search(q,retriever_config={'top_k':5}).answer}")

Vector response: The provided context does not contain any information regarding the relationship or connections between Ukraine and SpaceX.
