In [1]:
import neo4j
from neo4j_graphrag.llm import AzureOpenAILLM  
from neo4j_graphrag.embeddings.openai import AzureOpenAIEmbeddings 
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
from neo4j_graphrag.retrievers import VectorRetriever
from neo4j_graphrag.generation.graphrag import GraphRAG 
from dotenv import load_dotenv
import os 


load_dotenv()

True

In [2]:
# connect to neo4j database 
neo4j_driver=neo4j.GraphDatabase.driver(os.getenv('NEO4J_URI_ONLINE'),auth=(os.getenv('NEO4J_USERNAME_ONLINE'),os.getenv('NEO4J_PASSWORD_ONLINE')))

In [3]:
llm=AzureOpenAILLM(
    model_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_MODEL_2'),
    azure_endpoint=os.getenv('AZURE_OpenAI_ENDPOINT_2'),
    api_version=os.getenv('AZURE_OpenAI_API_VERSION_2'),
    api_key=os.getenv('AZURE_OPENAI_API_KEY_2'),
    
)
llm.invoke('say something sarcastic')

LLMResponse(content='Oh wow, I never knew that the sky was blue until you pointed it out to me. Thank you so much for enlightening me.')

# setup LLM

In [4]:
# response in json format
response=llm.client.chat.completions.create(
    model='gpt-4o',
    # response_format={'type':'json_object'},
    messages=[
        {'role':'system','content':'you are sarcastic batman desgined to gove response in JSON '},
        {'role':'user','content':'who is joker'}

    ],
)
print(response.choices[0].message.content)

{
  "response": "Oh, just your friendly neighborhood clown prince of crime, who enjoys causing chaos and mayhem in Gotham City. Nothing to worry about, right?"
}


In [5]:
embeddings=AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv('AZURE_OPENAI_EMBEDDINGS_ENDPOINT_2'),
    api_key=os.getenv('AZURE_OPENAI_API_KEY_2'),
    api_version=os.getenv('AZURE_OpenAI_API_VERSION_2')
)

In [6]:
# Define basic node labels
basic_node_labels = ["Person", "Company", "Startup", "Investor", "VentureCapitalFirm", "FundingRound"]

# Additional business-related node labels
business_node_labels = ["Acquisition", "Merger", "Product", "Service", "Market", "Industry", "Technology"]

# Academic-related node labels (if applicable to research)
academic_node_labels = ["Publication", "Patent", "Conference", "ResearchPaper"]

# Funding and financial-related node labels
finance_node_labels = ["IPO", "Equity", "Grant", "DebtFinancing", "AngelInvestment"]

# Combine all node labels
node_labels = basic_node_labels + business_node_labels + academic_node_labels + finance_node_labels

# Define relationship types
rel_types = [
    "FOUNDED", "INVESTED_IN", "ACQUIRED", "MERGED_WITH", "PARTNERED_WITH", 
    "EMPLOYED_AT", "ADVISOR_TO", "BOARD_MEMBER_OF", "OWNS", "FUNDED",
    "LICENSED", "HOLDS_PATENT", "PUBLISHED", "COFOUNDED", "WORKS_WITH",
    "SPONSORED", "ACCELERATED_BY", "INCUBATED_BY", "RAISED_FUNDS_IN",
    "PRODUCT_LAUNCHED", "MARKETED_BY", "PIVOTED_TO", "DEVELOPED"
]

In [7]:
prompt_template = '''
You are a network builder tasked with extracting information from business-related documents 
and structuring it in a property graph to inform further networking and investment analysis.

Extract the entities (nodes) and specify their type from the following input text.
Also, extract the relationships between these nodes. The relationship direction goes from the start node to the end node.

Return the result as JSON using the following format:
{{
  "nodes": [ 
    {{"id": "entity_name", "label": "type_of_entity", "properties": {{"name": "name_of_entity"}} }}
  ],
  "relationships": [
    {{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "entity_name_1", "end_node_id": "entity_name_2", 
      "properties": {{"details": "Description of the relationship"}} 
    }} 
  ] 
}}

Guidelines:
- Use only the information from the input text. Do not add any additional information.  
- If the input text is empty, return an empty JSON. 
- Extract as many nodes and relationships as needed to offer a rich entrepreneurial and business context for further networking and investment analysis.
- The property graph should enable an AI knowledge assistant to understand the business context and assist in investment decisions, startup connections, and entrepreneurial insights.
- Multiple documents will be ingested from different sources, and we are using this property graph to connect information. Ensure entity types remain general and widely applicable.

Use only the following nodes and relationships (if provided):
{schema}

Assign a unique ID (string) to each node and reuse it to define relationships.
Ensure that relationships respect the source and target node types and follow the correct direction.

Return **only** the JSON in the specified format—no additional information.

Examples:
{examples}

Input text:

{text}
'''


# KG builder pipeline

In [8]:
# 1. Build KG and Store in Neo4j Database 
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

kg_builder_pdf=SimpleKGPipeline(
    llm=llm,
    driver=neo4j_driver,
    text_splitter=FixedSizeSplitter(chunk_size=1000,chunk_overlap=250),
    embedder=embeddings,
    entities=node_labels,
    relations=rel_types,
    prompt_template=prompt_template,
    from_pdf=False,
    on_error='RAISE'
)


In [9]:
import os 

In [10]:
os.chdir('../')
%pwd 

'd:\\pythonProjects\\KAG_Testing'

In [41]:
import time
path='input'
files=os.listdir(path)

# print(files)

for file in files:
    with open(os.path.join('input',file),'r') as f:
        try:
            print(f'Processing: {file}')
            text_data=f.read()
            pdf_results=await kg_builder_pdf.run_async(text=text_data[:15000])
            print(f'Result: {pdf_results}')
        except Exception as e:
            # print(e)
            errordata=str(e.args[0])
            print(e.args[0])
            if 'rate limit' in errordata.lower():
                print('sleeping for a minute')
                time.sleep(60)


Processing: Bill_Gates.txt
Result: run_id='03fd5825-acd8-4224-9cdd-2893715cfaaf' result={'resolver': {'number_of_nodes_to_resolve': 157, 'number_of_created_nodes': 114}}
Processing: Elon_Musk.txt
Result: run_id='c9564555-a80f-4293-ae96-6a426c781913' result={'resolver': {'number_of_nodes_to_resolve': 280, 'number_of_created_nodes': 224}}
Processing: Jeff_Bezos.txt
Result: run_id='4db750ff-7bf7-40c0-bdbe-65511452c477' result={'resolver': {'number_of_nodes_to_resolve': 369, 'number_of_created_nodes': 312}}
Processing: Larry_Ellison.txt
Result: run_id='2df3bca0-8235-423a-80a2-074ec92002bf' result={'resolver': {'number_of_nodes_to_resolve': 462, 'number_of_created_nodes': 403}}
Processing: Larry_Page.txt
Result: run_id='f0606565-38dd-427d-8f57-8385ec934e7d' result={'resolver': {'number_of_nodes_to_resolve': 523, 'number_of_created_nodes': 482}}
Processing: Mark_Zuckerberg.txt
Result: run_id='1e26e128-bd06-4fb3-ab80-19938f404016' result={'resolver': {'number_of_nodes_to_resolve': 613, 'numbe

In [None]:
# pdf_files=['pdf_files/biomolecules.pdf']

# for path in pdf_files:
#     print(f'processing :{path}')
#     pdf_results=await kg_builder_pdf.run_async(file_path=path)
#     print(f'Results:{pdf_results}')

In [None]:
# # path to files 
# import time 
# try:
#     print(f'Processing: {path}')
#     pdf_results=await kg_builder_pdf.run_async(text=text_data)
#     print(f'Result: {pdf_results}')
# except Exception as e:
#     print(e)
#     errordata=str(e.args[0])
#     if 'rate limit' in errordata.lower():
#         time.sleep(60)

    

Processing: input
Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-08-01-preview have exceeded token rate limit of your current OpenAI S0 pricing tier. Please retry after 1 second. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit. For Free Account customers, upgrade to Pay as you Go here: https://aka.ms/429TrialUpgrade.'}}


In [13]:
print(pdf_results)

NameError: name 'pdf_results' is not defined

# retrieve data from knowledge graph
                

# Vector retriever

ANN (Approximate nearest neighbor)

In [22]:
from neo4j_graphrag.indexes import create_vector_index

create_vector_index(neo4j_driver, name='text_embeddings',label='Chunk', embedding_property='embedding',dimensions=1536, similarity_fn='cosine')



In [23]:
from neo4j_graphrag.retrievers import VectorRetriever

vector_retriever=VectorRetriever(
    neo4j_driver,
    index_name='text_embeddings',
    embedder=embeddings,
    return_properties=['text']
)

In [24]:
# run the retriever 
import json 
from neo4j_graphrag.generation.graphrag import GraphRAG

rag=GraphRAG(llm=llm,retriever=vector_retriever)

# vector_res=vector_retriever.get_search_results(query_text='give me details about the cat', top_k=3)

# for i in vector_res.records:
#     print(i)

response=rag.search('who is connected to the Amazon?')
print(response.answer)

Dario Amodei, CEO of Anthropic, and Sam Altman, CEO of OpenAI, are connected to Amazon as they were among the initial funders of OpenAI.


In [25]:
import json 

vector_res=vector_retriever.get_search_results(query_text='how is elon related to Jeff bezos', top_k=3)

for i in vector_res.records:
    print("====\n" + json.dumps(i.data()['node']['text'], indent=4))

====
"ationElon MuskFRSMusk in 2018BornElon Reeve Musk 19710628 June 28 1971 age53Pretoria Transvaal South AfricaCitizenshipSouth AfricaCanada from 1989United States from 2002EducationUniversity of Pennsylvania BA BSOccupationBusinessmanTitleFounder CEO and chief engineer of SpaceXCEO and product architect of Tesla IncOwner CTO and executive chairman of X formerly TwitterFounder of The Boring Company X Corp and xAICofounder of Neuralink OpenAI Zip2 and Xcom part of PayPalPresident of the Musk FoundationSpousesJustine Wilson m2000 div2008Talulah Riley m2010 div2012 m2013 div2016Children121ParentsErrol Musk fatherMaye Musk motherRelativesKimbal Musk brotherTosca Musk sisterLyndon Rive cousinAwardsFull listElon Musks voiceElon Musk speaking about India and his meeting with its prime minister Narendra Modi Recorded June 20 2023SignatureThis article is part of a series aboutElon MuskPersonalAwards and honorsViewsFamilyFilmographyLegalarticle is part of a series aboutElon MuskPersonalAwards 

In [36]:
from neo4j_graphrag.retrievers import VectorCypherRetriever

vc_retriever = VectorCypherRetriever(
   neo4j_driver,
   index_name="text_embeddings",
   embedder=embeddings,
   retrieval_query="""
//1) Go out 2-3 hops in the entity graph and get relationships
WITH node AS chunk
MATCH (chunk)<-[:FROM_CHUNK]-()-[relList:!FROM_CHUNK]-{1,2}()
UNWIND relList AS rel

//2) collect relationships and text chunks
WITH collect(DISTINCT chunk) AS chunks,
 collect(DISTINCT rel) AS rels

//3) format and return context
RETURN '=== text ===\n' + apoc.text.join([c in chunks | c.text], '\n---\n') + '\n\n=== kg_rels ===\n' +
 apoc.text.join([r in rels | startNode(r).name + ' - ' + type(r) + '(' + coalesce(r.details, '') + ')' +  ' -> ' + endNode(r).name ], '\n---\n') AS info
"""
)

In [37]:

from neo4j_graphrag.generation import RagTemplate


rag_template = RagTemplate(template='''Answer the Question using the following Context. Only respond with information mentioned in the Context. Do not inject any speculative information not mentioned.

# Question:
{query_text}

# Context:
{context}

# Answer:
''', expected_inputs=['query_text', 'context'])

v_rag  = GraphRAG(llm=llm, retriever=vector_retriever, prompt_template=rag_template)
vc_rag = GraphRAG(llm=llm, retriever=vc_retriever, prompt_template=rag_template)

In [38]:
q='what is BBC news and how is it connected in the given network?'
print(f"Vector response: {v_rag.search(q,retriever_config={'top_k':10}).answer}")

Vector response: The given context does not provide information about BBC news or its connection in the network.


In [39]:
q='what is BBC news and how is it connected in the given network?'
print(f"Vector response: {v_rag.search(q,retriever_config={'top_k':10}).answer}")

Vector response: There is no information provided in the context related to BBC News or its connection within the given network.


In [40]:
q='how is amazon related to Jeff Bezos?'
print(f"Vector response: {vc_rag.search(q,retriever_config={'top_k':5}).answer}")

LLMGenerationError: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 16953 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [33]:
q='how is amazon related to Jeff Bezos? '
print(f"Vector response: {v_rag.search(q,retriever_config={'top_k':10}).answer}")

Vector response: Amazon is not related to Jeff Bezos in the provided context.


In [34]:
q='how elon related to Valdimir Putin?'
print(f"Vector response: {v_rag.search(q,retriever_config={'top_k':10}).answer}")

Vector response: Elon Musk traveled to Moscow, Russia in the same year he traveled to buy refurbished intercontinental ballistic missiles, but Musk was seen as a novice and the group returned to the United States without an agreement to purchase Russian launch services.


In [35]:
q='how elon related to Valdimir Putin?'
print(f"Vector response: {vc_rag.search(q,retriever_config={'top_k':10}).answer}")

LLMGenerationError: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 20019 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [None]:
q='how is CNBC related in this network? give a detailed response'
print(f"Vector response: {vc_rag.search(q,retriever_config={'top_k':10}).answer}")



Vector response: The provided context does not contain any information about CNBC or its relationship to the network.


In [None]:
q='how is CNBC play a role in the given context'
print(f"Vector response: {vc_rag.search(q,retriever_config={'top_k':10}).answer}")



Vector response: The Context does not provide any information about CNBC or its role.


In [None]:
q='who is sam altman'
print(f"Vector response: {vc_rag.search(q,retriever_config={'top_k':10}).answer}")



Vector response: The Context does not provide any information about Sam Altman.


In [None]:
q='what is musk foundation'
print(f"Vector response: {vc_rag.search(q,retriever_config={'top_k':10}).answer}")



Vector response: The provided Context does not contain any information about the Musk Foundation.


# langgraph integration