In [1]:
from config import settings
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI

In [24]:
# Chat model shared by the rest of the notebook: it backs the graph transformer
# below as well as the Cypher chain, the retrieval QA chain and the agent.
llm = ChatOpenAI(api_key=settings.OPENAI_API_KEY, model="gpt-4o")

# Wraps the chat model so that documents can later be converted into graph documents.
llm_transformer = LLMGraphTransformer(llm=llm)

In [4]:
# LangChain handle on the Neo4j database; all connection details come from `settings`.
graph = Neo4jGraph(
    url=settings.NEO4J_URI,
    username=settings.NEO4J_USERNAME,
    password=settings.NEO4J_PASSWORD,
)

In [5]:
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase

# Default visualisation query: up to 100 (source)-[relationship]->(target) triples.
default_cypher = "MATCH (s)-[r]->(t) RETURN s, r, t LIMIT 100"

def showGraph(cypher: str = default_cypher):
    """Run ``cypher`` against Neo4j and return the result as a yFiles graph widget.

    Args:
        cypher: Cypher query whose result graph (nodes and relationships) is
            visualised. Defaults to ``default_cypher``.

    Returns:
        A ``GraphWidget`` whose node labels are mapped to the ``id`` property.
        Make the call the last expression of a cell (or pass the widget to
        ``display``) to render it.
    """
    # Context managers close the session and the driver even if the query
    # fails; previously every call left both open.
    with GraphDatabase.driver(
        uri=settings.NEO4J_URI,
        auth=(settings.NEO4J_USERNAME, settings.NEO4J_PASSWORD),
    ) as driver, driver.session() as session:
        # Build the widget while the session is still open so the result graph
        # is fully consumed before the connection is released.
        widget = GraphWidget(graph=session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    return widget

In [6]:
# Load data into the graph database using the dataloader
from data_loader import load_data_into_database

# Populate Neo4j through the project's loader (what it loads is defined in data_loader).
load_data_into_database()

# Last expression of the cell, so the returned widget is rendered as the cell output.
showGraph(cypher=default_cypher)

GraphWidget(layout=Layout(height='770px', width='100%'))

In [7]:
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings

# Embedding model for the Document vector index; reused later when the scraped
# graph documents are added to the database.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=settings.OPENAI_API_KEY,
)

# Hybrid (vector + keyword) index over Document.text, with embeddings stored on
# Document.embedding. Connection details are passed explicitly from `settings`,
# like every other Neo4j connection in this notebook, instead of relying on
# NEO4J_* environment variables being set in the kernel's environment.
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    url=settings.NEO4J_URI,
    username=settings.NEO4J_USERNAME,
    password=settings.NEO4J_PASSWORD,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    index_name="document_embedding",
)

In [8]:
# List the vector indexes known to the database; the query result is the cell output.
show_vector_indexes = """
  SHOW VECTOR INDEXES
  """
graph.query(show_vector_indexes)

[{'id': 3,
  'name': 'document_embedding',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Document'],
  'properties': ['embedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0}]

##### Using GraphCypherQAChain


In [9]:
# Re-read the schema from the database, then show its structured form as the cell output.
graph.refresh_schema()
structured_schema = graph.get_structured_schema
structured_schema

{'node_props': {'Speaker': [{'property': 'id', 'type': 'STRING'},
   {'property': 'biography', 'type': 'STRING'},
   {'property': 'name', 'type': 'STRING'}],
  'Submission': [{'property': 'state', 'type': 'STRING'},
   {'property': 'title', 'type': 'STRING'},
   {'property': 'location', 'type': 'STRING'},
   {'property': 'start_time', 'type': 'STRING'},
   {'property': 'duration', 'type': 'INTEGER'},
   {'property': 'end_time', 'type': 'STRING'},
   {'property': 'submission_type', 'type': 'STRING'},
   {'property': 'abstract', 'type': 'STRING'},
   {'property': 'date', 'type': 'STRING'},
   {'property': 'description', 'type': 'STRING'},
   {'property': 'id', 'type': 'STRING'}],
  'Document': [{'property': 'id', 'type': 'STRING'},
   {'property': 'text', 'type': 'STRING'},
   {'property': 'embedding', 'type': 'LIST'}]},
 'rel_props': {},
 'relationships': [{'start': 'Speaker',
   'type': 'PRESENTED',
   'end': 'Submission'},
  {'start': 'Submission', 'type': 'ON_DATE', 'end': 'Submissio

In [10]:
from langchain_community.chains.graph_qa.cypher import GraphCypherQAChain

# Chain that has the LLM write a Cypher query for the question, runs it against
# `graph`, and phrases the answer; verbose=True prints the generated Cypher and
# the rows it returned.
cypher_chain = GraphCypherQAChain.from_llm(llm=llm, graph=graph, verbose=True)

submission_count_question = "How many submissions are there in total? Please group this submission into distinct talks and tutorials?"
response = cypher_chain.invoke({"query": submission_count_question})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (s:Submission)
RETURN s.submission_type AS type, COUNT(s) AS count
[0m
Full Context:
[32;1m[1;3m[{'type': 'Talk', 'count': 49}, {'type': 'Tutorial', 'count': 15}][0m

[1m> Finished chain.[0m


{'query': 'How many submissions are there in total? Please group this submission into distinct talks and tutorials?',
 'result': 'There are a total of 64 submissions, with 49 being talks and 15 being tutorials.'}

In [11]:
from langchain.chains.retrieval_qa.base import RetrievalQA

# Retrieval QA over the Document vector index: retrieved documents are handed
# to the LLM as context for the question.
document_retriever = vector_index.as_retriever()
qa_graph_chain = RetrievalQA.from_chain_type(llm, retriever=document_retriever, verbose=True)

result = qa_graph_chain.invoke({"query": "Tell me more about Richard Ogunyale"})
result["result"]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'Richard Ogunyale is an experienced Senior Software Engineer who has extensive experience in developing and deploying AI applications. He is the speaker for the tutorial titled "Graph databases and Retrieval Augmented Generation" at the 2024 PyData Conference. The tutorial focuses on enhancing AI capabilities in text generation and comprehension by integrating graph databases with Retrieval-Augmented Generation (RAG) technology. Richard\'s expertise in AI applications and his background in software engineering make him well-suited to guide participants through the process of building AI systems that can leverage interconnected information for more accurate and context-rich responses.'

In [12]:
from langchain.agents import AgentExecutor, create_react_agent
from langchain.tools import Tool
from langchain import hub
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import Neo4jChatMessageHistory

# Tool backed by the Cypher-generating chain (structured questions).
cypher_tool = Tool.from_function(
    func=cypher_chain.invoke,
    name="Cypher Chain",
    description="For when you need to answer questions that can be answered by directly querying the database without having to use knowledge of vector embeddings. The question will be a string. Return a string.",
)

# Tool backed by the vector-retrieval QA chain (free-text questions).
qa_tool = Tool.from_function(
    func=qa_graph_chain.invoke,
    name="Natural QA Chain",
    description="For answering text based questions that are not easy or straight-forward to answer using Cypher queries but can be answered using vector embeddings. The question will be a String. Return a String",
)

tools = [cypher_tool, qa_tool]


def get_memory(session_id):
    """Return the Neo4j-backed chat history for ``session_id``.

    The history is stored through whatever the global ``graph`` refers to at
    call time.
    """
    history = Neo4jChatMessageHistory(graph=graph, session_id=session_id)
    return history


# ReAct chat prompt pulled from the LangChain hub; the agent chooses between the tools above.
agent_prompt = hub.pull("hwchase17/react-chat")
agent = create_react_agent(llm, tools, agent_prompt)
agent_executor = AgentExecutor(tools=tools, agent=agent, handle_parsing_errors=True)

# Wrap the executor so each invocation reads and writes the history of its session.
chat_agent = RunnableWithMessageHistory(
    agent_executor,
    get_memory,
    history_messages_key="chat_history",
    input_messages_key="input",
)

#### Load more data

In [13]:
from langchain_core.documents import Document
from data_loader import scrape_website, ScrapedWebsite, generate_md5_hash
from langchain.text_splitter import TokenTextSplitter
from tqdm import tqdm

# Conference pages to scrape and add to the knowledge graph.
list_of_websites = [
    "https://pydata.org/london2024",
    "https://pydata.org/london2024/about",
    "https://pydata.org/london2024/job-board",
    "https://pydata.org/london2024/humble",
    "https://pydata.org/london2024/sponsor",
]

scraped_data: list[ScrapedWebsite] = [scrape_website(url) for url in tqdm(list_of_websites, desc="Scraping website")]

# One Document per scraped page; at this stage "id" is the hash of the full page content.
list_of_documents: list[Document] = [
    Document(
        page_content=data.content,
        metadata={"title": data.title,
                  "url": data.url,
                  "id": generate_md5_hash(data.content)
                  })
    for data in tqdm(scraped_data, desc="Processing documents")]

# Split pages into chunks of at most 1024 tokens with a 24-token overlap.
text_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=24)
documents = text_splitter.split_documents(list_of_documents)

# Every chunk inherits its page's metadata, so all chunks of one page would
# share the page-level "id". Recompute it from the chunk's own text so ids are
# unique per chunk and still describe the content they are attached to.
# NOTE(review): presumably the graph import keys Document nodes by this id,
# in which case shared ids would merge chunks - confirm in ModifiedNeo4JGraph.
for chunk in documents:
    chunk.metadata["id"] = generate_md5_hash(chunk.page_content)

print(documents)

Scraping website: 100%|██████████| 5/5 [00:09<00:00,  1.91s/it]
Processing documents: 100%|██████████| 5/5 [00:00<00:00, 4149.49it/s]


[Document(page_content='![Image 1](https://images.squarespace-cdn.com/content/v1/655270d4c839892af01aaf02/6a9c721d-5397-4349-a711-befa4be34f62/AdobeStock_132796360-dark.jpg)\n\nWHAT TO EXPECT\n--------------\n\nPyData London 2024 is a 3-day in-person event for the international community of data scientists, data engineers, and developers of data analysis tools to share ideas and learn from each other.\n\nDuring the conference, attendees will have the opportunity to attend live keynote sessions and talks, lightning talks, and get to know fellow members of the **Py****Data** Community.\n\n![Image 2](https://images.squarespace-cdn.com/content/v1/655270d4c839892af01aaf02/1699901673377-62BKP9F0DZ91L3D8PA26/promo+photo+with+overlay+1+-+small.png)\n\n![Image 3](https://images.squarespace-cdn.com/content/v1/655270d4c839892af01aaf02/d033c077-464f-4609-a990-4d841fe3b98f/magicpattern-mesh-gradient-1705596178730.png)\n\n**The event will be in-person at the** [**Leonardo Royal Hotel London Tower Br

In [14]:
# Have the LLM-backed transformer turn each text chunk into a graph document
# (nodes and relationships).
graph_documents = llm_transformer.convert_to_graph_documents(documents)

print("Graph Documents:", graph_documents)

Graph Documents: [GraphDocument(nodes=[Node(id='Pydata London 2024', type='Event'), Node(id='Data Scientists', type='Person'), Node(id='Data Engineers', type='Person'), Node(id='Developers Of Data Analysis Tools', type='Person'), Node(id='Leonardo Royal Hotel London Tower Bridge', type='Location'), Node(id='Dr. Rebecca Bilbro', type='Person'), Node(id='Yellowbrick', type='Software'), Node(id='Rotational Labs', type='Organization'), Node(id='University Of Illinois, Urbana-Champaign', type='Organization')], relationships=[Relationship(source=Node(id='Pydata London 2024', type='Event'), target=Node(id='Data Scientists', type='Person'), type='INCLUDES'), Relationship(source=Node(id='Pydata London 2024', type='Event'), target=Node(id='Data Engineers', type='Person'), type='INCLUDES'), Relationship(source=Node(id='Pydata London 2024', type='Event'), target=Node(id='Developers Of Data Analysis Tools', type='Person'), type='INCLUDES'), Relationship(source=Node(id='Pydata London 2024', type='Ev

In [15]:
import pickle

# Persist the extracted graph documents to a pickle file in the working directory.
graph_documents_pickle = "graph_documents.pkl"
with open(graph_documents_pickle, mode="wb") as pickle_file:
    pickle.dump(graph_documents, pickle_file)

In [16]:
# Echo the extracted graph documents as the cell output for inspection.
graph_documents

[GraphDocument(nodes=[Node(id='Pydata London 2024', type='Event'), Node(id='Data Scientists', type='Person'), Node(id='Data Engineers', type='Person'), Node(id='Developers Of Data Analysis Tools', type='Person'), Node(id='Leonardo Royal Hotel London Tower Bridge', type='Location'), Node(id='Dr. Rebecca Bilbro', type='Person'), Node(id='Yellowbrick', type='Software'), Node(id='Rotational Labs', type='Organization'), Node(id='University Of Illinois, Urbana-Champaign', type='Organization')], relationships=[Relationship(source=Node(id='Pydata London 2024', type='Event'), target=Node(id='Data Scientists', type='Person'), type='INCLUDES'), Relationship(source=Node(id='Pydata London 2024', type='Event'), target=Node(id='Data Engineers', type='Person'), type='INCLUDES'), Relationship(source=Node(id='Pydata London 2024', type='Event'), target=Node(id='Developers Of Data Analysis Tools', type='Person'), type='INCLUDES'), Relationship(source=Node(id='Pydata London 2024', type='Event'), target=Nod

In [17]:
from modified_neo4j_graph import ModifiedNeo4JGraph
# NOTE: this rebinds `graph`. Objects built earlier from the previous instance
# (e.g. `cypher_chain`) keep referring to the old Neo4jGraph object.
graph = ModifiedNeo4JGraph(
    url=settings.NEO4J_URI,
    username=settings.NEO4J_USERNAME,
    password=settings.NEO4J_PASSWORD,
)

# Write the extracted graph documents to Neo4j through the project's helper,
# handing it the embedding model defined earlier.
graph.add_graph_documents_with_embeddings(
    graph_documents=graph_documents,
    embedding=embeddings,
    include_source=True,
    baseEntityLabel=True,
)

In [27]:
# Rebuild the hybrid Document index handle with a custom retrieval query that
# appends the properties of nodes within two hops of each matched Document to
# the returned text.
# Connection details are passed explicitly from `settings`, like every other
# Neo4j connection in this notebook, instead of relying on NEO4J_* environment
# variables being set in the kernel's environment.
# NOTE(review): the MATCH inside CALL is not OPTIONAL, so a Document without
# any neighbour yields no row at all, and each related node yields its own row
# (capped by LIMIT 25) - confirm this is the intended retrieval behaviour.
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    url=settings.NEO4J_URI,
    username=settings.NEO4J_USERNAME,
    password=settings.NEO4J_PASSWORD,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    index_name="document_embedding",
    retrieval_query="""
    WITH node AS doc, score as similarity
    CALL {
        WITH doc
        MATCH (doc)-[*1..2]-(related)
        WITH doc, related, [key in keys(related) WHERE key <> 'embedding'] AS filtered_keys
        RETURN DISTINCT related, [key IN filtered_keys | key + ': ' + toString(related[key])] AS properties_list
    }
    WITH doc, similarity, properties_list
    RETURN coalesce(doc.text, '') + ' ' + coalesce(reduce(s = '', prop IN properties_list | s + ', ' + prop), '') AS text, similarity AS score, {source: doc.text} AS metadata
    LIMIT 25
    """
)

In [41]:
# Recreate the retrieval QA chain so it uses the retriever of the current `vector_index`.
document_retriever = vector_index.as_retriever()
qa_graph_chain = RetrievalQA.from_chain_type(llm, retriever=document_retriever, verbose=True)

result = qa_graph_chain.invoke({"query": "How many conferences are holding in the Minories?"})
result["result"]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'The provided context does not specify the number of conferences being held in the Minories. If you need that information, you may need to contact the event organizers directly at admin@pydata.org.'

In [33]:
# Rebuild the tools and the agent: the tool callables are bound to specific
# chain objects, so they must be recreated to pick up the current `qa_graph_chain`.
cypher_tool = Tool.from_function(
    func=cypher_chain.invoke,
    name="Cypher Chain",
    description="For when you need to answer questions that can be answered by directly querying the database without having to use knowledge of vector embeddings. The question will be a string. Return a string.",
)

qa_tool = Tool.from_function(
    func=qa_graph_chain.invoke,
    name="Natural QA Chain",
    description="For answering text based questions that are not easy or straight-forward to answer using Cypher queries but can be answered using vector embeddings. The question will be a String. Return a String",
)

tools = [cypher_tool, qa_tool]


def get_memory(session_id):
    """Return the Neo4j-backed chat history for ``session_id``.

    The history is stored through whatever the global ``graph`` refers to at
    call time.
    """
    history = Neo4jChatMessageHistory(graph=graph, session_id=session_id)
    return history


# ReAct chat prompt pulled from the LangChain hub; the agent chooses between the tools above.
agent_prompt = hub.pull("hwchase17/react-chat")
agent = create_react_agent(llm, tools, agent_prompt)
agent_executor = AgentExecutor(tools=tools, agent=agent, handle_parsing_errors=True)

# Wrap the executor so each invocation reads and writes the history of its session.
chat_agent = RunnableWithMessageHistory(
    agent_executor,
    get_memory,
    history_messages_key="chat_history",
    input_messages_key="input",
)

In [34]:
# Re-read the schema now that more data has been added, then show its
# structured form as the cell output.
graph.refresh_schema()
structured_schema = graph.get_structured_schema
structured_schema

{'node_props': {'Speaker': [{'property': 'id', 'type': 'STRING'},
   {'property': 'biography', 'type': 'STRING'},
   {'property': 'name', 'type': 'STRING'}],
  'Submission': [{'property': 'state', 'type': 'STRING'},
   {'property': 'title', 'type': 'STRING'},
   {'property': 'location', 'type': 'STRING'},
   {'property': 'start_time', 'type': 'STRING'},
   {'property': 'duration', 'type': 'INTEGER'},
   {'property': 'end_time', 'type': 'STRING'},
   {'property': 'submission_type', 'type': 'STRING'},
   {'property': 'abstract', 'type': 'STRING'},
   {'property': 'date', 'type': 'STRING'},
   {'property': 'description', 'type': 'STRING'},
   {'property': 'id', 'type': 'STRING'}],
  'Document': [{'property': 'id', 'type': 'STRING'},
   {'property': 'text', 'type': 'STRING'},
   {'property': 'embedding', 'type': 'LIST'}],
  'Event': [{'property': 'id', 'type': 'STRING'}],
  'Person': [{'property': 'id', 'type': 'STRING'}],
  'Location': [{'property': 'id', 'type': 'STRING'}],
  'Software':

In [42]:
# Fixed session identifier: the chat history is looked up under this id via get_memory.
SESSION_ID = "6bdabff3-2bb2-4f23-a696-e10435dc111de"
print(f"Session ID: {SESSION_ID}")

# One question per cell run, read interactively from the user.
q = input("> ")
print(f"Question: {q}")

# The session id travels in the "configurable" section of the run config.
session_config = {"configurable": {"session_id": SESSION_ID}}
response = chat_agent.invoke({"input": q}, session_config)

print(f"Result: {response['output']}")

Session ID: 6bdabff3-2bb2-4f23-a696-e10435dc111de
Question: How many conferences are holding in the Minories?


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (s:Submission)-[:ON_LOCATION]->(:Submission {location: "Minories"})
RETURN COUNT(s) AS conference_count
[0m
Full Context:
[32;1m[1;3m[{'conference_count': 210}][0m

[1m> Finished chain.[0m
Result: There are 210 conferences being held in the Minories.
