In [1]:
import os
from dotenv import load_dotenv
from pprint import pprint

from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain_neo4j.vectorstores.neo4j_vector import Neo4jVector
from langchain_neo4j.chains.graph_qa.cypher import GraphCypherQAChain
from langchain.prompts import PromptTemplate

from src.agents.llm import fetch_llm
from src.config import Source, ChunkerConf, LLMConf, EmbedderConf, KnowledgeGraphConfig
from src.graph.knowledge_graph import KnowledgeGraph
from src.ingestion.embedder import ChunkEmbedder

# Load variables from the local env file; the NEO4J_* values read in the
# configuration cell are presumably defined there.
env = load_dotenv(dotenv_path='config.env')

In [2]:
# Knowledge-graph connection settings. Credentials are read from the
# environment instead of being hardcoded in the notebook.
kg_config = KnowledgeGraphConfig(
    uri=os.environ.get("NEO4J_URI"),
    user=os.environ.get("NEO4J_USERNAME"),
    password=os.environ.get("NEO4J_PASSWORD"),
    index_name='vector',
)

# Chat model settings (Ollama-served model, temperature 0.0).
llm_conf = LLMConf(type="ollama", model="llama3.2:latest", temperature=0.0)

# Embedding model settings (Ollama-served model).
embedder_conf = EmbedderConf(type="ollama", model="mxbai-embed-large")

In [3]:
# The embedder is created first because the knowledge graph is constructed
# with its embeddings model.
embedder = ChunkEmbedder(conf=embedder_conf)
knowledge_graph = KnowledgeGraph(conf=kg_config, embeddings_model=embedder.embeddings)

# Check the database connection and credentials before running any query.
# NOTE(review): this reaches into the private `_driver` attribute.
knowledge_graph._driver.verify_connectivity()
knowledge_graph._driver.verify_authentication()

True

### Basic Similarity Search

In [None]:
# Plain vector similarity search: no LLM involved, the cell just displays the
# documents returned for the question.
query = "Which document mentions €300 billion in RRF payments?"
knowledge_graph.vector_store.similarity_search(query)

### Basic RAG

In [8]:
def get_basic_rag_prompt() -> PromptTemplate:
    """Build the prompt template used by the basic RAG step.

    Returns:
        A `PromptTemplate` whose text contains the `{context}` and
        `{question}` placeholders, with `input_variables` set to
        `['context', 'question']`.
    """
    rag_instructions = """
        You are an assistant that helps to form nice and human understandable answers.
        The information part contains the provided information that you must use to construct an answer.
        The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
        Make the answer sound as a response to the question. Do not mention that you based the result on the given information.

        CONTEXT: {context}
        QUESTION: {question}
    """

    rag_template = PromptTemplate.from_template(rag_instructions)
    # Declare the expected placeholders explicitly on the template.
    rag_template.input_variables = ['context', 'question']
    return rag_template


In [9]:
# Question used by the basic RAG cell that follows.
query = "Which Countries have received RRF payments?"

In [10]:
basic_prompt = get_basic_rag_prompt()

# Retrieve the 5 chunks most similar to the question.
retriever = knowledge_graph.vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)
retrieved_docs = retriever.invoke(query)

# Pass only the chunk text to the prompt. Formatting the raw list of Document
# objects would inject their repr (metadata included) into the CONTEXT section.
basic_context = "".join(f"\n {doc.page_content}" for doc in retrieved_docs)

llm = fetch_llm(conf=llm_conf)

response = llm.invoke(
    input=basic_prompt.format(context=basic_context, question=query),
)

In [11]:
# Display the text of the model's answer.
response.content

'The countries that have received Recovery and Resilience Facility (RRF) payments are Czechia, Germany, Italy, Portugal, and Romania.'

### Cypher Generation

In [12]:
from langchain_neo4j import Neo4jGraph, GraphCypherQAChain

In [13]:
# Question-answering chain over the graph: as the verbose trace shows, it has
# the LLM generate a Cypher query, runs it, and answers from the query result.
# `allow_dangerous_requests=True` opts in to executing LLM-generated Cypher;
# NOTE(review): confirm the database user has suitably restricted permissions.
graph_chain = GraphCypherQAChain.from_llm(
    graph=knowledge_graph,
    llm=llm,
    validate_cypher=True,
    allow_dangerous_requests=True,
    verbose=True,
)

In [14]:
# Simple question: in the recorded run it is answered by a single
# `MATCH (c:Country)` query.
graph_chain.invoke("What countries are mentioned in the graph?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (c:Country) RETURN c.name[0m
Full Context:
[32;1m[1;3m[{'c.name': 'Germany'}, {'c.name': 'Italy'}, {'c.name': 'Portugal'}, {'c.name': 'Romania'}, {'c.name': 'Czechia'}][0m

[1m> Finished chain.[0m


{'query': 'What countries are mentioned in the graph?',
 'result': 'Germany, Italy, Portugal, Romania, Czechia.'}

In [15]:
# Harder question: in the recorded run the chain generated an empty Cypher
# query, got no context, and answered "I don't know the answer."
# NOTE(review): "disboursed" is a misspelling of "disbursed"; the recorded
# output was produced with this exact text.
graph_chain.invoke("What amount of money was disboursed to Romania?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


{'query': 'What amount of money was disboursed to Romania?',
 'result': "I don't know the answer."}

### Rephrase Prompt
Add an agent that rephrases the user's question based on the graph schema.

In [4]:
def get_rephrase_prompt() -> PromptTemplate:
    """Build the prompt that asks the LLM to rephrase a question for the graph.

    The prompt instructs the model to restate the user's question so that it
    matches the graph schema, without adding information.

    Returns:
        A `PromptTemplate` whose text contains the `{schema}` and
        `{question}` placeholders, with `input_variables` set to
        `['schema', 'question']`.
    """
    prompt = """
        Your task is to rephrase a user's question based on the schema of a graph database that will be given to you. 
        
        Do not mention anything else, just rephrase the question from the user to be as coherent as possible with the schema of the graph.
        Do not make things up or add any information on your own. 

        SCHEMA: {schema}
        QUESTION: {question}

        REPHRASED_QUESTION: 
    """

    template = PromptTemplate.from_template(prompt)

    # Declare the expected placeholders explicitly on the template.
    template.input_variables = ['schema', 'question']

    return template

In [45]:
query = "When was the request for payment from Germany submitted?"

# Bind the graph schema through `partial` instead of overwriting
# `partial_variables` on the template: `partial` returns a new template whose
# `input_variables` no longer lists `schema`, so the template stays consistent.
rephrase_prompt = get_rephrase_prompt().partial(
    schema=knowledge_graph.get_structured_schema
)

rephrased_question = llm.invoke(input=rephrase_prompt.format(question=query)).content

rephrased_question

'"Which recovery plan allocation in Country with id \'Germany\' has a reform and which organization is associated with that person who made the contact with that country?"'

In [17]:
# Run the graph chain on the rephrased question.
# NOTE(review): the recorded output below echoes a question about Romania,
# not the Germany question rephrased in the previous cell, so it appears to
# come from an earlier run. Re-run to refresh it.
graph_chain.invoke(rephrased_question)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


{'query': '\'What is the amount of money that was disbursed to a country with an id equal to "Romania"\'',
 'result': "I don't know the answer."}

### Combined Approach
We want to see what happens when a third agent is given the task of summarizing the results of both a Cypher query (if it returns anything) and a vector search.

In [38]:
def get_rephrase_prompt() -> PromptTemplate:
    """Build the prompt that asks the LLM to rephrase a question for the graph.

    NOTE(review): a function with this same name is already defined earlier
    in the notebook; this definition shadows it. Consider keeping only one.

    Returns:
        A `PromptTemplate` whose text contains the `{schema}` and
        `{question}` placeholders, with `input_variables` set to
        `['schema', 'question']`.
    """
    prompt = """
        Your task is to rephrase a user's question based on the schema of a Graph Database that will be given to you. 

        Do not mention anything else, just rephrase the question from the user to be as coherent as possible with the schema of the graph.
        Do not make things up or add any information on your own. 

        SCHEMA: {schema}
        QUESTION: {question}

        REPHRASED_QUESTION: 
    """

    template = PromptTemplate.from_template(prompt)

    # Declare the expected placeholders explicitly on the template.
    template.input_variables = ['schema', 'question']

    return template

In [34]:
def get_summarization_prompt() -> PromptTemplate:
    """Build the prompt that merges vector-search and graph-query evidence.

    The prompt asks the model to answer a question using both the context
    retrieved from the vector store and the result of the Cypher query.

    Returns:
        A `PromptTemplate` whose text contains the `{question}`,
        `{retrieved_context}` and `{query_result}` placeholders, with
        `input_variables` set to
        `['question', 'retrieved_context', 'query_result']`.
    """
    prompt = """
        Your task is to synthesize a clear and helpful answer to a question.

        The information to use for the task comes from a Vector Database and from a Graph Database.
        
        In your task, you MUST use both the context obtained from a vector search on the Vector Database
        and the query results obtained by running a Cypher Query on the Graph Database.

        Do not mention anything else, just summarize a precise, clear and helpful answer.
        Do not make things up or add any information on your own. 

        QUESTION: {question}

        RETRIEVED CONTEXT: {retrieved_context}

        QUERY RESULT ON GRAPH: {query_result}

        ANSWER: 
    """

    template = PromptTemplate.from_template(prompt)

    # Declare the expected placeholders explicitly on the template.
    template.input_variables = ['question', 'retrieved_context', 'query_result']

    return template
    

In [39]:
# Prompt templates for the combined pipeline.
rephrase_prompt = get_rephrase_prompt()
summarization_prompt = get_summarization_prompt()

# Rebuild the graph chain so that it also returns its intermediate steps,
# which the combined cell passes to the summarization prompt.
graph_chain = GraphCypherQAChain.from_llm(
    graph=knowledge_graph,
    llm=llm,
    return_intermediate_steps=True,
    validate_cypher=True,
    allow_dangerous_requests=True,
    verbose=True,
)

In [49]:
query = "How much money was received by Portugal and why?"

# Vector side: concatenate the text of the retrieved chunks.
context_docs = knowledge_graph.vector_store.similarity_search(query=query)
context = "".join(f"\n {doc.page_content}" for doc in context_docs)

# The rephrasing step is not applied in this cell: the original question goes
# to the graph chain unchanged.

knowledge_graph._driver.verify_connectivity()

# Graph side: best effort. A failure here must not abort the cell, but it is
# reported so an empty graph result is not mistaken for "no data".
try:
    graph_qa_output = graph_chain.invoke(query)
except Exception as graph_error:
    print(f"Graph QA step failed, continuing with vector context only: {graph_error!r}")
    graph_qa_output = None

# Fall back to an empty result when the graph chain produced nothing.
graph_steps = graph_qa_output['intermediate_steps'] if graph_qa_output else {}

final_answer = llm.invoke(
    input=summarization_prompt.format(
        question=query,
        retrieved_context=context,
        query_result=graph_steps,
    )
)

final_answer.content



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Country {name: "Portugal"})
OPTIONAL MATCH (p)-[:RECEIVES_FUNDING_FROM]->(c:Country)
OPTIONAL MATCH (c)-[:PAYMENT]->()
RETURN c.amount, p.name[0m
Full Context:
[32;1m[1;3m[{'c.amount': None, 'p.name': 'Portugal'}][0m

[1m> Finished chain.[0m


"Portugal received €1.65 billion in grants and €1.25 billion in loans from the European Commission as part of its fifth payment request, covering 42 milestones and targets. This amount is part of the €22.2 billion allocated to Portugal's recovery and resilience plan."