In [27]:
import os
from dotenv import load_dotenv

from src.agents.graph_qa import GraphAgentResponder
from src.config import LLMConf, EmbedderConf, KnowledgeGraphConfig, ChunkerConf, Source
from src.graph.graph_model import Ontology
from src.graph.knowledge_graph import KnowledgeGraph
from src.ingestion.local_ingestor import LocalIngestor
from src.ingestion.chunker import Chunker
from src.ingestion.cleaner import Cleaner
from src.ingestion.embedder import ChunkEmbedder
from src.ingestion.graph_miner import GraphMiner

# Load settings (Neo4j credentials, LLM endpoints, ...) from config.env into the process
# environment. load_dotenv() reports failure only through its boolean return value, so check
# it: otherwise a missing or empty file silently turns every os.getenv() below into None.
env = load_dotenv('config.env')
if not env:
    # Warn instead of raising: the variables may already be set in the real environment.
    print("Warning: no variables were loaded from 'config.env'; values read with os.getenv() may be None.")

In this notebook, we try to make the relation-extraction LLM stricter, so that it adheres to the `Ontology` we pass to it.

## Configuration

In [28]:
# EU press release ontology.
# Declares the node labels, a natural-language description of each label, and the relation
# types that extraction is supposed to be limited to. The same object is passed to both
# KnowledgeGraphConfig and GraphMiner further down.
ontology = Ontology(
    # Node labels the extractor is allowed to produce.
    allowed_labels=[
        "Policy",
        "Project",
        "Commissioner",
        "Institution",
        "Country",
        "Region",
        "Organization",
        "Event",
        "Legislation",
        "Topic",
        "FundingProgram",
        "Decision",
        "Quote"
    ],
    # One description per allowed label; the keys must match allowed_labels exactly.
    labels_descriptions={
        "Policy": "A specific policy, strategy, or initiative introduced by the European Commission or another EU Institution.",
        "Project": "A specific project or initiative introduced by the European Commission or another EU Institution.",
        "Commissioner": "A member of the European Commission responsible for a particular policy area.",
        "Institution": "An institution of the European Union, such as the European Parliament or the Council of the EU.",
        "Country": "A member state of the European Union or a country referenced in a given Document.",
        "Region": "A specific geographic region mentioned in the Document.",
        "Organization": "An external entity, such as NGOs, companies, or research institutions mentioned in the Document.",
        "Event": "A conference, meeting, or summit relevant to EU affairs.",
        "Legislation": "A regulation, directive, or legal framework issued or proposed by the EU.",
        "Topic": "A general subject area covered by the Document, such as 'climate change' or 'digital economy'.",
        "FundingProgram": "An EU funding initiative or program, such as Horizon Europe or Erasmus+.",
        "Decision": "An official decision made by the European Commission or an EU body.",
        "Quote": "A direct quote from an EU official or other relevant stakeholder."
    },
    # Relation types, annotated with the intended source -> target labels.
    # The arrows are documentation only; nothing in this cell enforces them.
    allowed_relations=[
        "ISSUED_BY",       # NOTE(review): direction undocumented; presumably Decision / Legislation -> Institution, confirm
        "ANNOUNCES",        # NOTE(review): direction undocumented; presumably Institution / Commissioner -> Policy / Project / Decision, confirm
        "QUOTE_FROM",      # Quote -> Commissioner / Organization / Institution
        "PROPOSED_BY",     # Policy / Legislation -> Commissioner / Institution
        "AFFECTS",         # Policy / Decision / Legislation -> Country / Region / Organization
        "FUNDED_BY",       # Policy / Project -> FundingProgram
        "ATTENDED_BY",     # Event -> Commissioner / Organization / Country
        "RELATED_TO"       # Any concept -> Any related concept
    ]
)

In [29]:
# Folder that holds the raw documents to ingest.
source = Source(folder='source_docs')

# Chunking settings applied to every cleaned document.
chunker_conf = ChunkerConf(
    type="recursive",
    chunk_size=1000,
    chunk_overlap=100
)

# Neo4j connection details are read from the environment (populated from config.env).
# The ontology travels with the graph configuration.
kg_config = KnowledgeGraphConfig(
    uri=os.getenv("NEO4J_URI"),
    user=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
    index_name=os.getenv("INDEX_NAME"),
    ontology=ontology
)

# Embeddings are produced by an Ollama-served model.
embedder_conf = EmbedderConf(
    type="ollama",
    model="mxbai-embed-large",
)


def _llm_conf_from_env(prefix: str) -> LLMConf:
    """Build an LLMConf from the environment variables that share ``prefix``.

    Reads ``<prefix>_MODEL_TYPE``, ``<prefix>_MODEL_NAME``, ``<prefix>_MODEL_TEMPERATURE``,
    ``<prefix>_MODEL_DEPLOYMENT``, ``<prefix>_API_KEY``, ``<prefix>_MODEL_ENDPOINT`` and
    ``<prefix>_MODEL_API_VERSION``.

    Args:
        prefix: Variable-name prefix, e.g. ``"RE"`` (relation extraction) or ``"QA"``.

    Returns:
        The populated ``LLMConf``. Unset variables are passed through as ``None``.

    Raises:
        ValueError: If the temperature variable is set to a non-numeric string.
    """
    # os.getenv() always yields a string (or None), so parse the temperature explicitly
    # rather than handing a str to the model configuration. An unset or empty value
    # becomes None, mirroring how api_version treats an empty string.
    raw_temperature = os.getenv(f"{prefix}_MODEL_TEMPERATURE")
    return LLMConf(
        type=os.getenv(f"{prefix}_MODEL_TYPE"),
        model=os.getenv(f"{prefix}_MODEL_NAME"),
        temperature=float(raw_temperature) if raw_temperature else None,
        deployment=os.getenv(f"{prefix}_MODEL_DEPLOYMENT"),
        api_key=os.getenv(f"{prefix}_API_KEY"),  # note: no "_MODEL" infix for the key
        endpoint=os.getenv(f"{prefix}_MODEL_ENDPOINT"),
        api_version=os.getenv(f"{prefix}_MODEL_API_VERSION") or None
    )


# LLM used for relation extraction (consumed by GraphMiner).
re_model_conf = _llm_conf_from_env("RE")

# LLM used by the question-answering agent.
qa_model = _llm_conf_from_env("QA")

In [30]:
# Pipeline stages, created in the order they are applied to the documents below.
ingestor = LocalIngestor(source=source)
cleaner = Cleaner()
chunker = Chunker(conf=chunker_conf)
embedder = ChunkEmbedder(conf=embedder_conf)

# The miner receives both the relation-extraction LLM settings and the ontology.
graph_miner = GraphMiner(ontology=ontology, conf=re_model_conf)

In [31]:
# Open the knowledge graph with the graph configuration and the embedder's embedding model.
knowledge_graph = KnowledgeGraph(embeddings_model=embedder.embeddings, conf=kg_config)

# Check the database connection before ingesting anything.
# NOTE(review): this reaches into the private `_driver` attribute; a public health-check
# method on KnowledgeGraph would be preferable if one exists.
knowledge_graph._driver.verify_connectivity()

# Last expression of the cell: its return value is what the cell displays.
knowledge_graph._driver.verify_authentication()

True

## Ingestion

In [32]:
# Run the local ingestor; every later pipeline stage rebinds this same `docs` name.
docs = ingestor.batch_ingest()

In [33]:
# Pass the ingested documents through the Cleaner.
# `docs` is rebound, so re-running only this cell feeds already-cleaned documents back in.
docs = cleaner.clean_documents(docs)

In [34]:
docs = chunker.chunk_documents(docs)

# Print the chunk count per document as a quick check of the chunker settings.
for doc_index, doc in enumerate(docs):
    print(f"Number of chunks in doc {doc_index}: {len(doc.chunks)}")

Number of chunks in doc 0: 10
Number of chunks in doc 1: 10
Number of chunks in doc 2: 2


In [35]:
# Embed the chunks of each document with the embedder configured above (Ollama, "mxbai-embed-large").
docs = embedder.embed_documents_chunks(docs)

In [36]:
# Run the graph miner, which was built with the relation-extraction LLM settings and the ontology.
# NOTE(review): presumably the slowest cell, with one or more LLM calls per chunk; confirm in GraphMiner.
docs = graph_miner.mine_graph_from_docs(docs=docs)

In [37]:
# Hand the mined documents to the knowledge graph.
# NOTE(review): presumably not idempotent, so re-running may duplicate graph content; confirm in KnowledgeGraph.
knowledge_graph.add_documents(docs)

### Verify Ingestion

In [38]:
# Sanity check: should equal the number of documents that went through the pipeline.
knowledge_graph.number_of_docs

3

In [39]:
# Number of distinct node labels in the graph; compare against the 13 labels the ontology allows.
knowledge_graph.number_of_labels

19

In [40]:
# List the labels to spot any that fall outside the ontology's allowed_labels.
knowledge_graph.labels

['Chunk',
 'Document',
 'Person',
 'Country',
 'Region',
 '__Entity__',
 'Organization',
 'Policy',
 'Event',
 'Legislation',
 'Project',
 'Decision',
 'Institution',
 'Fundingprogram',
 'Topic',
 'Commissioner',
 'Quote',
 'Building',
 'Statement']

In [41]:
# Relationship count reported by the knowledge graph after ingestion.
knowledge_graph.number_of_relationships

762

In [42]:
# List the relationship types to spot any that fall outside the ontology's allowed_relations.
knowledge_graph.relationships

['MENTIONS',
 'PART_OF',
 'NEXT',
 'SUPPORTS',
 'RELATED_TO',
 'FUNDED_BY',
 'AFFECTS',
 'ATTENDED_BY',
 'ISSUED_BY',
 'PROPOSED_BY',
 'ANNOUNCES',
 'DISBURSED_TO',
 'SUBMITTED',
 'ADOPTED',
 'ALLOCATED_TO',
 'DISBURSES_TO',
 'REDUCES_LOAN_TO',
 'QUOTE_FROM',
 'SUBMITTED_TO',
 'ISSUED_TO',
 'PROPOSED_TO']

### Query the KnowledgeGraph

In [43]:
# One LLM configuration currently serves all three roles of the agent
# (question rephrasing, Cypher generation and final answering).
responder = GraphAgentResponder(
    graph=knowledge_graph,
    rephrase_llm_conf=qa_model,
    cypher_llm_conf=qa_model,
    qa_llm_conf=qa_model,  # TODO try different LLMs
)

In [44]:
# Entity-listing question over the ingested graph.
# NOTE(review): the recorded chain trace for this run shows an empty "Generated Cypher" and an
# empty "Full Context", so the answer does not appear to come from a Cypher query result.
# Confirm which retrieval path inside GraphAgentResponder produced it.
query = "What countries are mentioned in the graph?"
responder.answer(query)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


'The countries mentioned are: Czechia, Germany, Italy, Portugal, Romania, Ukraine, and Russia.'

In [45]:
# Entity-to-document lookup: which source document mentions a given country.
query = "What document mentions Ukraine?"
responder.answer(query)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


'The document that mentions Ukraine is the European Commission Press release, specifically "Commission disburses first €3 billion to Ukraine of its part of the G7 loan, to be repaid with proceeds from immobilised Russian assets".'

In [46]:
# Open-ended "why" question: needs the text surrounding two entities, not just their names.
query = "Why are Italy and Portugal mentioned?"
responder.answer(query)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


'Italy and Portugal are mentioned because they have received payments under the Recovery and Resilience Facility (RRF) for their recovery and resilience plans. Italy has received a total of €122.2 billion out of €194.4 billion allocated, while Portugal has received a total of €11.39 billion out of €22.2 billion allocated. These payments are for various reforms and investments, including energy policy, economic governance, digitalization, and social policy.'

In [53]:
# Event question about a region; the place name is deliberately left as typed (lowercase).
query = "What happened in the baltic sea?"
responder.answer(query)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"There was an incident involving damaged undersea cables in the Baltic Sea, which is suspected to be a deliberate attack on critical infrastructure. The European Commission is working with Finnish authorities on the investigation and condemns any destruction of Europe's critical infrastructure. The incident is part of a series of suspected attacks, and the Commission is strengthening efforts to protect undersea cables, including enhanced information exchange, new detection technologies, and international cooperation."

In [48]:
# Date question about the Baltic Sea incident: checks that a specific fact can be retrieved.
query = "When was the incident in the baltic sea?"
responder.answer(query)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


'The incident in the Baltic Sea occurred on December 25, 2024.'

In [49]:
# Asks explicitly for the *name* of the document that mentions an entity.
query = "What is the name of the document mentioning Italy?"
responder.answer(query)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


'The document mentioning Italy is the "European Commission Press release Next Generation EU reaches €300 billion in RRF payments with new disbursements to Czechia, Germany, Italy, Portugal, and Romania Brussels, 23 December 2024".'

In [50]:
# Topic-to-document lookup phrased as "what document".
query = "What document mentions  the Decarbonisation Of Public Transport?"
responder.answer(query)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"The document that mentions the Decarbonisation Of Public Transport is related to Portugal's recovery and resilience plan, which includes investments in the decarbonisation of public transport."

In [51]:
# Same topic lookup, but asking for the document *name*.
# NOTE(review): presumably here to see whether the phrasing changes the answer; confirm intent.
query = "What is the name of the document that mentions the Decarbonisation Of Public Transport?"
responder.answer(query)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


'The document that mentions the Decarbonisation Of Public Transport is the Portuguese recovery and resilience plan.'

In [52]:
# Aggregation question: the answer requires counting chunks rather than retrieving text.
# NOTE(review): the returned count is not cross-checked against the graph; verify with a direct query.
query = "How many chunks mentions Italy?"
responder.answer(query)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


'2 chunks mention Italy.'