In [1]:
import os
from dotenv import load_dotenv
from pprint import pprint

from src.config import Source, ChunkerConf, LLMConf, EmbedderConf, KnowledgeGraphConfig

from src.graph.knowledge_graph import KnowledgeGraph

from src.ingestion.local_ingestor import LocalIngestor
from src.ingestion.chunker import Chunker
from src.ingestion.cleaner import Cleaner
from src.ingestion.embedder import ChunkEmbedder
from src.ingestion.graph_miner import GraphMiner

# Load environment variables (e.g. the NEO4J_* settings read in the
# configuration cell) from the local `config.env` file.
# NOTE(review): `env` holds load_dotenv's return value, not the variables
# themselves — presumably a success flag; confirm against python-dotenv docs.
env = load_dotenv('config.env')

### Configuration


In [2]:
# Folder containing the press-release PDFs to ingest.
source = Source(folder='source_folder')

# Neo4j connection settings are read from the environment (see config.env);
# a variable that is not set resolves to None.
kg_config = KnowledgeGraphConfig(
    uri=os.environ.get("NEO4J_URI"),
    user=os.environ.get("NEO4J_USERNAME"),
    password=os.environ.get("NEO4J_PASSWORD"),
    index_name="vectors",
)

# Chunking strategy and its size/overlap parameters.
chunker_conf = ChunkerConf(type="recursive", chunk_size=1000, chunk_overlap=100)

# LLM used for graph extraction; temperature 0.0.
llm_conf = LLMConf(type="ollama", model="llama3.2:latest", temperature=0.0)

# Embedding model used for the chunks.
embedder_conf = EmbedderConf(type="ollama", model="mxbai-embed-large")

### Press Releases
For demonstration purposes, we ingest a couple of EU Commission press releases in `.pdf` format into the knowledge graph.

### Load Documents from local folder

In [3]:
# Build an ingestor over the configured source folder.
ingestor = LocalIngestor(source=source)

# Run the batch ingestion; the recorded output shows a list of
# ProcessedDocument objects carrying a `filename` and the extracted text
# in `source`.
docs = ingestor.batch_ingest()

# Display the result.
# NOTE(review): this echoes the full text of every document; a short summary
# (e.g. filenames only) would keep the cell output readable.
docs

[ProcessedDocument(filename='source_folder/NextGenerationEU_reaches__300_billion_in_RRF_payments_with_new_disbursements_to_Czechia__Germany__Italy__Portugal__and_Romania_.pdf', source="European Commission - Press release\nNextGenerationEU reaches €300 billion in RRF payments with new\ndisbursements to Czechia, Germany, Italy, Portugal, and Romania\nBrussels, 23 December 2024\nToday, the Commission disbursed payments under the Recovery and Resilience Facility (RRF) to\nCzechia, Germany, Italy, Portugal and Romania, totalling €26.8 billion in loans and grants. The total\ndisbursed under the RRF has now reached over €300 billion. This important milestone reflect the scale\nof the transformative reforms and investments being carried out across EU Member States,\naccelerating the green and digital transitions while strengthening the Union's overall resilience.\nCzechia receives its third payment for €1.7 billion\nThe Commission disbursed €1.5 billion in grants and €200 million in loans, net

In [4]:
# Print the extracted text of the first document; print() renders the
# embedded newlines instead of showing them as "\n" escapes.
print(docs[0].source)

European Commission - Press release
NextGenerationEU reaches €300 billion in RRF payments with new
disbursements to Czechia, Germany, Italy, Portugal, and Romania
Brussels, 23 December 2024
Today, the Commission disbursed payments under the Recovery and Resilience Facility (RRF) to
Czechia, Germany, Italy, Portugal and Romania, totalling €26.8 billion in loans and grants. The total
disbursed under the RRF has now reached over €300 billion. This important milestone reflect the scale
of the transformative reforms and investments being carried out across EU Member States,
accelerating the green and digital transitions while strengthening the Union's overall resilience.
Czechia receives its third payment for €1.7 billion
The Commission disbursed €1.5 billion in grants and €200 million in loans, net of pre-financing, to
Czechia.
On 16 September 2024, Czechia submitted its third request for payment, covering 65 milestones and
targets. These include reforms, such as the amendments to the Ener

### Clean 

In [5]:
cleaner = Cleaner()

# Clean the text of every document; `docs` is rebound to the cleaned result.
# Judging by the recorded output, line breaks are replaced by spaces
# (the exact rules live in Cleaner — not visible here).
docs = cleaner.clean_documents(docs)

# pprint wraps the long cleaned string across lines for readability.
pprint(docs[0].source)

('European Commission Press release Next Generation EU reaches €300 billion in '
 'RRF payments with new disbursements to Czechia, Germany, Italy, Portugal, '
 'and Romania Brussels, 23 December 2024 Today, the Commission disbursed '
 'payments under the Recovery and Resilience Facility (RRF) to Czechia, '
 'Germany, Italy, Portugal and Romania, totalling €26.8 billion in loans and '
 'grants. The total disbursed under the RRF has now reached over €300 billion. '
 'This important milestone reflect the scale of the transformative reforms and '
 'investments being carried out across EU Member States, accelerating the '
 "green and digital transitions while strengthening the Union's overall "
 'resilience. Czechia receives its third payment for €1.7 billion The '
 'Commission disbursed €1.5 billion in grants and €200 million in loans, net '
 'of pre-financing, to Czechia. On 16 September 2024, Czechia submitted its '
 'third request for payment, covering 65 milestones and targets. These i

### Chunk 

In [6]:
# Build the chunker from the chunking settings defined in the configuration cell.
chunker = Chunker(conf=chunker_conf)

# Split every document into chunks; `docs` is rebound to the chunked documents.
docs = chunker.chunk_documents(docs)

# Report how many chunks each document was split into.
# enumerate() yields the position and the document together, avoiding the
# manual range(len(...)) indexing.
for i, doc in enumerate(docs):
    print(f"Number of chunks in doc {i}: {len(doc.chunks)}")

Number of chunks in doc 0: 10
Number of chunks in doc 1: 2


### Embed

In [7]:
# Instantiate the embedder from embedder_conf (Ollama, "mxbai-embed-large").
embedder = ChunkEmbedder(conf=embedder_conf)

# Embed the chunks of every document; presumably stores a vector on each
# chunk — confirm in ChunkEmbedder.
docs = embedder.embed_documents_chunks(docs)

### Extract Nodes and Relationships from Chunk
Here, we use the `GraphMiner` class to extract a graph of entities and relationships from each chunk of the press releases. Since we assume no prior knowledge of the domain, we do not provide an ontology (`ontology=None`).

Since we are using a local Llama model, this might take a while.

In [8]:
# Graph extraction is driven by the LLM settings in llm_conf; no ontology is
# supplied, so entity and relationship types are not constrained up front.
graph_miner = GraphMiner(
    conf=llm_conf, 
    ontology=None
)

# Extract nodes and relationships for the chunks of every document.
# NOTE(review): the recorded run logs "Error while mining graph" /
# "Error while extracting graph" for several chunks without raising, so some
# chunks may end up without nodes/relationships — worth checking before
# loading the result into the graph.
docs = graph_miner.mine_graph_from_docs(docs=docs)

Error while mining graph: list index out of range
Error while extracting graph: 1 validation error for _Graph
relationships.1.properties.milestones
  Input should be a valid string [type=string_type, input_value=42, input_type=int]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Error while mining graph: 'NoneType' object has no attribute 'nodes'
Error while extracting graph: 1 validation error for _Graph
relationships.1.properties.milestones
  Input should be a valid string [type=string_type, input_value=39, input_type=int]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Error while mining graph: 'NoneType' object has no attribute 'nodes'
Error while mining graph: list index out of range
Error while mining graph: list index out of range
Error while mining graph: list index out of range
Error while mining graph: list index out of range
Error while mining graph: list index out of range


In [9]:
# Entities (Node objects) extracted from the first chunk of the first document.
docs[0].chunks[0].nodes

[Node(id='Czechia', type='Country', properties={'reforms': 'amendments to the law on e-government', 'name': 'Czechia'}),
 Node(id='Germany', type='Country', properties={'name': 'Germany'}),
 Node(id='Italy', type='Country', properties={'name': 'Italy'}),
 Node(id='Portugal', type='Country', properties={'name': 'Portugal'}),
 Node(id='Romania', type='Country', properties={'name': 'Romania'})]

In [10]:
# Relationships extracted from the same chunk; each one links a source Node
# to a target Node and carries a type plus a properties dict.
docs[0].chunks[0].relationships

[Relationship(source=Node(id='Czechia', type='Country', properties={'reforms': 'amendments to the law on e-government', 'name': 'Czechia'}), target=Node(id='Germany', type='Country', properties={'name': 'Germany'}), type='Payment', properties={'amount': '€1.5 billion'}),
 Relationship(source=Node(id='Czechia', type='Country', properties={'reforms': 'amendments to the law on e-government', 'name': 'Czechia'}), target=Node(id='Italy', type='Country', properties={'name': 'Italy'}), type='Payment', properties={'amount': '€200 million'}),
 Relationship(source=Node(id='Czechia', type='Country', properties={'reforms': 'amendments to the law on e-government', 'name': 'Czechia'}), target=Node(id='Romania', type='Country', properties={'name': 'Romania'}), type='Payment', properties={'amount': '€1.7 billion'}),
 Relationship(source=Node(id='Germany', type='Country', properties={'name': 'Germany'}), target=Node(id='Italy', type='Country', properties={'name': 'Italy'}), type='Payment', properties={

### Knowledge Graph Interaction

In [32]:
# Create the knowledge-graph wrapper from the Neo4j settings in kg_config.
# The embedder's `embeddings` object is passed along, presumably so the graph
# embeds queries with the same model used for the chunks — confirm.
knowledge_graph = KnowledgeGraph(
    conf=kg_config, 
    embeddings_model=embedder.embeddings
)

In [33]:
# Sanity-check the database connection before writing to it.
# NOTE(review): this reaches into the private `_driver` attribute of
# KnowledgeGraph (presumably the underlying Neo4j driver); consider exposing
# a public health-check method instead.
knowledge_graph._driver.verify_connectivity()

# Last expression of the cell, so its return value (True in the recorded run)
# is what gets displayed.
knowledge_graph._driver.verify_authentication()

True

In [13]:
# Load the processed documents (with their chunks and mined entities and
# relationships) into the knowledge graph.
knowledge_graph.add_documents(docs)

In [14]:
# Index name, as set via `index_name` in kg_config.
knowledge_graph.index_name

'vectors'

In [15]:
# Number of labels in the graph (presumably distinct node labels — confirm).
knowledge_graph.number_of_labels

9

In [34]:
# Number of nodes currently in the graph.
knowledge_graph.number_of_nodes

26

In [16]:
# Number of relationships currently in the graph.
knowledge_graph.number_of_relationships

97

### Results in Neo4j Aura

Connecting to Neo4j Aura confirms visually what we already know: a Knowledge Graph made of Documents, Chunks and the entities + relationships mentioned in those Chunks has been created and it's ready to be queried either via **vector search** or via **graph search**, or even employing a **hybrid approach**.

![img](assets/graph_from_press_releases.png)