In [None]:
import os
from dotenv import load_dotenv

from src.agents.graph_qa import GraphAgentResponder
from src.config import LLMConf, EmbedderConf, KnowledgeGraphConfig, ChunkerConf, Source
from src.graph.graph_model import Ontology
from src.graph.knowledge_graph import KnowledgeGraph
from src.ingestion.local_ingestor import LocalIngestor
from src.ingestion.chunker import Chunker
from src.ingestion.cleaner import Cleaner
from src.ingestion.embedder import ChunkEmbedder
from src.ingestion.graph_miner import GraphMiner

# Load Neo4j / LLM settings from `config.env` into the process environment.
# `load_dotenv` returns a bool, not the variables; it is falsy when the file is
# missing or defines nothing. Every `os.getenv(...)` below would then silently
# yield None, so surface that here instead of failing later with an obscure error.
env = load_dotenv('config.env')
if not env:
    import warnings

    warnings.warn(
        "No variables were loaded from 'config.env' (file missing or empty); "
        "the NEO4J_*, RE_* and QA_* settings must already be set in the environment.",
        stacklevel=2,
    )

In this notebook, we try to make the Relationship Extractor LLM stricter, so that it adheres to the `Ontology` we pass to it.

## Configuration

In [None]:
# EU press release ontology.
# Keys are the node labels allowed in the graph; values are their descriptions.
LABEL_DESCRIPTIONS = {
    "Policy": "A specific policy, strategy, or initiative introduced by the European Commission or another EU Institution.",
    "Project": "A specific project or initiative introduced by the European Commission or another EU Institution.",
    "Commissioner": "A member of the European Commission responsible for a particular policy area.",
    "Institution": "An institution of the European Union, such as the European Parliament or the Council of the EU.",
    "Country": "A member state of the European Union or a country referenced in a given Document.",
    "Region": "A specific geographic region mentioned in the Document.",
    "Organization": "An external entity, such as NGOs, companies, or research institutions mentioned in the Document.",
    "Event": "A conference, meeting, or summit relevant to EU affairs.",
    "Legislation": "A regulation, directive, or legal framework issued or proposed by the EU.",
    "Topic": "A general subject area covered by the Document, such as 'climate change' or 'digital economy'.",
    "FundingProgram": "An EU funding initiative or program, such as Horizon Europe or Erasmus+.",
    "Decision": "An official decision made by the European Commission or an EU body.",
    "Quote": "A direct quote from an EU official or other relevant stakeholder.",
}

# Allowed relationship types. Trailing comments sketch the intended
# source -> target labels, using the label names defined above.
ALLOWED_RELATIONS = [
    "ISSUED_BY",
    "ANNOUNCES",
    "QUOTE_FROM",   # Quote -> Commissioner / Organization / Institution
    "PROPOSED_BY",  # Policy / Legislation -> Commissioner / Institution
    "AFFECTS",      # Policy / Decision / Legislation -> Country / Region / Organization
    "FUNDED_BY",    # Policy / Project -> FundingProgram
    "ATTENDED_BY",  # Event -> Commissioner / Organization / Country
    "RELATED_TO",   # Any concept -> Any related concept
]

ontology = Ontology(
    # The dict keys are exactly the allowed labels, in the same order.
    allowed_labels=list(LABEL_DESCRIPTIONS),
    labels_descriptions=LABEL_DESCRIPTIONS,
    allowed_relations=ALLOWED_RELATIONS,
)

In [None]:
# Documents are read from this local folder.
source = Source(folder='source_docs')

# Chunking parameters handed to the Chunker.
chunker_conf = ChunkerConf(type="recursive", chunk_size=1000, chunk_overlap=100)

# Neo4j connection settings come from the environment (see `config.env`);
# any variable that is unset is passed through as None.
kg_config = KnowledgeGraphConfig(
    uri=os.environ.get("NEO4J_URI"),
    user=os.environ.get("NEO4J_USERNAME"),
    password=os.environ.get("NEO4J_PASSWORD"),
    index_name=os.environ.get("INDEX_NAME"),
    ontology=ontology,
)

# Embedding model settings handed to the ChunkEmbedder.
embedder_conf = EmbedderConf(type="ollama", model="mxbai-embed-large")

def _llm_conf_from_env(prefix: str) -> LLMConf:
    """Build an ``LLMConf`` from the ``<prefix>_*`` environment variables.

    Reads ``<prefix>_MODEL_TYPE``, ``<prefix>_MODEL_NAME``,
    ``<prefix>_MODEL_TEMPERATURE``, ``<prefix>_MODEL_DEPLOYMENT``,
    ``<prefix>_API_KEY`` (note: no ``MODEL_`` infix), ``<prefix>_MODEL_ENDPOINT``
    and ``<prefix>_MODEL_API_VERSION``.

    Args:
        prefix: Environment-variable prefix, e.g. ``"RE"`` or ``"QA"``.

    Returns:
        The ``LLMConf`` built from those variables. Unset variables are passed
        through as ``None``.

    Raises:
        ValueError: If the temperature variable is set but is not a number.
    """
    # os.getenv always yields a string (or None). LLMConf's field types are not
    # visible from this notebook, so parse the number explicitly rather than
    # relying on implicit coercion; unset/empty values are passed through as-is.
    raw_temperature = os.getenv(f"{prefix}_MODEL_TEMPERATURE")
    return LLMConf(
        type=os.getenv(f"{prefix}_MODEL_TYPE"),
        model=os.getenv(f"{prefix}_MODEL_NAME"),
        temperature=float(raw_temperature) if raw_temperature else raw_temperature,
        deployment=os.getenv(f"{prefix}_MODEL_DEPLOYMENT"),
        api_key=os.getenv(f"{prefix}_API_KEY"),
        endpoint=os.getenv(f"{prefix}_MODEL_ENDPOINT"),
        # An empty string is normalised to None.
        api_version=os.getenv(f"{prefix}_MODEL_API_VERSION") or None,
    )


# LLM used for relation extraction (passed to the GraphMiner).
re_model_conf = _llm_conf_from_env("RE")

# LLM used by the question-answering responder.
qa_model = _llm_conf_from_env("QA")

In [None]:
# Pipeline stages, created in the order the ingestion cells apply them.
ingestor = LocalIngestor(source=source)
cleaner = Cleaner()
chunker = Chunker(conf=chunker_conf)
embedder = ChunkEmbedder(conf=embedder_conf)

# The miner receives the relation-extraction LLM settings and the ontology.
graph_miner = GraphMiner(conf=re_model_conf, ontology=ontology)

In [None]:
# Build the graph wrapper from `kg_config`, passing it the embedder's
# `embeddings` object as the embeddings model.
knowledge_graph = KnowledgeGraph(
    conf=kg_config, 
    embeddings_model=embedder.embeddings
)

# Sanity-check the database before ingesting anything.
# NOTE(review): `_driver` is a private attribute of KnowledgeGraph; presumably it
# is the Neo4j driver and this call fails if the server is unreachable — confirm.
knowledge_graph._driver.verify_connectivity()

# Last expression of the cell, so its return value is shown as the cell output.
# Presumably a credentials check on the same driver — confirm.
knowledge_graph._driver.verify_authentication()

## Ingestion

In [None]:
# Load the source documents through the ingestor. Each later pipeline cell rebinds `docs`.
docs = ingestor.batch_ingest()

In [None]:
# Rebinds `docs` to the cleaner's output; re-running this cell on its own
# feeds the already-cleaned documents back in.
docs = cleaner.clean_documents(docs)

In [None]:
# Split every document into chunks (rebinds `docs` to the chunker's output).
docs = chunker.chunk_documents(docs)

# Report the chunk count per document, numbered by position in `docs`.
for doc_index, doc in enumerate(docs):
    print(f"Number of chunks in doc {doc_index}: {len(doc.chunks)}")

In [None]:
# Rebinds `docs` to the embedder's output for the chunked documents.
docs = embedder.embed_documents_chunks(docs)

In [None]:
# Run the graph miner (configured with the ontology) over the documents;
# rebinds `docs` to its output.
docs = graph_miner.mine_graph_from_docs(docs=docs)

In [None]:
# Hand the processed documents to the knowledge graph.
# NOTE(review): presumably this writes to the database, so re-running the cell
# may insert the same documents again — confirm.
knowledge_graph.add_documents(docs)

### Verify Ingestion

In [None]:
# Bare expression: the value of `number_of_docs` is shown as the cell output.
knowledge_graph.number_of_docs

In [None]:
# Bare expression: the value of `number_of_labels` is shown as the cell output.
knowledge_graph.number_of_labels

In [None]:
# Show the graph's `labels`; compare by eye against the ontology's allowed labels.
knowledge_graph.labels

In [None]:
# Bare expression: the value of `number_of_relationships` is shown as the cell output.
knowledge_graph.number_of_relationships

In [None]:
# Show the graph's `relationships`; compare by eye against the ontology's allowed relations.
knowledge_graph.relationships

### Query the KnowledgeGraph

In [None]:
# The same LLM config is currently used for all three `*_llm_conf` parameters.
# TODO try different LLMs
responder = GraphAgentResponder(
    graph=knowledge_graph,
    qa_llm_conf=qa_model,
    cypher_llm_conf=qa_model,
    rephrase_llm_conf=qa_model,
)

In [None]:
# Ask for the countries in the graph; the responder's return value is the cell output.
query = "What countries are mentioned in the graph?"
responder.answer(query)

In [None]:
# Ask which document mentions a given country.
query = "What document mentions Ukraine?"
responder.answer(query)

In [None]:
# A "why" question about two countries at once.
query = "Why are Italy and Portugal mentioned?"
responder.answer(query)

In [None]:
# Open-ended question about events tied to a place.
query = "What happened in the baltic sea?"
responder.answer(query)

In [None]:
# Asks for the date of the same Baltic Sea incident as the previous cell.
query = "When was the incident in the baltic sea?"
responder.answer(query)

In [None]:
# Asks explicitly for the name of the document, not just the document.
query = "What is the name of the document mentioning Italy?"
responder.answer(query)

In [None]:
# Document lookup by a multi-word, title-cased entity name.
query = "What document mentions  the Decarbonisation Of Public Transport?"
responder.answer(query)

In [None]:
# Rephrasing of the previous cell's question that asks for the document's name.
query = "What is the name of the document that mentions the Decarbonisation Of Public Transport?"
responder.answer(query)

In [None]:
# Counting question about chunks rather than documents.
query = "How many chunks mentions Italy?"
responder.answer(query)