In [2]:
import os
from dotenv import load_dotenv

from src.config import LLMConf, EmbedderConf, KnowledgeGraphConfig
from src.factory.embeddings import get_embeddings
from src.graph.knowledge_graph import KnowledgeGraph
from neo4j import Query, Session
from src.schema import Chunk
env = load_dotenv('config.env')

from src.utils.logger import get_logger
logger = get_logger(__name__)

In [30]:
kg_config = KnowledgeGraphConfig(
    uri=os.getenv("NEO4J_URI"),
    user=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
    index_name='vector'
)

embedder_conf = EmbedderConf(
    type=os.getenv("EMBEDDINGS_TYPE"),
    model=os.getenv("EMBEDDINGS_MODEL_NAME"),
)

model_conf=LLMConf(
    type=os.getenv("RE_MODEL_TYPE"),
    model=os.getenv("RE_MODEL_NAME"), 
    temperature=os.getenv("RE_MODEL_TEMPERATURE"), 
    deployment=os.getenv("RE_MODEL_DEPLOYMENT"),
    api_key=os.getenv("RE_API_KEY"),
    endpoint=os.getenv("RE_MODEL_ENDPOINT"),
    api_version=os.getenv("RE_MODEL_API_VERSION") or None
)

embeddings = get_embeddings(conf=embedder_conf)

kg = KnowledgeGraph(conf=kg_config,embeddings_model=embeddings)

In [16]:
communities = kg.get_communities()

chunks_comm_1 = communities[0].chunks

chunks_comm_1



[Chunk(chunk_id='4:44890bb9-4945-4a7c-8f87-381effaa2643:78', text='short quiz about the EU. Passes will be awarded to the top-ranked applicants until all tickets are distributed. Travellers can design their own European journey or choose from suggested routes. They may follow the New European Bauhaus Route, showcasing vibrant, sustainable cities, or choose the Green Route, taking them takes to some of the most sustainable cities and nature-friendly destinations across the continent. This route features award-winning Green Capitals and Green Leaf cities, stunning parks, and nature reserves. Along the way, they can use a discount card packed with thousands of offers on transport, museums, food, accommodation, sports and more across 36 countries. Discover EU will make this experience accessible to all. Special support is available for participants with disabilities, health conditions or fewer opportunities. Additional arrangements are also offered to those living in remote areas, islands 

In [28]:
communities

[Community(community_type='leiden', community_id=0, community_size=34, entity_ids=['4:44890bb9-4945-4a7c-8f87-381effaa2643:65', '4:44890bb9-4945-4a7c-8f87-381effaa2643:66', '4:44890bb9-4945-4a7c-8f87-381effaa2643:68', '4:44890bb9-4945-4a7c-8f87-381effaa2643:69', '4:44890bb9-4945-4a7c-8f87-381effaa2643:71', '4:44890bb9-4945-4a7c-8f87-381effaa2643:78', '4:44890bb9-4945-4a7c-8f87-381effaa2643:79', '4:44890bb9-4945-4a7c-8f87-381effaa2643:80', '4:44890bb9-4945-4a7c-8f87-381effaa2643:81', '4:44890bb9-4945-4a7c-8f87-381effaa2643:82', '4:44890bb9-4945-4a7c-8f87-381effaa2643:90', '4:44890bb9-4945-4a7c-8f87-381effaa2643:91', '4:44890bb9-4945-4a7c-8f87-381effaa2643:92', '4:44890bb9-4945-4a7c-8f87-381effaa2643:135', '4:44890bb9-4945-4a7c-8f87-381effaa2643:136', '4:44890bb9-4945-4a7c-8f87-381effaa2643:137', '4:44890bb9-4945-4a7c-8f87-381effaa2643:138', '4:44890bb9-4945-4a7c-8f87-381effaa2643:139', '4:44890bb9-4945-4a7c-8f87-381effaa2643:140', '4:44890bb9-4945-4a7c-8f87-381effaa2643:141', '4:44890bb

In [None]:
from typing import Any, Dict, List


def get_mentioned_entities(session: Session, chunk: Chunk, n_hops: int=1) -> List[Dict[str, Any]]:
    """ 
    Follows the `MENTIONS` relationships of a given Chunk in the Graph and collects mentioned entities. 
    `n_hops` is used to indicate the number of relationship layers that could be done following entities linking.  
    """
    # TODO perform n-hops retrieval
    base_query = """
        MATCH (c:Chunk)
        WHERE elementId(c) = $elementId
        MATCH (c)-[:MENTIONS]->(mentioned)
        RETURN collect(mentioned) AS mentioned_nodes
    """
    nodes = []
    result= session.run(base_query, elementId=chunk.chunk_id)
    record = result.single()
    mentioned_nodes = record["mentioned_nodes"] if record else []
    for node in mentioned_nodes:
        nodes.append(dict(node)) 
    return nodes

In [13]:
with kg._driver.session() as session:
    mentioned_ents = get_mentioned_entities(session, chunk=chunks_comm_1[0])
    
mentioned_ents

[{'community_leiden': 0,
  'closeness': 0.05023947151114781,
  'name': 'Museums',
  'id': 'Museums',
  'betweenness': 0.0,
  'pagerank': 0.004427561890955535,
  'community_louvain': 8},
 {'community_leiden': 0,
  'closeness': 0.05023947151114781,
  'name': 'Food',
  'id': 'Food',
  'betweenness': 0.0,
  'pagerank': 0.004427561890955535,
  'community_louvain': 8},
 {'community_leiden': 0,
  'closeness': 0.07834945344245407,
  'name': 'Discover Eu',
  'id': 'Discover Eu',
  'betweenness': 0.036396020970560564,
  'pagerank': 0.009603821429715718,
  'community_louvain': 8},
 {'community_leiden': 0,
  'closeness': 0.06048420876267069,
  'name': 'European Journey',
  'id': 'European Journey',
  'betweenness': 0.007259040193574405,
  'pagerank': 0.004320973116776079,
  'community_louvain': 8},
 {'community_leiden': 0,
  'closeness': 0.05023947151114781,
  'name': 'New European Bauhaus Route',
  'id': 'New European Bauhaus Route',
  'betweenness': 0.0,
  'pagerank': 0.005529420132051267,
  'co

In [None]:
from typing import Tuple


def get_adjacent_chunks(session: Session, chunk: Chunk) -> Tuple[Chunk | None, Chunk | None, Chunk | None]:
    """
    Returns a tuple with the previous , current and following `Chunk` 
    given an initial node characterised by a `filename` and a `chunk_id`
    """
    base_query = """ 
        MATCH (current:Chunk)
        WHERE elementId(current) = $elementId

        OPTIONAL MATCH (prev:Chunk)-[:NEXT]->(current)
        OPTIONAL MATCH (current)-[:NEXT]->(next:Chunk)

        RETURN prev AS previous_chunk, current, next AS next_chunk
    """
    try: 
        result = session.run(base_query, elementId=chunk.chunk_id)
        record = result.single()
        
        previous_chunk = dict(record["previous_chunk"]) if record["previous_chunk"] else None
        if previous_chunk:
            previous_chunk = Chunk(
                chunk_id=previous_chunk['chunk_id'],
                filename=previous_chunk['filename'],
                text=previous_chunk["text"],
            )
            chunk.chunk_id = previous_chunk.chunk_id + 1 # original chunk id
        next_chunk = dict(record["next_chunk"]) if record["next_chunk"] else None
        if next_chunk:
            next_chunk = Chunk(
                chunk_id=next_chunk['chunk_id'],
                filename=next_chunk['filename'],
                text=next_chunk["text"],
            )
            chunk.chunk_id = next_chunk.chunk_id-1 # original chunk id
        
        return previous_chunk, chunk, next_chunk
    except Exception as e:
        logger.warning(f"Unable to retrieve adjacent chunks for Chunk: {chunk.chunk_id}")
        return None, chunk, None


In [26]:
with kg._driver.session() as session:
    ajacent_chunks = get_adjacent_chunks(session, chunk=chunks_comm_1[1])
    
ajacent_chunks

(Chunk(chunk_id=2, text='short quiz about the EU. Passes will be awarded to the top-ranked applicants until all tickets are distributed. Travellers can design their own European journey or choose from suggested routes. They may follow the New European Bauhaus Route, showcasing vibrant, sustainable cities, or choose the Green Route, taking them takes to some of the most sustainable cities and nature-friendly destinations across the continent. This route features award-winning Green Capitals and Green Leaf cities, stunning parks, and nature reserves. Along the way, they can use a discount card packed with thousands of offers on transport, museums, food, accommodation, sports and more across 36 countries. Discover EU will make this experience accessible to all. Special support is available for participants with disabilities, health conditions or fewer opportunities. Additional arrangements are also offered to those living in remote areas, islands or outermost regions. Launched in 2018 and

In [38]:
def filter_graph_by_communities(session: Session, community_ids: List[int], community_type: str="leiden"):
    """
    Creates a temporary  view of the Knowledge Graph to filter it into subgraphs given community ids.
    """
    query = f"""
        MATCH (n)-[r]->(m)
        WHERE n.community_{community_type} IN $community_values
        RETURN n, r, m
    """
    try:
        result = session.run(query, community_values=community_ids)
        
        subgraph = []
        
        for record in result:
            # Collecting nodes and relationships as dictionaries
            subgraph.append({
                "node_1": dict(record["n"]),
                "relationship": dict(record["r"]),
                "node_2": dict(record["m"])
            })
        
        return subgraph
    
    except Exception as e:
        print(f"Error while fetching subgraph: {e}")
        return []

In [39]:
with kg._driver.session() as session:
    subgraph = filter_graph_by_communities(session, community_ids=[3, 4])
    
subgraph

[{'node_1': {'community_leiden': 4,
   'closeness': 0.009909165978530138,
   'name': 'Henna Virkkunen',
   'id': 'Henna Virkkunen',
   'betweenness': 0.0,
   'title': 'Executive Vice-President for Tech Sovereignty, Security and Democracy',
   'pagerank': 0.003977060150080188,
   'community_louvain': 4},
  'relationship': {},
  'node_2': {'community_leiden': 4,
   'closeness': 0.06556970105322883,
   'name': 'Europe',
   'id': 'Europe',
   'betweenness': 0.005567504592911235,
   'pagerank': 0.012338619258302019,
   'community_louvain': 4}},
 {'node_1': {'community_leiden': 4,
   'closeness': 0.06556970105322883,
   'name': 'Europe',
   'id': 'Europe',
   'betweenness': 0.005567504592911235,
   'pagerank': 0.012338619258302019,
   'community_louvain': 4},
  'relationship': {},
  'node_2': {'community_leiden': 4,
   'closeness': 0.054589031439447336,
   'name': 'Steel',
   'id': 'Steel',
   'betweenness': 0.0,
   'pagerank': 0.007698739564014886,
   'community_louvain': 4}},
 {'node_1': {