## 3. Neo4j Cypher generation using community levels generated by the Leiden algorithm

In [None]:

## Load packages

import asyncio
import pandas as pd
import os
import openai
from openai import OpenAI
from dotenv import load_dotenv
from sqlalchemy import create_engine
from tqdm import tqdm
from string import Template
import json 
from neo4j import GraphDatabase
from timeit import default_timer as timer 
from time import sleep 
#import nest_asyncio
#import sql
import psycopg2
from pprint import pprint
import igraph as ig
import leidenalg as la
import re

In [None]:
## Populate the environment through python-dotenv before reading any setting
load_dotenv()

## OpenAI API key
openai_api_key = os.getenv('OPENAI_API_KEY')

## Neo4j connection settings, read together, then the driver built from them
neo4j_url, neo4j_user, neo4j_password = (
    os.getenv("NEO4J_URI"),
    os.getenv("NEO4J_USERNAME"),
    os.getenv("NEO4J_PASSWORD"),
)
gds = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))

In [1]:
import json
import igraph as ig
from leidenalg import find_partition, ModularityVertexPartition

def load_json(file_path):
    """Read the file at *file_path* and return its parsed JSON content.

    The file is decoded explicitly as UTF-8 so that non-ASCII text is read the
    same way regardless of the platform's default locale encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def extract_entities_and_relationships(file_path):
    """Collect entities, edges and relationship types from a JSON graph file.

    The file is read with ``load_json`` and iterated as a sequence of graph
    objects; falsy items are skipped. Every entry under a graph's
    ``'entities'`` key is indexed by its ``'id'``. Every entry under
    ``'relationships'`` is parsed as a ``source[RELATION]target`` string.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A tuple ``(entities, edges, relationship_types)``: ``entities`` maps
        an entity id to its entity dict, ``edges`` is a list of
        ``(source, target)`` tuples, and ``relationship_types`` maps
        ``(source, target)`` to the relation name found between the brackets.
        When the same ``(source, target)`` pair occurs more than once, the
        last relation seen is the one kept in ``relationship_types``.
    """
    data = load_json(file_path)
    entities = {}
    edges = []
    relationship_types = {}

    # Iterate over each graph in the JSON data
    for graph in data:
        if not graph:
            continue

        # Index every entity by its id
        for entity in graph.get('entities', []):
            entities[entity['id']] = entity

        # Split each relationship string into source, relation and target
        for relationship in graph.get('relationships', []):
            source, opening, remainder = relationship.partition('[')
            relation, closing, target = remainder.partition(']')
            if not (opening and closing):
                # A string without both brackets cannot be split into three
                # parts; skip it rather than abort the whole extraction.
                continue
            edges.append((source, target))
            # The relation is only the text between '[' and ']'; slicing the
            # tail of the string instead would drag the target name into it.
            relationship_types[(source, target)] = relation

    return entities, edges, relationship_types

def create_graph(entities, edges):
    """Build an igraph ``Graph`` with one vertex per entity id.

    Args:
        entities: Mapping whose keys are used as the vertices to add.
        edges: Iterable of ``(source, target)`` pairs.

    Returns:
        The populated ``ig.Graph``. Pairs whose source or target is not a key
        of ``entities`` are left out.
    """
    graph = ig.Graph()

    # One vertex per entity id
    graph.add_vertices(list(entities))

    # Keep only the pairs whose two endpoints are known entities
    known_edges = []
    for src, dst in edges:
        if src in entities and dst in entities:
            known_edges.append((src, dst))
    graph.add_edges(known_edges)

    return graph

def apply_leiden_algorithm(graph):
    """Partition *graph* with ``find_partition`` and label each vertex.

    Args:
        graph: Graph whose vertices carry a ``'name'`` attribute.

    Returns:
        A dict mapping each vertex ``'name'`` to the membership value that
        the ``ModularityVertexPartition`` result holds at that vertex's index.
        The dict is also printed before being returned.
    """
    partition = find_partition(graph, ModularityVertexPartition)
    membership = partition.membership

    community_labels = {}
    for vertex in graph.vs:
        community_labels[vertex['name']] = membership[vertex.index]

    print(community_labels)
    return community_labels

def _cypher_quote(value):
    """Return *value* as text that can sit inside a single-quoted Cypher string."""
    # Backslashes are doubled first so the backslash added in front of each
    # quote is not itself doubled afterwards.
    return str(value).replace("\\", "\\\\").replace("'", "\\'")


def generate_cypher_statements(entities, edges, community_labels, relationship_types):
    """Generate Cypher statements to ingest data into Neo4j.

    One ``MERGE`` statement is produced per entity, followed by one
    ``MATCH ... MERGE`` statement per edge that has a relationship type.

    Args:
        entities: Mapping of entity id to an entity dict. Each dict must have
            a ``'label'`` key; every key with a truthy value becomes a node
            property.
        edges: Iterable of ``(source, target)`` id pairs.
        community_labels: Mapping of entity id to its community number.
        relationship_types: Mapping of ``(source, target)`` to the
            relationship type; edges without a truthy entry are skipped.

    Returns:
        A list of Cypher statement strings, node statements first.

    Raises:
        KeyError: If an entity has no ``'label'`` key or its id is missing
            from ``community_labels``.

    Note:
        Ids and property values are escaped for use inside single-quoted
        string literals. Node labels, property keys and relationship types
        are inserted verbatim.
    """
    cypher_statements = []

    # Create nodes
    for entity_id, entity_data in entities.items():
        properties = ", ".join(
            f"{key}: '{_cypher_quote(value)}'"
            for key, value in entity_data.items()
            if value
        )
        community_label = community_labels[entity_id]
        cypher_statements.append(
            f"MERGE (n:{entity_data['label']} {{id: '{_cypher_quote(entity_id)}'}}) "
            f"SET n += {{{properties}}}, n.community = {community_label};"
        )

    # Create relationships
    for source, target in edges:
        relationship_type = relationship_types.get((source, target))
        if relationship_type:
            cypher_statements.append(
                f"MATCH (a {{id: '{_cypher_quote(source)}'}}), "
                f"(b {{id: '{_cypher_quote(target)}'}}) "
                f"MERGE (a)-[:{relationship_type}]->(b);"
            )

    return cypher_statements

def process_graph_data(file_path):
    """Turn the JSON graph file at *file_path* into Cypher statements.

    Chains the steps defined in this file: extraction of entities and
    relationships, graph construction, community detection, and Cypher
    generation.

    Args:
        file_path: Path of the JSON file to process.

    Returns:
        The list returned by ``generate_cypher_statements``.
    """
    # Step 1: pull entities, edges and relation names out of the file
    entities, edges, relationship_types = extract_entities_and_relationships(file_path)

    # Step 2: build the graph and label each entity with its community
    community_labels = apply_leiden_algorithm(create_graph(entities, edges))

    # Step 3: render everything as Cypher
    return generate_cypher_statements(entities, edges, community_labels, relationship_types)
        

# NOTE: this file must contain the entities and relationships extracted from
# the documents in the psql database.
# REPLACE WITH THE CORRECT FILE PATH FOR YOUR TASK.
process_graph_data('files/entities_and_relations_2.json')


KeyboardInterrupt: 

In [None]:
# Execute the generated Cypher queries
def ingestion_pipeline_with_communities(cypher_statements):
    """Run each Cypher statement through the module-level ``gds`` driver.

    A statement that executes is echoed with an ``Executed:`` prefix. Any
    exception raised for a statement is caught, and the statement plus the
    exception text is appended to ``files/failed_statements.txt``; the loop
    then continues with the next statement.

    Args:
        cypher_statements: Iterable of Cypher statement strings.
    """
    for statement in cypher_statements:
        try:
            gds.execute_query(statement)
            print(f"Executed: {statement}")
        except Exception as error:
            failure_line = f"{statement} - Exception: {error}\n"
            with open("files/failed_statements.txt", "a") as failure_log:
                failure_log.write(failure_line)

# Build the Cypher statements from the extracted entities/relationships file.
cypher_statements = process_graph_data('files/entities_and_relations_2.json')

# Run the ingestion pipeline; failed statements are appended to
# files/failed_statements.txt by ingestion_pipeline_with_communities.
ingestion_pipeline_with_communities(cypher_statements)

{'predictingOurClimateFuture': 241, 'davidStainforth': 241, 'lseEventDavidStainforth': 241, 'wellbeingAndMentalHealthPolicy': 15, 'disabilityPolicy': 15, 'staffCounsellingServices': 15, 'employeeAssistanceProgramme': 15, 'staffWellbeingWebpages': 15, 'studentServicesCentre': 0, 'temporaryCounterSpace': 498, 'departmentOfSocialPolicy': 7, 'profJohnHills': 7, 'profEileenMunro': 7, 'profTimNewburn': 7, 'lse': 26, 'departmentOfEqualityRightsAndCitizenship': 195, 'lgbtHistoryResearch': 195, 'lgbtHistoryArchives': 195, 'lgbtHistoryExhibit': 195, 'supportForCarers': 15, 'employersForCarersMembership': 15, 'flexibleWorkingToolkit': 15, 'timeOffForEmergencies': 15, 'compassionateLeave': 15, 'behaviouralLab': 14, 'participantDatabase': 14, 'behaviouralResearchStudies': 14, 'feministPeaceActivism': 326, 'centuryOfFeministPeaceActivismPublication': 326, 'departmentOfGeographyAndEnvironment': 3, 'departmentOfMethodology': 35, 'anthropologyDepartment': 19, 'schoolOfPublicPolicy': 20, 'internationalR