In [1]:
import json
import os
import textwrap
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from dotenv import load_dotenv

load_dotenv()

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI

# Warning control
import warnings
warnings.filterwarnings("ignore")

## Create a graph from the RVT data

In [None]:
# Directory that holds the exported graph: 'v-*.json' vertex files and 'e-*.json' edge files.
path_to_graph_data = '../data/processed/rvt/case-autcon'

# Maps an edge file's stem (file name without '.json') to the relationship label
# stored in the graph. Edge files whose stem is not listed here are skipped on load.
RELATION_LABELS: Dict[str, str] = {
    'e-adjacent_connectivity': 'ADJACENT',
    'e-spatial_containment': 'CONTAINED',
    'e-structural_support': 'SUPPORTS',
    'e-accessible_connectivity': 'ACCESSIBLE',
    'e-locational_alignment': 'ALIGNED',
}

def load_json_files(
    data_dir: str,
    relation_labels: Optional[Dict[str, str]] = None,
) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, List[Tuple[str, str]]]]:
    """Load all vertex and edge JSON files from the data directory.

    Args:
        data_dir: Directory containing 'v-<type>.json' vertex files
            (node_id -> properties) and 'e-<relation>.json' edge files
            (node-type pair -> list of [from_id, to_id]).
        relation_labels: Mapping from edge-file stem to relationship label.
            Defaults to the module-level RELATION_LABELS.

    Returns:
        - all_vertices: node_id -> properties (with 'node_type')
        - all_edges: relation_label -> list of (from_id, to_id)

    Raises:
        FileNotFoundError: If data_dir is not an existing directory. Without
            this check a mistyped path would silently produce an empty graph.
    """
    data_path = Path(data_dir)
    if not data_path.is_dir():
        raise FileNotFoundError(f"Graph data directory not found: {data_path}")
    if relation_labels is None:
        relation_labels = RELATION_LABELS

    # Find all JSON files (sorted so the load order is deterministic)
    vertex_files = sorted(data_path.glob('v-*.json'))
    edge_files = sorted(data_path.glob('e-*.json'))

    print(f"Found {len(vertex_files)} vertex files and {len(edge_files)} edge files")

    # Load all vertex files into a single dictionary
    all_vertices: Dict[str, Dict[str, Any]] = {}

    for v_file in vertex_files:
        vertex_type = v_file.stem[2:]  # Remove 'v-' prefix
        with open(v_file, 'r', encoding='utf-8') as f:
            vertices = json.load(f)
        for node_id, properties in vertices.items():
            # The node type is taken from the file name and stored with the properties
            all_vertices[node_id] = {**properties, 'node_type': vertex_type}

        print(f"  Loaded {len(vertices)} {vertex_type} nodes")

    # Load all edge files, grouping by semantic relation label inferred from filename
    all_edges: Dict[str, List[Tuple[str, str]]] = {}
    for e_file in edge_files:
        rel_label = relation_labels.get(e_file.stem)
        if rel_label is None:
            print(f"  Skipping unknown edge file type: {e_file.name}")
            continue

        with open(e_file, 'r', encoding='utf-8') as f:
            edges_obj = json.load(f)

        # Each key inside is a node-type pair (e.g., 'space-stair'); we ignore the key
        # and collect all pairs under the semantic relation label.
        total_in_file = 0
        for edge_list in edges_obj.values():
            all_edges.setdefault(rel_label, []).extend((frm, to) for frm, to in edge_list)
            total_in_file += len(edge_list)

        print(f"  Loaded {total_in_file} '{rel_label}' edges from {e_file.name}")

    print(f"\nTotal nodes: {len(all_vertices)}")
    print(f"Total relation labels: {len(all_edges)}")
    print(f"Total relationships: {sum(len(edges) for edges in all_edges.values())}")

    return all_vertices, all_edges

# Load all data
# Both results are kept at module level; the Neo4j import cell reads them.
all_vertices, all_edges = load_json_files(path_to_graph_data)


Found 8 vertex files and 5 edge files
  Loaded 120 column nodes
  Loaded 135 door nodes
  Loaded 19 separationline nodes
  Loaded 5 slab nodes
  Loaded 108 space nodes
  Loaded 7 stair nodes
  Loaded 191 wall nodes
  Loaded 87 window nodes
  Loaded 582 'ACCESSIBLE' edges from e-accessible_connectivity.json
  Loaded 1294 'ADJACENT' edges from e-adjacent_connectivity.json
  Loaded 673 'ALIGNED' edges from e-locational_alignment.json
  Loaded 289 'CONTAINED' edges from e-spatial_containment.json
  Loaded 1769 'SUPPORTS' edges from e-structural_support.json

Total nodes: 672
Total relation labels: 5
Total relationships: 4607


In [6]:
# Connect to Neo4j using Langchain Neo4jGraph
# Connection settings come from the environment (NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD);
# the second argument of each os.getenv call is the fallback used when the variable is unset.
graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI", "bolt://localhost:7687"),
    username=os.getenv("NEO4J_USER", "neo4j"),
    password=os.getenv("NEO4J_PASSWORD", 'password'),
    database="neo4j"
)

def clear_database(graph: "Neo4jGraph") -> int:
    """Clear all nodes and relationships from the database.

    Returns:
        The number of nodes that existed before the deletion, i.e. how many
        nodes were removed.
    """
    # Count first: counting after the DETACH DELETE would always yield 0
    before = graph.query("MATCH (n) RETURN count(n) as count")
    existing = before[0]["count"] if before else 0
    graph.query("MATCH (n) DETACH DELETE n")
    return existing

def create_nodes_batch(graph: "Neo4jGraph", nodes_batch: List[Tuple[str, Dict[str, Any]]]) -> int:
    """Create the nodes of a batch with one UNWIND query per node label.

    Args:
        graph: Object exposing ``query(cypher, params=...)``.
        nodes_batch: (node_id, properties) pairs. 'node_type' selects the label
            (capitalized; 'Entity' when absent) and is not stored as a property.
            The node id is stored in the 'id' property and overrides any 'id'
            already present in the properties.

    Returns:
        The number of nodes submitted for creation.
    """
    # Group rows by label: a label cannot be a query parameter, so each label
    # needs its own query text, but all rows of one label share a single round trip.
    rows_by_label: Dict[str, List[Dict[str, Any]]] = {}
    for node_id, properties in nodes_batch:
        label = properties.get('node_type', 'Entity').capitalize()
        props = {k: v for k, v in properties.items() if k != 'node_type'}
        props['id'] = node_id
        rows_by_label.setdefault(label, []).append(props)

    created = 0
    for label, rows in rows_by_label.items():
        # Backtick-quote the label (doubling embedded backticks) so unusual
        # type names cannot break or alter the query.
        safe_label = label.replace('`', '``')
        query = f"UNWIND $rows AS props CREATE (n:`{safe_label}`) SET n = props"
        graph.query(query, params={"rows": rows})
        created += len(rows)

    return created

def create_relationships_batch(graph: "Neo4jGraph", edges_batch: List[Tuple[str, str, str]]) -> int:
    """Create the relationships of a batch with one UNWIND query per relationship type.

    Args:
        graph: Object exposing ``query(cypher, params=...)`` that returns a list of row dicts.
        edges_batch: (from_id, to_id, edge_type) triples. Endpoints are looked up by
            their 'id' property; the type is upper-cased and '-' is replaced by '_'.

    Returns:
        The number of relationships the database reports as created. Edges whose
        endpoints do not exist create nothing and are therefore not counted.
    """
    # Group endpoint pairs by relationship type: the type cannot be a query
    # parameter, so each type needs its own query text.
    pairs_by_type: Dict[str, List[Dict[str, str]]] = {}
    for from_id, to_id, edge_type in edges_batch:
        rel_label = edge_type.upper().replace('-', '_')
        pairs_by_type.setdefault(rel_label, []).append({"from_id": from_id, "to_id": to_id})

    created = 0
    for rel_label, pairs in pairs_by_type.items():
        # Backtick-quote the type (doubling embedded backticks) so it is always
        # parsed as a single identifier.
        safe_label = rel_label.replace('`', '``')
        query = f"""
        UNWIND $pairs AS pair
        MATCH (a) WHERE a.id = pair.from_id
        MATCH (b) WHERE b.id = pair.to_id
        CREATE (a)-[r:`{safe_label}`]->(b)
        RETURN count(r) AS created
        """
        result = graph.query(query, params={"pairs": pairs})
        created += result[0]["created"] if result else 0

    return created


# Start from an empty database: nodes are inserted with CREATE, so re-running
# this cell on top of existing data would duplicate them.
print("Clearing existing database...")
count = clear_database(graph)
print(f"Deleted {count} existing nodes")

# Create all nodes in batches
print("Creating nodes...")
batch_size = 100
progress_every = 5  # report progress every 5 batches (500 items at batch_size = 100)
node_count = 0
nodes_list = list(all_vertices.items())

for batch_no, i in enumerate(range(0, len(nodes_list), batch_size), start=1):
    # Accumulate what the helper reports instead of assuming the whole batch succeeded
    node_count += create_nodes_batch(graph, nodes_list[i:i+batch_size])
    if batch_no % progress_every == 0:
        print(f"  Created {node_count}/{len(nodes_list)} nodes...")

print(f"Created {node_count} nodes")

# Create all relationships in batches
print("\nCreating relationships...")
rel_count = 0
edges_list = []

# Flatten relation_label -> [(from_id, to_id)] into (from_id, to_id, relation_label) triples
for edge_type, edges in all_edges.items():
    print(f"  Preparing {edge_type} relationships ({len(edges)} edges)...")
    edges_list.extend((from_id, to_id, edge_type) for from_id, to_id in edges)

print(f"Creating {len(edges_list)} total relationships in batches...")
for batch_no, i in enumerate(range(0, len(edges_list), batch_size), start=1):
    rel_count += create_relationships_batch(graph, edges_list[i:i+batch_size])
    if batch_no % progress_every == 0:
        print(f"  Created {rel_count}/{len(edges_list)} relationships...")

print(f"Created {rel_count} relationships")
if rel_count != len(edges_list):
    # Only possible when the helper reports a count that differs from the batch size
    print(f"  Warning: expected {len(edges_list)} relationships; "
          "check for edges that reference missing or duplicated node ids")
print("\nDone! Data successfully loaded into Neo4j.")

Creating nodes...
Clearing existing database...
Deleted 0 existing nodes
  Created 500/672 nodes...
Created 672 nodes

Creating relationships...
  Preparing ACCESSIBLE relationships (582 edges)...
  Preparing ADJACENT relationships (1294 edges)...
  Preparing ALIGNED relationships (673 edges)...
  Preparing CONTAINED relationships (289 edges)...
  Preparing SUPPORTS relationships (1769 edges)...
Creating 4607 total relationships in batches...
  Created 500/4607 relationships...
  Created 1000/4607 relationships...
  Created 1500/4607 relationships...
  Created 2000/4607 relationships...
  Created 2500/4607 relationships...
  Created 3000/4607 relationships...
  Created 3500/4607 relationships...
  Created 4000/4607 relationships...
  Created 4500/4607 relationships...
Created 4607 relationships

Done! Data successfully loaded into Neo4j.


In [7]:
from typing import List

# Verify the data was loaded successfully using Neo4jGraph
def get_stats(graph: Neo4jGraph) -> tuple:
    """Summarise the loaded graph.

    Returns:
        A 2-tuple ``(node_counts, rel_counts)`` holding the raw results of
        ``graph.query``: node counts per label and relationship counts per
        type, each ordered by descending count.
    """
    # One row per label; a node contributes to every label it carries
    per_label_cypher = """
        MATCH (n)
        WITH labels(n) as labels
        UNWIND labels as label
        RETURN label, count(*) as count
        ORDER BY count DESC
    """

    # One row per relationship type
    per_type_cypher = """
        MATCH ()-[r]->()
        RETURN type(r) as rel_type, count(*) as count
        ORDER BY count DESC
    """

    # Tuple elements are evaluated left to right: node query first, then relationships
    return graph.query(per_label_cypher), graph.query(per_type_cypher)

def get_sample_relationships(graph: "Neo4jGraph", rel_type: str, limit: int = 5) -> List:
    """Get sample relationships of a given type.

    Args:
        graph: Object exposing ``query(cypher, params=...)``.
        rel_type: Relationship type to sample. It is embedded in the query text
            (a type cannot be a parameter), so it is backtick-quoted with embedded
            backticks doubled.
        limit: Maximum number of rows to return.

    Returns:
        The rows returned by ``graph.query``, each with the keys
        'from_id', 'to_id', 'from_type' and 'to_type'.
    """
    safe_type = rel_type.replace('`', '``')
    query = f"""
    MATCH (a)-[r:`{safe_type}`]->(b)
    RETURN a.id as from_id, b.id as to_id, 
           labels(a)[0] as from_type, labels(b)[0] as to_type
    LIMIT $limit
    """
    return graph.query(query, params={"limit": limit})

# Get node statistics
node_counts, rel_counts = get_stats(graph)

print("=== NODE STATISTICS ===")
# Count nodes directly: summing the per-label rows would count a node once
# for every label it carries.
total_nodes = graph.query("MATCH (n) RETURN count(n) AS count")[0]["count"]
print(f"Total nodes: {total_nodes}")
for row in node_counts:
    print(f"  {row['label']}: {row['count']}")

print("\n=== RELATIONSHIP STATISTICS ===")
total_rels = sum(row["count"] for row in rel_counts)
print(f"Total relationships: {total_rels}")
for row in rel_counts:
    print(f"  {row['rel_type']}: {row['count']}")

# Show sample relationships for the first listed relationship type
if rel_counts:
    first_rel_type = rel_counts[0]["rel_type"]
    print(f"\n=== SAMPLE RELATIONSHIPS (type: {first_rel_type}) ===")
    samples = get_sample_relationships(graph, first_rel_type, 5)
    for row in samples:
        # str() keeps the slice valid when an id is missing or not a string
        from_id = str(row['from_id'])[:10]
        to_id = str(row['to_id'])[:10]
        print(f"  {row['from_type']}({from_id}...) -> {row['to_type']}({to_id}...)")

=== NODE STATISTICS ===
Total nodes: 672
  Wall: 191
  Door: 135
  Column: 120
  Space: 108
  Window: 87
  Separationline: 19
  Stair: 7
  Slab: 5

=== RELATIONSHIP STATISTICS ===
Total relationships: 4607
  SUPPORTS: 1769
  ADJACENT: 1294
  ALIGNED: 673
  ACCESSIBLE: 582
  CONTAINED: 289

=== SAMPLE RELATIONSHIPS (type: SUPPORTS) ===
  Column(2683100...) -> Column(2683143...)
  Column(2683101...) -> Column(2683144...)
  Column(2683103...) -> Column(2683146...)
  Column(2610245...) -> Column(2683119...)
  Column(2572075...) -> Column(2601466...)


## Query the graph

In [2]:
# Connect to Neo4j using Langchain Neo4jGraph
# Same connection settings as in the graph-creation section.
# NOTE(review): presumably repeated so the query section can run without re-importing the data — confirm.
graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI", "bolt://localhost:7687"),
    username=os.getenv("NEO4J_USER", "neo4j"),
    password=os.getenv("NEO4J_PASSWORD", 'password'),
    database="neo4j"
)

In [3]:
# List every node label and relationship type with its property keys,
# using the apoc.meta.schema procedure (so the APOC plugin must be available).
schema_json = graph.query("""CALL apoc.meta.schema() YIELD value
UNWIND keys(value) AS name
WITH name, value[name] AS data
WHERE data.type IN ['node', 'relationship']
RETURN 
    name, 
    data.type AS type, 
    keys(data.properties) AS properties
ORDER BY type, name
""")

# Display the schema rows as the cell output
schema_json


[{'name': 'Column',
  'type': 'node',
  'properties': ['ifc_guid', 'location', 'id', 'level_id']},
 {'name': 'Door',
  'type': 'node',
  'properties': ['ifc_guid', 'location', 'id', 'level_id']},
 {'name': 'Separationline',
  'type': 'node',
  'properties': ['ifc_guid', 'location', 'id', 'level_id']},
 {'name': 'Slab',
  'type': 'node',
  'properties': ['ifc_guid', 'location', 'id', 'level_id']},
 {'name': 'Space',
  'type': 'node',
  'properties': ['ifc_guid',
   'bound_phy',
   'number',
   'bound_vrt',
   'level_id',
   'width',
   'length',
   'name',
   'location',
   'id']},
 {'name': 'Stair',
  'type': 'node',
  'properties': ['ifc_guid',
   'top_elevation',
   'location',
   'id',
   'base_elevation',
   'horizontal_dimensions']},
 {'name': 'Wall',
  'type': 'node',
  'properties': ['ifc_guid',
   'location',
   'is_room_bounding',
   'id',
   'level_id']},
 {'name': 'Window',
  'type': 'node',
  'properties': ['ifc_guid', 'location', 'id', 'level_id']},
 {'name': 'ACCESSIBLE',

In [4]:
# One sample node per first label. Both the query text and its result are
# passed to the Cypher-generation prompt as a worked example.
all_nodes_query = """MATCH (n)
    WHERE size(labels(n)) > 0 
    WITH labels(n)[0] AS label, collect(n) AS nodes
    RETURN label, nodes[0] AS sampleNode"""

all_nodes =graph.query(all_nodes_query)

# Display the sample nodes as the cell output
all_nodes

[{'label': 'Column',
  'sampleNode': {'ifc_guid': '2wwkhajRz9dOXaXiRcjNmt',
   'level_id': '79627',
   'location': [96.06087148282784, 102.24828131414463, 24.27821522309711],
   'id': '2683100'}},
 {'label': 'Door',
  'sampleNode': {'ifc_guid': '2P0Clh2c952RPyuPFEYgcQ',
   'level_id': '24770',
   'location': [28.524353627965954, 43.365276144551295, 0.0],
   'id': '2642808'}},
 {'label': 'Separationline',
  'sampleNode': {'ifc_guid': '1yPJv2Vxf8r9twVNmFuZ2v',
   'level_id': '2680262',
   'location': [39.08435362796594,
    27.16527614455126,
    11.1,
    39.08435362796593,
    21.56527614455119,
    11.1],
   'id': '2779188'}},
 {'label': 'Slab',
  'sampleNode': {'ifc_guid': '2wwkhajRz9dOXaXiRcjMhM',
   'level_id': '2680262',
   'location': [34.269353627965906, 27.47527614455132, 10.950000000000003],
   'id': '2680381'}},
 {'label': 'Space',
  'sampleNode': {'ifc_guid': '3RD7PAHIz7R9lDzWO$cjQK',
   'bound_phy': ['2670852', '2670853', '2534518', '2535474'],
   'number': '110',
   'bound

In [5]:
# Issues reported by the rule checker. The path is relative to the notebook,
# like path_to_graph_data, so the cell also runs outside the original author's machine.
issues_path = Path('../data/processed/rule/case-autcon/issues.json')

with open(issues_path, 'r', encoding='utf-8') as f:
    issues = json.load(f)

# Preview the first two issues
issues[:2]

[{'regulation_clause': '1007_1_1',
  'core_component_type': 'Conference Room',
  'core_component_type_number': '306',
  'core_component_GUID': '1W6yAWYtbDbPaMjexfopdL',
  'checking_variable_name': 'Required door separation',
  'checking_variable_unit': 'feet',
  'required_variable_value': '20.82',
  'actual_variable_value': '17.37',
  'bcf_id': 'df6a3d56-8685-4675-b5a2-7650e85088db'},
 {'regulation_clause': '1007_1_1',
  'core_component_type': 'Conference Room',
  'core_component_type_number': '307',
  'core_component_GUID': '1W6yAWYtbDbPaMjexfopdJ',
  'checking_variable_name': 'Required door separation',
  'checking_variable_unit': 'feet',
  'required_variable_value': '24.51',
  'actual_variable_value': '17.40',
  'bcf_id': '21a38776-fb87-4b71-b5c2-e92e23f2182a'}]

In [6]:
# OpenRouter settings come from the environment. The key is None when
# OPENROUTER_API_KEY is unset; the base URL falls back to the public endpoint.
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
OPENROUTER_BASE_URL = os.getenv('OPENROUTER_BASE_URL', 'https://openrouter.ai/api/v1')

# Prompt for the Cypher-generation step.
# Placeholders: {schema}, {question}, {all_nodes_query} and {all_nodes}.
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to 
query a graph database.
Instructions:
Use only the provided relationship types and properties in the 
schema. Do not use any other relationship types or properties that 
are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than 
for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Be specific with your Cypher statements. You can make it less restrictive in a second step if no results are found.

For example, if one wants to retrieve examples for all the nodes, one could use the following Cypher statement:
- Example query: {all_nodes_query}
- Example result: {all_nodes}


The question is:
{question}"""

In [7]:
# Wrap the template; the input variables match the four placeholders in CYPHER_GENERATION_TEMPLATE.
CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question", "all_nodes_query", "all_nodes"],
    template=CYPHER_GENERATION_TEMPLATE
)

# Chat model reached through the OpenRouter base URL, using the key read from the environment.
llm = ChatOpenAI(temperature=0, openai_api_key=OPENROUTER_API_KEY, model="gpt-4o-mini", base_url=OPENROUTER_BASE_URL)

# Question -> Cypher -> answer chain over `graph`.
# NOTE(review): allow_dangerous_requests=True presumably acknowledges that LLM-generated
# Cypher is run against the database — confirm the DB user's permissions are appropriate.
cypherChain = GraphCypherQAChain.from_llm(
    llm,
    allow_dangerous_requests=True,
    graph=graph,
    verbose=True,
    return_intermediate_steps=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
)

def prettyCypherChain(question: str) -> None:
    """Run the Cypher QA chain for a question and print the wrapped response.

    The module-level ``all_nodes_query`` and ``all_nodes`` are supplied as the
    example query and result expected by the Cypher-generation prompt.

    Args:
        question: Natural-language question passed to the chain as 'query'.

    Returns:
        None. The chain response is printed, wrapped at 160 columns.
    """
    response = cypherChain.invoke({
        "query": question,
        "all_nodes_query": all_nodes_query,
        "all_nodes": str(all_nodes),
    })
    print(textwrap.fill(str(response), 160))

In [8]:
# Ask the chain about the second issue; the issue dict is embedded verbatim in the question.
prettyCypherChain(f"I've got the following issue: {issues[1]}. How could I fix it?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (s:Space {id: '307'})-[:ACCESSIBLE]->(d:Door)
WHERE d.ifc_guid = '1W6yAWYtbDbPaMjexfopdJ'
RETURN s, d
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m
{'query': "I've got the following issue: {'regulation_clause': '1007_1_1', 'core_component_type': 'Conference Room', 'core_component_type_number': '307',
'core_component_GUID': '1W6yAWYtbDbPaMjexfopdJ', 'checking_variable_name': 'Required door separation', 'checking_variable_unit': 'feet',
'required_variable_value': '24.51', 'actual_variable_value': '17.40', 'bcf_id': '21a38776-fb87-4b71-b5c2-e92e23f2182a'}. How could I fix it?",
'all_nodes_query': 'MATCH (n)\n    WHERE size(labels(n)) > 0 \n    WITH labels(n)[0] AS label, collect(n) AS nodes\n    RETURN label, nodes[0] AS sampleNode',
'all_nodes': "[{'label': 'Column', 'sampleNode': {'ifc_guid': '2wwkhajRz9dOXaXiRcjNmt', 'level_id': '79627', 'location': [96.06087148282784, 1

In [None]:
from langchain.agents import Tool, initialize_agent, AgentType

# Expose the Cypher QA chain as an agent tool. The lambda supplies the example
# query and result that the Cypher-generation prompt expects alongside the question.
tool = Tool(
    name="graph-cypher-qa",
    func=lambda q: cypherChain.invoke({"query": q,
    "all_nodes_query": all_nodes_query,
    "all_nodes": str(all_nodes)}),
    description="Answer questions about the building graph using Cypher."
)

# Zero-shot ReAct agent limited to 4 iterations.
# NOTE(review): early_stopping_method="generate" presumably asks the LLM for a final
# answer once the iteration limit is hit — confirm against the LangChain docs.
agent = initialize_agent(
    tools=[tool],
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    max_iterations=4,
    early_stopping_method="generate"
)

# Ask about the second issue and print the agent's final answer
result = agent.run(f"""I've got the following issue: {issues[1]}. How could I fix it? Provide a high-level description of what elements of the building are involved and what type of action is needed. 
        Note: Ignore the naming convention of the issue, just focus on the graph schema when writing the Cypher query.""")
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo address the issue regarding the required door separation in the conference room, I need to identify the relevant elements in the building graph that pertain to the conference room and the door separation requirement. This will involve querying the graph to find the conference room component and any associated door elements that may need to be adjusted to meet the required separation distance.

Action: graph-cypher-qa
Action Input: "MATCH (c:CoreComponent {type: 'Conference Room', number: '307'})-[:HAS_DOOR]->(d:Door) RETURN c, d"[0m

[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (s:Space {number: '307'})-[:ACCESSIBLE]->(d:Door) RETURN s, d[0m
Full Context:
[32;1m[1;3m[{'s': {'ifc_guid': '1W6yAWYtbDbPaMjexfopdJ', 'bound_phy': ['2682946', '2682934', '2682935', '2682933'], 'number': '307', 'bound_vrt': [], 'level_id': '2680262', 'width': 4.8000000000000265, 'length': 14.1500000000

In [13]:
# Same question as in the previous agent cell, now for the last issue in the list
result = agent.run(f"""I've got the following issue: {issues[-1]}. How could I fix it? Provide a high-level description of what elements of the building are involved and what type of action is needed. 
        Note: Ignore the naming convention of the issue, just focus on the graph schema when writing the Cypher query.""")
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo address the issue, I need to identify the relevant elements in the building graph that correspond to the provided details. The issue involves a corridor component with a specific width requirement that is not being met. I will look for the corridor component in the graph and check its properties, particularly focusing on the width. 

Action: graph-cypher-qa  
Action Input: "MATCH (c:CoreComponent {type: 'Corridor', GUID: '3O7NDSvVP7jhJX5Gt$LDET'}) RETURN c"  [0m

[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (c) WHERE size(labels(c)) > 0 WITH labels(c)[0] AS label, collect(c) AS nodes RETURN label, nodes[0] AS sampleNode[0m
Full Context:
[32;1m[1;3m[{'label': 'Column', 'sampleNode': {'ifc_guid': '2wwkhajRz9dOXaXiRcjNmt', 'level_id': '79627', 'location': [96.06087148282784, 102.24828131414463, 24.27821522309711], 'id': '2683100'}}, {'label': 'Door', 'sampleNode': {'ifc_guid': '2

In [None]:
# TODO: Retrieve the JSON objects that need to be modified, and support multiple change proposals per issue.

In [12]:
# Inspect the first 20 issues
issues[:20]

[{'regulation_clause': '1007_1_1',
  'core_component_type': 'Conference Room',
  'core_component_type_number': '306',
  'core_component_GUID': '1W6yAWYtbDbPaMjexfopdL',
  'checking_variable_name': 'Required door separation',
  'checking_variable_unit': 'feet',
  'required_variable_value': '20.82',
  'actual_variable_value': '17.37',
  'bcf_id': 'df6a3d56-8685-4675-b5a2-7650e85088db'},
 {'regulation_clause': '1007_1_1',
  'core_component_type': 'Conference Room',
  'core_component_type_number': '307',
  'core_component_GUID': '1W6yAWYtbDbPaMjexfopdJ',
  'checking_variable_name': 'Required door separation',
  'checking_variable_unit': 'feet',
  'required_variable_value': '24.51',
  'actual_variable_value': '17.40',
  'bcf_id': '21a38776-fb87-4b71-b5c2-e92e23f2182a'},
 {'regulation_clause': '1017_2',
  'core_component_type': 'Conference Room',
  'core_component_type_number': '307',
  'core_component_GUID': '1W6yAWYtbDbPaMjexfopdJ',
  'checking_variable_name': 'Travel distance to safe plac

In [None]:
from langchain.agents import Tool, initialize_agent, AgentType

# Tool returns full chain response so the agent can see intermediate steps (ids in context)
structured_tool = Tool(
    name="graph-cypher-qa",
    func=lambda q: cypherChain.invoke({
        "query": q,
        "all_nodes_query": all_nodes_query,
        "all_nodes": str(all_nodes),
    }),
    description="Answer questions about the building graph using Cypher.")

# Rebinds `agent` to a new zero-shot ReAct agent that uses the tool defined above
agent = initialize_agent(
    tools=[structured_tool],
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    max_iterations=4,
    early_stopping_method="generate",
)

# The prompt asks for a JSON-only final answer so it can be parsed below
prompt = f"""I've got the following issue: {issues[1]}.
How could I fix it? Provide a high-level description of what elements of the building are involved and what type of action is needed.

Instructions:
- Use at most 2 tool calls.
- Ensure your Cypher returns relevant node ids when possible (e.g., space.id, door.level_id).
- FINAL ANSWER MUST BE JSON ONLY with keys exactly:
  ids (array of strings), action (string), reasoning (string).
- No text outside the JSON.
"""

result = agent.run(prompt)

# Pretty-print JSON if possible
try:
    obj = json.loads(result)
except (json.JSONDecodeError, TypeError):
    # The answer is not bare JSON: show it unchanged. Only parse failures are
    # handled here, so unrelated errors are not silently swallowed.
    print(result)
else:
    print(json.dumps(obj, indent=2))
