In [2]:
import networkx as nx
import nx_arangodb as nxadb

from arango import ArangoClient

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from random import randint
import re

from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_openai import ChatOpenAI
from langchain_community.graphs import ArangoGraph
from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain
from langchain_core.tools import tool

import gradio
from graph_utils import create_d3_visualization
from dotenv import load_dotenv
import os

[22:19:18 -0600] [INFO]: NetworkX-cuGraph is unavailable: No module named 'nx_cugraph'.


In [3]:
# Populate the process environment from a .env file (python-dotenv) before the
# os.getenv lookups in the next cell.
load_dotenv()

True

In [4]:
# Pull the API key and the ArangoDB connection settings out of the environment.
# A variable that is not set yields None rather than raising.
openai_key = os.environ.get('OPENAI_API_KEY')
arango_host = os.environ.get('ARANGO_HOST')
arango_user = os.environ.get('ARANGO_USER')
arango_password = os.environ.get('ARANGO_PASSWORD')
arango_db = os.environ.get('ARANGO_DB')


In [5]:
# Connect to the ArangoDB host and open the configured database with the
# credentials read from the environment.
client = ArangoClient(hosts=arango_host)
db = client.db(
    arango_db,
    username=arango_user,
    password=arango_password,
)

# Sanity check: dump the metadata of every collection in the database.
print(db.collections())

[{'id': '195136', 'name': '_analyzers', 'system': True, 'type': 'document', 'status': 'loaded'}, {'id': '195138', 'name': '_queues', 'system': True, 'type': 'document', 'status': 'loaded'}, {'id': '195137', 'name': '_aqlfunctions', 'system': True, 'type': 'document', 'status': 'loaded'}, {'id': '211044', 'name': 'Users', 'system': False, 'type': 'document', 'status': 'loaded'}, {'id': '195139', 'name': '_jobs', 'system': True, 'type': 'document', 'status': 'loaded'}, {'id': '211045', 'name': 'Games', 'system': False, 'type': 'document', 'status': 'loaded'}, {'id': '195141', 'name': '_appbundles', 'system': True, 'type': 'document', 'status': 'loaded'}, {'id': '195135', 'name': '_graphs', 'system': True, 'type': 'document', 'status': 'loaded'}, {'id': '195140', 'name': '_apps', 'system': True, 'type': 'document', 'status': 'loaded'}, {'id': '211046', 'name': 'plays', 'system': False, 'type': 'edge', 'status': 'loaded'}, {'id': '195142', 'name': '_frontend', 'system': True, 'type': 'docu

In [6]:

# Bind a NetworkX-style graph object to the graph named "SteamGraph" in `db`.
G_adb = nxadb.Graph(name="SteamGraph", db=db)

# Printing the graph shows its name together with its node and edge counts.
print(G_adb)

[22:19:23 -0600] [INFO]: Graph 'SteamGraph' exists.
[22:19:23 -0600] [INFO]: Default node type set to 'Games'


Graph named 'SteamGraph' with 14950 nodes and 70477 edges


In [7]:
# Read the stored definition document for "SteamGraph" from the `_graphs`
# system collection (edge definitions, orphan collections, ...).
graph_def = db.collection("_graphs").get("SteamGraph")
print(graph_def)


{'_key': 'SteamGraph', '_id': '_graphs/SteamGraph', '_rev': '_jPbGLl2---', 'edgeDefinitions': [{'collection': 'plays', 'from': ['Users'], 'to': ['Games']}], 'orphanCollections': [], 'networkx': {'name': 'SteamGraph'}}


In [8]:
# Peek at the first ten nodes, printing each node key next to the `_id` stored
# in its data. Note that list(...) materialises every node before the slice.
for node, data in list(G_adb.nodes(data=True))[:10]:
    print(node, data.get("_id"))


Users/97622055 Users/97622055
Users/180063157 Users/180063157
Users/183421285 Users/183421285
Games/11982_star_conflict Games/11982_star_conflict
Users/273163848 Users/273163848
Users/227863398 Users/227863398
Users/214299751 Users/214299751
Games/14353_rwby_grimm_eclipse Games/14353_rwby_grimm_eclipse
Users/113835971 Users/113835971
Users/72978546 Users/72978546


In [9]:
# Print the graph summary (name, node count, edge count) once more.
print(G_adb)

Graph named 'SteamGraph' with 14950 nodes and 70477 edges


### AQL Test

In [10]:
# First, let's check what collections exist (system collections are included).
print("Available collections:")
for collection in db.collections():
    print(collection['name'])

# Get the graph to see its collections: each edge definition names an edge
# collection and the vertex collections it connects.
graph = db.graph('SteamGraph')
print("\nGraph collections:")
print(graph.edge_definitions())

Available collections:
_analyzers
_queues
_aqlfunctions
Users
_jobs
Games
_appbundles
_graphs
_apps
plays
_frontend

Graph collections:
[{'edge_collection': 'plays', 'from_vertex_collections': ['Users'], 'to_vertex_collections': ['Games']}]


In [11]:


# -----------------------------------------------------------
# 1. Query 3 random nodes from the Users collection.
#    SORT RAND() + LIMIT 3 draws a different sample on every run.
# -----------------------------------------------------------
result_cursor = G_adb.query("""
    FOR node IN Users
        SORT RAND()
        LIMIT 3
        RETURN node
""")
print("Sample Users Nodes:")
for node in result_cursor:
    print(node)
print('-'*10)

# -----------------------------------------------------------
# 2. Query 3 random edges from the plays edge collection.
#    `result_cursor` is rebound here, replacing the Users cursor.
# -----------------------------------------------------------
result_cursor = G_adb.query("""
    FOR edge IN plays
        SORT RAND()
        LIMIT 3
        RETURN edge
""")
print("Sample Plays Edges:")
for edge in result_cursor:
    print(edge)
print('-'*10)


Sample Users Nodes:
{'_key': '99157622', '_id': 'Users/99157622', '_rev': '_jPaYbqS--p', 'type': 'Users', 'steamid': 99157622}
{'_key': '213256245', '_id': 'Users/213256245', '_rev': '_jPaYbwS--i', 'type': 'Users', 'steamid': 213256245}
{'_key': '93644606', '_id': 'Users/93644606', '_rev': '_jPaYbiC--e', 'type': 'Users', 'steamid': 93644606}
----------
Sample Plays Edges:
{'_key': '30281', '_id': 'plays/30281', '_from': 'Users/279171078', '_to': 'Games/11841_no_more_room_in_hell', '_rev': '_jPaYgv6--Q', 'weight': 0.7}
{'_key': '48115', '_id': 'plays/48115', '_from': 'Users/129117376', '_to': 'Games/12144_besiege', '_rev': '_jPaYhna--h', 'weight': 2.1}
{'_key': '24487', '_id': 'plays/24487', '_from': 'Users/104120883', '_to': 'Games/11985_firefall', '_rev': '_jPaYgee--Q', 'weight': 1.3}
----------


### Agent

In [12]:
# LangChain wrapper around the database handle; its `.schema` is printed later
# and is interpolated into the prompts built by the NetworkX tool.
arango_graph = ArangoGraph(db)

In [13]:
# Module-level chat model used only for the smoke test in this cell; the tools
# and `query_graph` each construct their own ChatOpenAI instance.
llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

# Trivial round trip to confirm the model can be reached.
llm.invoke("hello!")

AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 9, 'total_tokens': 19, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_523b9b6e5f', 'finish_reason': 'stop', 'logprobs': None}, id='run-f136012b-52dc-4130-8163-84af1f3a4957-0', usage_metadata={'input_tokens': 9, 'output_tokens': 10, 'total_tokens': 19, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [14]:

# @tool
# def text_to_aql_to_text(query: str):
#     """This tool is available to invoke the
#     ArangoGraphQAChain object, which enables you to
#     translate a Natural Language Query into AQL, execute
#     the query, and translate the result back into Natural Language.
#     """

#     llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

#     chain = ArangoGraphQAChain.from_llm(
#     	llm=llm,
#     	graph=arango_graph,
#     	verbose=True,
#         allow_dangerous_requests=True
#     )
    
#     result = chain.invoke(query)

#     return str(result["result"])

In [15]:
@tool
def text_to_aql_to_text(query: str):
    """This tool is available to invoke the
    ArangoGraphQAChain object, which enables you to
    translate a Natural Language Query into AQL, execute
    the query, and translate the result back into Natural Language.
    """
    # The docstring above doubles as the tool description handed to the agent,
    # so treat its wording as part of the program's behaviour.
    #
    # Args:
    #     query: natural-language question to answer from the graph.
    # Returns:
    #     The chain's "result" entry, converted to str.

    llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

    # Guidance for AQL generation, phrased for this notebook's graph
    # (Users --plays--> Games).
    system_message = """You are an expert in ArangoDB and AQL query generation.
    When counting connections between users and games:
    1. Use OUTBOUND or INBOUND instead of ANY to ensure consistent direction
    2. Use graph traversal (FOR v, e, p IN 1..1 OUTBOUND) instead of edge collection filtering
    3. Always verify node types in the traversal
    4. Use consistent naming conventions
    

    """

    # NOTE(review): `system_message` is forwarded as an extra keyword argument.
    # Confirm that the installed ArangoGraphQAChain actually consumes it; if it
    # is silently dropped, the guidance has to be injected through the chain's
    # AQL generation prompt instead.
    chain = ArangoGraphQAChain.from_llm(
        llm=llm,
        graph=arango_graph,
        verbose=True,  # echoes the generated AQL and its raw result to stdout
        # Generated AQL runs against the live database.
        # NOTE(review): prefer read-only database credentials for this tool.
        allow_dangerous_requests=True,
        system_message=system_message
    )

    result = chain.invoke(query)

    return str(result["result"])

In [16]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh run declares every dependency up front.
import json
# Print schema in a readable format (2-space indented JSON).
print("\n=== Graph Schema ===")
print(json.dumps(arango_graph.schema, indent=2))



=== Graph Schema ===
{
  "Graph Schema": [
    {
      "graph_name": "SteamGraph",
      "edge_definitions": [
        {
          "edge_collection": "plays",
          "from_vertex_collections": [
            "Users"
          ],
          "to_vertex_collections": [
            "Games"
          ]
        }
      ]
    }
  ],
  "Collection Schema": [
    {
      "collection_name": "Users",
      "collection_type": "document",
      "document_properties": [
        {
          "name": "_key",
          "type": "str"
        },
        {
          "name": "_id",
          "type": "str"
        },
        {
          "name": "_rev",
          "type": "str"
        },
        {
          "name": "type",
          "type": "str"
        },
        {
          "name": "steamid",
          "type": "int"
        }
      ],
      "example_document": {
        "_key": "151603712",
        "_id": "Users/151603712",
        "_rev": "_jPaYbhm---",
        "type": "Users",
        "steamid": 151603

In [17]:
# 5. Define the Text to NetworkX/cuGraph Tool
# Note: It is encouraged to experiment and improve this section! This is just a placeholder:

@tool
def text_to_nx_algorithm_to_text(query: str):
    """This tool is available to invoke a NetworkX Algorithm on
    the ArangoDB Graph. You are responsible for accepting the
    Natural Language Query, establishing which algorithm needs to
    be executed, executing the algorithm, and translating the results back
    to Natural Language, with respect to the original query.

    If the query (e.g traversals, shortest path, etc.) can be solved using the Arango Query Language, then do not use
    this tool.
    """
    # The docstring above doubles as the tool description handed to the agent,
    # so treat its wording as part of the program's behaviour.
    #
    # Pipeline: (1) ask the LLM for NetworkX code, (2) exec that code against
    # `G_adb`, (3) ask the LLM to phrase `FINAL_RESULT` as an answer.
    #
    # Args:
    #     query: natural-language graph-analysis question.
    # Returns:
    #     The LLM's natural-language answer, or a string starting with
    #     "EXEC ERROR:" when the generated code fails or never sets
    #     `FINAL_RESULT`.

    llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

    ######################
    print("1) Generating NetworkX code")

    text_to_nx = llm.invoke(f"""
    I have a NetworkX Graph called `G_adb`. It has the following schema: {arango_graph.schema}

    I have the following graph analysis query: {query}.

    Generate the Python Code required to answer the query using the `G_adb` object.

    Be very precise on the NetworkX algorithm you select to answer this query. Think step by step.

    Only assume that networkx is installed, and other base python dependencies.

    Always set the last variable as `FINAL_RESULT`, which represents the answer to the original query.

    Only provide python code that I can directly execute via `exec()`. Do not provide any instructions.

    Make sure that `FINAL_RESULT` stores a short & consice answer. Avoid setting this variable to a long sequence.

    Your code:
    """).content

    # Strip the Markdown code fence the model usually wraps its answer in.
    text_to_nx_cleaned = re.sub(r"^```python\n|```$", "", text_to_nx, flags=re.MULTILINE).strip()

    print('-'*10)
    print(text_to_nx_cleaned)
    print('-'*10)

    ######################

    print("\n2) Executing NetworkX code")
    # SECURITY: this runs LLM-generated code with full interpreter privileges.
    # Only expose this tool to trusted users, or run it inside a sandbox.
    #
    # A single namespace is used on purpose: with separate globals and locals,
    # exec() behaves like a class body, so functions, lambdas and comprehensions
    # in the generated code cannot see variables that code defined itself.
    namespace = {"G_adb": G_adb, "nx": nx}

    try:
        exec(text_to_nx_cleaned, namespace)
    except Exception as e:
        # TODO: Consider experimenting with a code corrector (feed the error
        # back to the LLM and retry a bounded number of times).
        print(f"EXEC ERROR: {e}")
        return f"EXEC ERROR: {e}"

    # The prompt asks for FINAL_RESULT, but the model may not comply.
    if "FINAL_RESULT" not in namespace:
        missing_result = "EXEC ERROR: generated code did not define FINAL_RESULT"
        print(missing_result)
        return missing_result

    print('-'*10)
    FINAL_RESULT = namespace["FINAL_RESULT"]
    print(f"FINAL_RESULT: {FINAL_RESULT}")
    print('-'*10)

    ######################

    print("3) Formulating final answer")

    # Show the model exactly the code that was executed (fences removed).
    nx_to_text = llm.invoke(f"""
        I have a NetworkX Graph called `G_adb`. It has the following schema: {arango_graph.schema}

        I have the following graph analysis query: {query}.

        I have executed the following python code to help me answer my query:

        ---
        {text_to_nx_cleaned}
        ---

        The `FINAL_RESULT` variable is set to the following: {FINAL_RESULT}.

        Based on my original Query and FINAL_RESULT, generate a short and concise response to
        answer my query.
        
        Your response:
    """).content

    return nx_to_text


In [18]:
# Tools handed to the ReAct agent in `query_graph`: AQL-backed question
# answering and NetworkX-backed graph algorithms.
tools = [text_to_aql_to_text,text_to_nx_algorithm_to_text]

In [19]:
def query_graph(query):
    """Send one natural-language question through the tool-using agent.

    A new gpt-4o chat model and a ReAct agent over the module-level `tools`
    are built on every call. `query` is submitted as a single user message and
    the content of the last message in the agent's final state is returned.
    """
    agent = create_react_agent(ChatOpenAI(temperature=0, model_name="gpt-4o"), tools)
    user_turn = {"role": "user", "content": query}
    outcome = agent.invoke({"messages": [user_turn]})
    closing_message = outcome["messages"][-1]
    return closing_message.content

In [20]:
# End-to-end check of the agent; the recorded run answered with the most-played
# games despite the typos in the question text.
query_graph("which fames ar ethe most popular? from the graph")



[1m> Entering new ArangoGraphQAChain chain...[0m
AQL Query (1):[32;1m[1;3m
WITH Games, plays
FOR game IN Games
    LET playCount = LENGTH(
        FOR play IN plays
            FILTER play._to == game._id
            RETURN play
    )
    SORT playCount DESC
    RETURN { GameName: game.GameName, PlayCount: playCount }
[0m
AQL Result:
[32;1m[1;3m[{'GameName': 'Dota 2', 'PlayCount': 4841}, {'GameName': 'Team Fortress 2', 'PlayCount': 2323}, {'GameName': 'Counter-Strike Global Offensive', 'PlayCount': 1377}, {'GameName': 'Unturned', 'PlayCount': 1069}, {'GameName': 'Left 4 Dead 2', 'PlayCount': 801}, {'GameName': 'Counter-Strike Source', 'PlayCount': 715}, {'GameName': 'The Elder Scrolls V Skyrim', 'PlayCount': 677}, {'GameName': "Garry's Mod", 'PlayCount': 666}, {'GameName': 'Counter-Strike', 'PlayCount': 568}, {'GameName': "Sid Meier's Civilization V", 'PlayCount': 554}][0m

[1m> Finished chain.[0m


"The most popular games in the graph database, based on the number of times they have been played, are:\n\n1. **Dota 2** - 4,841 plays\n2. **Team Fortress 2** - 2,323 plays\n3. **Counter-Strike Global Offensive** - 1,377 plays\n4. **Unturned** - 1,069 plays\n5. **Left 4 Dead 2** - 801 plays\n6. **Counter-Strike Source** - 715 plays\n7. **The Elder Scrolls V Skyrim** - 677 plays\n8. **Garry's Mod** - 666 plays\n9. **Counter-Strike** - 568 plays\n10. **Sid Meier's Civilization V** - 554 plays"

In [21]:
def dummy_query_graph(query):
    """Stand-in for the real agent call: ignores `query` and returns a fixed message."""
    canned_reply = "Query processed successfully!"
    return canned_reply

def create_interface():
    """Build the Gradio Blocks UI for querying the graph.

    The layout has a query textbox with Submit/Clear buttons, an HTML result
    pane, and a collapsible query history. Submitting (button or Enter) calls
    `dummy_query_graph` and renders its text next to a D3 visualization of a
    hard-coded sample graph.

    Returns:
        The assembled `gradio.Blocks` app; the caller is expected to launch it.
    """
    import html  # stdlib; used to escape text before it is embedded in markup

    # NOTE(review): the title, description, placeholder and sample graph still
    # describe a Marvel dataset while this notebook loads SteamGraph, and the
    # handler calls `dummy_query_graph` rather than `query_graph` -- confirm
    # whether that placeholder state is intended.
    with gradio.Blocks(title="Marvel Heroes Graph Explorer") as demo:
        # Header
        gradio.Markdown(
            """
            # Marvel Heroes Graph Explorer
            
            This interface allows you to explore the Marvel Heroes and Comics graph database using natural language queries.
            """
        )

        # Main interface
        with gradio.Row():
            with gradio.Column(scale=4):
                query_input = gradio.Textbox(
                    label="Enter your query",
                    placeholder="Example: Show me connections between heroes",
                    lines=3
                )
                with gradio.Row():
                    submit_btn = gradio.Button("Submit", variant="primary")
                    clear_btn = gradio.Button("Clear")

            with gradio.Column(scale=6):
                output = gradio.HTML(
                    label="Result"
                )

        # Query history
        with gradio.Accordion("Query History", open=False):
            history = gradio.HTML()

        def process_query(query, history_html):
            """Return (result pane HTML, updated history HTML) for one query."""
            # Get text result
            result = dummy_query_graph(query)

            # The query is user-typed and the result is free text; escape both
            # so neither can inject markup or script into the rendered page.
            safe_query = html.escape(query or "")
            safe_result = html.escape(str(result))

            # Dummy data for visualization
            sample_data = {
                "nodes": [
                    {"id": 1, "name": "Spider-Man"},
                    {"id": 2, "name": "Iron Man"},
                    {"id": 3, "name": "Captain America"},
                    {"id": 4, "name": "Thor"},
                    {"id": 5, "name": "Black Widow"}
                ],
                "links": [
                    {"source": 1, "target": 2},
                    {"source": 2, "target": 3},
                    {"source": 1, "target": 3},
                    {"source": 4, "target": 2},
                    {"source": 3, "target": 4},
                    {"source": 5, "target": 2},
                    {"source": 5, "target": 3}
                ]
            }

            # Generate visualization HTML
            viz_html = create_d3_visualization(sample_data["nodes"], sample_data["links"])

            # Combine text result with visualization
            combined_result = f"""
            <div class="result-container">
                <div style="margin-bottom: 20px; padding: 10px; background-color: #f5f5f5; border-radius: 5px;">
                    <p><strong>Query Result:</strong> {safe_result}</p>
                </div>
                {viz_html}
            </div>
            """

            # Newest entry first. The history component may hand back None
            # before anything has been rendered, so fall back to "".
            new_history = (
                f"<p><strong>Q:</strong> {safe_query}<br><strong>A:</strong> {safe_result}</p>"
                + (history_html or "")
            )
            return combined_result, new_history

        def clear_inputs():
            """Blank the query box and the result pane (history is kept)."""
            return "", ""

        submit_btn.click(
            process_query,
            inputs=[query_input, history],
            outputs=[output, history]
        )

        clear_btn.click(
            clear_inputs,
            inputs=[],
            outputs=[query_input, output]
        )

        # Pressing Enter in the textbox behaves like clicking Submit.
        query_input.submit(
            process_query,
            inputs=[query_input, history],
            outputs=[output, history]
        )

    return demo

# Launch the interface
# share=True also publishes a temporary public gradio.live URL in addition to
# the local one, so anyone with the link can reach the app while it runs.
create_interface().launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://113e6f9ec11ad1ca8d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


