In [1]:
import os
import yaml
import pprint
import re

from IPython.display import display, HTML
from dotenv import load_dotenv
from pathlib import Path

In [2]:
from biochatter.llm_connect import GptConversation
from biochatter.kg_langgraph_agent import KGQueryReflexionAgent
from biochatter.prompts import BioCypherPromptEngine
from biochatter.query_interaction import BioCypherQueryHandler

In [3]:
def color_str(s, color='black'):
    """Wrap ``s`` in an HTML ``<p>`` element whose text color is ``color``."""
    return f'<p style="color:{color}; ">{s}</p>'

def code_str(s):
    """Wrap ``s`` in an inline HTML ``<code>`` element with a grey background."""
    return f'<code style="background-color: #AAAAAA; padding: 5px;">{s}</code>'

def print_cypher_and_answer(text) -> HTML:
    """Render an agent result as a bordered HTML box.

    Args:
        text: Result object returned by ``agent.execute``. Its ``answer``
            attribute is shown as the Cypher code and its ``tool_result``
            attribute as the KG result. (The parameter keeps its original
            name so existing calls continue to work.)

    Returns:
        An ``IPython.display.HTML`` object; Jupyter renders it when it is the
        last expression of a cell.
    """
    # Imported locally so the function does not rely on another cell's imports.
    from html import escape

    # Read from the argument rather than the notebook-global ``cypher_query``,
    # so the function always displays exactly the result it was given.
    # Both values are escaped because Cypher and KG results can contain
    # characters that are significant in HTML ('<', '>', '&').
    q = code_str(escape(str(text.answer)))
    r = color_str(escape(str(text.tool_result)), "green")
    return HTML(f'<div style="padding: 5px; border: 1px solid black;"><h4>Cypher code:</h4>{q}<br><h4>KG result:</h4>{r}</div>')


In [4]:
# Load environment variables from ../app.env into the process environment.
# Presumably this file provides OPENAI_API_KEY, which create_conversation()
# reads later via os.getenv -- TODO confirm.
dotenv_path = Path('../app.env')
load_dotenv(dotenv_path=dotenv_path)

True

In [12]:
# Parse the graph schema configuration file with the safe YAML loader.
# The result is handed to BioCypherPromptEngine as schema_config_or_info_dict.
schema_dict = yaml.safe_load(Path("../config/schema_config.yaml").read_text())

In [19]:
def create_conversation():
    """Build a new ``GptConversation`` for the "gpt-4o" model.

    The API key is taken from the ``OPENAI_API_KEY`` environment variable and
    registered on the conversation together with the label "SKM-BioChatter".

    Returns:
        The configured ``GptConversation`` instance.
    """
    chat = GptConversation(model_name="gpt-4o", prompts={})
    api_key = os.getenv("OPENAI_API_KEY")
    chat.set_api_key(api_key, "SKM-BioChatter")
    return chat

# Connection settings for the knowledge graph database on the local machine.
# NOTE(review): the variable name suggests Neo4j on its Bolt port -- confirm.
neo4j_connection_args = {
    "host": "localhost",
    "port": "7687",
}

# Reflexion agent used by the question cells below. It receives the factory
# itself (not a conversation instance) as conversation_factory.
agent = KGQueryReflexionAgent(
    connection_args=neo4j_connection_args,
    conversation_factory=create_conversation,
)

In [20]:
# Prompt engine that turns natural-language questions into KG query prompts.
# NOTE(review): model_name is "gpt-3.5-turbo" here while create_conversation()
# builds "gpt-4o" conversations -- confirm which setting takes effect.
prompt_engine = BioCypherPromptEngine(
    model_name="gpt-3.5-turbo",
    schema_config_or_info_dict=schema_dict,  # the schema definition of our graph
    conversation_factory=create_conversation,
)

### Which functional cluster does the gene NPR1 belong to?

In [8]:
# Build the KG query prompt for this question and render it in the cell output.
question = 'Which functional cluster does the gene NPR1 belong to?'
kg_prompt = prompt_engine.generate_query_prompt(question)
HTML(kg_prompt)

In [9]:
# Run the agent on the question with the generated prompt, then display the
# resulting Cypher query and KG result as an HTML box.
cypher_query = agent.execute(question, kg_prompt)
print_cypher_and_answer(cypher_query)

content='' additional_kwargs={'tool_calls': [{'id': 'call_xj9wEWVvwvjTCoFbudxQ0qU2', 'function': {'a ...
content='{"query": "MATCH (g:Gene {name: \'NPR1\'})-[:GeneBelongsToFunctionalCluster]->(fc:Functiona ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_hr72SNF3xXAsFl2OgAUqYBDw', 'function': {'a ...


### What is the TAIR identifier of MYC2?

In [10]:
# Build and display the KG query prompt for a property lookup (TAIR identifier).
question = 'What is the TAIR identifier of the gene MYC2?'
kg_prompt = prompt_engine.generate_query_prompt(question)
HTML(kg_prompt)

In [11]:
# Run the agent and display its final query and result.
# NOTE(review): the recorded log shows repeated GeneBelongsToFunctionalCluster
# queries for this TAIR-identifier question and is cut off mid-line -- verify
# that the agent actually returns the TAIR identifier.
cypher_query = agent.execute(question, kg_prompt)
print_cypher_and_answer(cypher_query)

content='' additional_kwargs={'tool_calls': [{'id': 'call_47rjLlen6SwF2touYee9YPba', 'function': {'a ...
content='{"query": "MATCH (g:Gene {name: \'MYC2\'})-[:GeneBelongsToFunctionalCluster]->(fc:Functiona ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_qdgCpQfbK3yn6ZoljalY6jb5', 'function': {'a ...
content='{"query": "MATCH (g:Gene {name: \'MYC2\'})-[:GeneBelongsToFunctionalCluster]->(fc) RETURN f ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_dcH0whELPQXGTllwr8drCQQ5', 'function': {'a ...
content='{"query": "MATCH (g:Gene {name: \'MYC2\'})-[:GeneBelongsToFunctionalCluster]->(fc) RETURN f ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_dBytRlbGvOz9CQJYQPPNXAfx', 'function': {'a ...
content='{"query": "MATCH (g:Gene {name: \'MYC2\'})-[:GeneBelongsToFunctionalCluster]->(fc) RETURN f ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_w6QjdTCgmkdhYnVKWrOH2zjQ', 'function': {'a ...
content='{"query": "MATCH (g:Gene {name: \'MYC2\'})-[:G

### Which functional cluster does the gene with TAIR identifier AT1G64280 belong to?

In [10]:
# Same cluster lookup as for NPR1, but addressing the gene by its TAIR identifier.
question = 'Which functional cluster does the gene with tair identifier AT1G64280 belong to?'
kg_prompt = prompt_engine.generate_query_prompt(question)
HTML(kg_prompt)

In [11]:
# Execute the agent for the TAIR-based question and show query plus KG result.
cypher_query = agent.execute(question, kg_prompt)
print_cypher_and_answer(cypher_query)

content='' additional_kwargs={'tool_calls': [{'id': 'call_n5xrzrnkUK54CiJvewXXSvSN', 'function': {'a ...
content='{"query": "MATCH (g:Gene {tair: \'AT1G64280\'})-[:GeneToFunctionalCluster]->(fc:FunctionalC ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_soJvmTKXxycoZrPEeEmmJWoD', 'function': {'a ...


### In which pathways is the phytohormone ET involved?

In [35]:
# Build and display the KG query prompt for the pathway question about ET.
question = 'In which pathways is ET involved?'
kg_prompt = prompt_engine.generate_query_prompt(question)
HTML(kg_prompt)

In [36]:
# Run the agent and display its final query and result.
# NOTE(review): the recorded log shows several successively relaxed queries
# and is truncated -- check whether a non-empty result was eventually found.
cypher_query = agent.execute(question, kg_prompt)
print_cypher_and_answer(cypher_query)

content='' additional_kwargs={'tool_calls': [{'id': 'call_x6QTCSymebhsZCjeRIcYWIQr', 'function': {'a ...
content='{"query": "MATCH (m:Metabolite {name: \'ET\'})-[:InPathway]->(p:Pathway) RETURN p.name LIMI ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_vFrPnBADSV6kXxCgx3tpmlyp', 'function': {'a ...
content='{"query": "MATCH (m:Metabolite {name: \'ET\'})-->(p:Pathway) RETURN p.name LIMIT 30", "resu ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_1ltBS5miYPN032W7qEb80IBa', 'function': {'a ...
content='{"query": "MATCH (m:Metabolite {name: \'ET\'})--(p:Pathway) RETURN p.name LIMIT 30", "resul ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_NejsmEelvKTQfC7tyI78aAPD', 'function': {'a ...
content='{"query": "MATCH (m:Metabolite)-[r]-(p:Pathway) WHERE m.name CONTAINS \'ET\' RETURN p.name  ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_sBluAEtRaIyWEBoIvZ6yywip', 'function': {'a ...
content='{"query": "MATCH (m:Metabolite)-[r]-(p:Pathway

### Which functional clusters does MYC2 interact with?


In [42]:
# Shorter original phrasing, kept for reference; the more explicit wording
# below is the one actually sent to the prompt engine.
# question = 'Which functional clusters does MYC2 interact with?'
question = 'Which other functional clusters does the MYC2 functional cluster have a molecular interaction with?'
kg_prompt = prompt_engine.generate_query_prompt(question)
HTML(kg_prompt)

In [43]:
# Execute the agent for the cluster-interaction question and show the outcome.
cypher_query = agent.execute(question, kg_prompt)
print_cypher_and_answer(cypher_query)

content='' additional_kwargs={'tool_calls': [{'id': 'call_0BXDnCedIfvGkB2PeQHeHhy4', 'function': {'a ...
content='{"query": "MATCH (fc1:FunctionalCluster {name: \'MYC2\'})-[:TranscriptionalInhibition|:Tran ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_gIlLPgR2DdDTTWHuUGPW37Sg', 'function': {'a ...


### Which genes does MYC2 transcriptionally regulate?

In [68]:
# Build and display the KG query prompt for the targets regulated by MYC2.
question = 'Which genes does MYC2 transcriptionally regulate?'
kg_prompt = prompt_engine.generate_query_prompt(question)
HTML(kg_prompt)

In [69]:
# Run the agent and display its final query and result.
# NOTE(review): the first logged query matches genes pointing *at* MYC2
# (g)-[:TranscriptionalActivation]->(MYC2), i.e. regulators rather than
# targets -- verify the direction of the final query.
cypher_query = agent.execute(question, kg_prompt)
print_cypher_and_answer(cypher_query)

content='' additional_kwargs={'tool_calls': [{'id': 'call_gAbAXTc7v3N1sEtBiDQj7fof', 'function': {'a ...
content='{"query": "MATCH (g:Gene)-[:TranscriptionalActivation]->(:Gene {name: \'MYC2\'}) RETURN g.n ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_qT9HHGmxvJ4GjSN0AbKHQpKF', 'function': {'a ...


### What proteins regulate MYC2 transcription?

In [66]:
# Build and display the KG query prompt for the regulators of MYC2 transcription.
question = 'What proteins regulate MYC2 transcription?'
kg_prompt = prompt_engine.generate_query_prompt(question)
HTML(kg_prompt)

In [67]:
# Run the agent and display its final query and result.
# NOTE(review): the recorded log repeats a RegulatesTranscription query
# several times before being truncated -- check that this relationship type
# exists in the schema.
cypher_query = agent.execute(question, kg_prompt)
print_cypher_and_answer(cypher_query)

content='' additional_kwargs={'tool_calls': [{'id': 'call_QVSWJ0g5wCiAOQ4tbDLQhrgX', 'function': {'a ...
content='{"query": "MATCH (regulator)-[:TranscriptionalActivation]->(target:Gene {name: \'MYC2\'}) R ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_U5Ioj1hhzPu6vS9HdswGptuR', 'function': {'a ...
content='{"query": "MATCH (regulator)-[:RegulatesTranscription]->(target:Gene {name: \'MYC2\'}) RETU ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_abCXIbWPiyPQTg2WIpEyuLbS', 'function': {'a ...
content='{"query": "MATCH (regulator)-[:RegulatesTranscription]->(target:Gene {name: \'MYC2\'}) RETU ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_TshbWFbJTmSAaUmRcjKtL329', 'function': {'a ...
content='{"query": "MATCH (regulator)-[:RegulatesTranscription]->(target:Gene {name: \'MYC2\'}) RETU ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_oSJzCJ6htGIixoLS6G6z3RnZ', 'function': {'a ...
content='{"query": "MATCH (regulator)-[:RegulatesTransc

### What proteins negatively regulate MYC2 transcription?

In [25]:
# Build and display the KG query prompt for negative regulators of MYC2.
question = 'What proteins regulate negatively MYC2 transcription?'
kg_prompt = prompt_engine.generate_query_prompt(question)
HTML(kg_prompt)

In [26]:
# Execute the agent for the negative-regulation question and show the outcome.
cypher_query = agent.execute(question, kg_prompt)
print_cypher_and_answer(cypher_query)

content='' additional_kwargs={'tool_calls': [{'id': 'call_O46TMD1C9qPAPQvsy7KpOCnr', 'function': {'a ...
content='{"query": "MATCH (g:Gene)-[:TranscriptionalInhibition]->(t:Gene) WHERE t.name = \'MYC2\' RE ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_v0fdqWkd1oQUNfX7D50LlaMx', 'function': {'a ...


# Interact with response

In [26]:
# Hand-picked subset of the KG schema used for the manual query generation
# below: relationship types (with their source and target node types), the
# node labels, and the node properties the LLM may use.
relationships = {
    # A gene belongs to a functional cluster, so the edge runs gene -> cluster.
    # This matches the direction of every recorded query in this notebook:
    # (g:Gene)-[:GeneBelongsToFunctionalCluster]->(fc:FunctionalCluster).
    'GeneBelongsToFunctionalCluster': {'source': 'gene', 'target': 'functional cluster'},
    'TranscriptionalInhibition':  {'source': 'functional cluster', 'target': 'functional cluster'},
    'TranscriptionalActivation':  {'source': 'functional cluster', 'target': 'functional cluster'},
}
entities = ["Gene", "FunctionalCluster"]

# NOTE(review): the agent-generated queries above also filter Gene nodes by a
# ``name`` property; only ``tair`` is exposed here -- confirm this is intended.
properties = {"Gene": ["tair"], "FunctionalCluster": ["name"]}

### What proteins negatively regulate ORA59 transcription?

In [56]:
# Question for the interactive section. A separate conversation is created
# here; it is passed to prompt_engine._generate_query further down.
question = 'What proteins negatively regulate ORA59 transcription?'
conversation = create_conversation()

In [64]:
# Build the KG query prompt for the current question and render it.
kg_prompt = prompt_engine.generate_query_prompt(question)
HTML(kg_prompt)

In [65]:
# Run the agent and display its final query and result.
# NOTE(review): the recorded log shows the agent trying several different
# relationship names before the output is truncated -- verify the final answer.
cypher_query = agent.execute(question, kg_prompt)
print_cypher_and_answer(cypher_query)

content='' additional_kwargs={'tool_calls': [{'id': 'call_SkRnnRJ4JBj88w3s8sKucYhV', 'function': {'a ...
content='{"query": "MATCH (g:Gene)-[:TranscriptionalInhibition]->(:FunctionalCluster {name: \'ORA59\ ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_Uq9JWHOfPCThOoWwzcv7ugVJ', 'function': {'a ...
content='{"query": "MATCH (g:Gene)-[:TranscriptionalInhibition]->(f:FunctionalCluster) WHERE f.name  ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_zrlLFRzPgr1sOGRfMa1ZcXUF', 'function': {'a ...
content='{"query": "MATCH (g:Gene)-[:TranscriptionalRegulation]->(f:FunctionalCluster) WHERE f.name  ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_4UZVBkQaR5ayJD6v8veGBxBB', 'function': {'a ...
content='{"query": "MATCH (g:Gene)-[:Regulates]->(f:FunctionalCluster {name: \'ORA59\'}) RETURN g.na ...
content='' additional_kwargs={'tool_calls': [{'id': 'call_Je2JaK5zgNaJSZMwkHOaxgUD', 'function': {'a ...
content='{"query": "MATCH (g:Protein)-[:Interacts_With]

In [63]:
# Generate a Cypher query directly from the hand-picked entities,
# relationships and properties instead of going through the agent.
# NOTE(review): _generate_query is underscore-prefixed, i.e. not part of the
# prompt engine's public interface.
query = prompt_engine._generate_query(
    question=question,
    entities=entities,
    relationships=relationships,
    properties=properties,
    query_language="Cypher",
    conversation=conversation
)
print(query)

MATCH (:Gene)-[:TranscriptionalInhibition]->(:Gene {tair: "ORA59"}) RETURN DISTINCT Gene;


In [59]:
# Handler for explaining and refining the generated query.
# The KG selection reuses the shared ``entities`` and ``properties`` defined
# with ``relationships``, so the handler sees the same schema subset the query
# was generated from (the inline copy omitted the FunctionalCluster ``name``
# property).
query_handler = BioCypherQueryHandler(
    query=query,
    query_lang="Cypher",
    kg_selected={
        'entities': entities,
        'relationships': relationships,
        'properties': properties,
    },
    question=question
)

In [60]:
# Ask the handler for a natural-language explanation of the query and print it.
explanation = query_handler.explain_query()
print(explanation)

This query matches genes that are connected to ORA59 through a TranscriptionalInhibition relationship and returns the distinct genes that negatively regulate ORA59 transcription.


In [61]:
# Ask the handler to revise the query according to a free-text request and
# print the revised query.
request_update = "The result should be a list of tair identifiers."
new_query = query_handler.update_query(request_update)
print(new_query)

MATCH (g:Gene)-[:TranscriptionalInhibition]->(:Gene {tair: "ORA59"}) 
RETURN DISTINCT g.tair;


In [62]:
# Request an explanation again after the update.
# NOTE(review): the recorded output is identical to the explanation printed
# before the update -- confirm whether explain_query() uses the revised query.
explanation = query_handler.explain_query()
print(explanation)

This query matches genes that are connected to ORA59 through a TranscriptionalInhibition relationship and returns the distinct genes that negatively regulate ORA59 transcription.


### Which genes does MYC2 transcriptionally regulate?

In [21]:
# New question, addressing the regulator by name; start from a fresh conversation.
question = "Which nodes does MYC2 transcriptionally regulate?"
conversation = create_conversation()

In [22]:
# Generate and print a Cypher query for the current question from the
# hand-picked schema subset (same call as for the ORA59 question).
query = prompt_engine._generate_query(
    question=question,
    entities=entities,
    relationships=relationships,
    properties=properties,
    query_language="Cypher",
    conversation=conversation
)
print(query)

MATCH (fc1:FunctionalCluster)-[:TranscriptionalActivation|:TranscriptionalInhibition]->(fc2:FunctionalCluster)
WHERE fc1.name = 'MYC2'
RETURN fc2


In [27]:
# Same question, but addressing the regulator by the identifier AT1G32640;
# a fresh conversation is created for it.
question = "Which nodes does AT1G32640 transcriptionally regulate?"
conversation = create_conversation()

In [28]:
# Generate and print a Cypher query for the identifier-based question.
query = prompt_engine._generate_query(
    question=question,
    entities=entities,
    relationships=relationships,
    properties=properties,
    query_language="Cypher",
    conversation=conversation
)
print(query)

MATCH (g:Gene {tair: 'AT1G32640'})-[:GeneBelongsToFunctionalCluster]->(fc:FunctionalCluster)-[:TranscriptionalActivation|:TranscriptionalInhibition]->(targetFC:FunctionalCluster)<-[:GeneBelongsToFunctionalCluster]-(targetGene:Gene)
RETURN targetGene


In [27]:
# NOTE(review): this cell repeats the preceding AT1G32640 question cell
# verbatim (same execution count) -- likely an accidental duplicate.
question = "Which nodes does AT1G32640 transcriptionally regulate?"
conversation = create_conversation()

In [28]:
# NOTE(review): verbatim repeat of the preceding query-generation cell for
# AT1G32640 (same execution count) -- likely an accidental duplicate.
query = prompt_engine._generate_query(
    question=question,
    entities=entities,
    relationships=relationships,
    properties=properties,
    query_language="Cypher",
    conversation=conversation
)
print(query)

MATCH (g:Gene {tair: 'AT1G32640'})-[:GeneBelongsToFunctionalCluster]->(fc:FunctionalCluster)-[:TranscriptionalActivation|:TranscriptionalInhibition]->(targetFC:FunctionalCluster)<-[:GeneBelongsToFunctionalCluster]-(targetGene:Gene)
RETURN targetGene


In [23]:
# Handler for explaining and refining the most recently generated query.
# The KG selection reuses the shared ``entities`` and ``properties`` defined
# with ``relationships``, so the handler sees the same schema subset the query
# was generated from (the inline copy omitted the FunctionalCluster ``name``
# property).
query_handler = BioCypherQueryHandler(
    query=query,
    query_lang="Cypher",
    kg_selected={
        'entities': entities,
        'relationships': relationships,
        'properties': properties,
    },
    question=question
)

In [18]:
# Ask the handler to revise the query so that it first resolves the gene's
# functional cluster, then print the revised query.
# NOTE(review): the execution counts around this cell are non-sequential, so
# the recorded output may stem from a different query than the one generated
# directly above -- re-run top to bottom to confirm.
request_update = """
First find the functional cluster the gene belongs. 
"""
new_query = query_handler.update_query(request_update)
print(new_query)

MATCH (g:Gene {database: 'tair'})-[:GeneToFunctionalCluster]->(fc:FunctionalCluster)
WHERE fc.name = "MYC2"
RETURN g, fc


In [26]:
# Show the handler's explanation as the cell's output value (no print).
query_handler.explain_query()

'This query retrieves genes that are transcriptionally activated by the functional cluster named "MYC2".'