In [212]:
from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser
from langchain.prompts import BaseChatPromptTemplate
from langchain.utilities import SerpAPIWrapper
from langchain.chains.llm import LLMChain
# from langchain.chat_models import ChatOpenAI
from langchain_openai.chat_models import ChatOpenAI
from typing import List, Union
from langchain_core.agents import AgentAction, AgentFinish

from langchain.schema import AgentAction, AgentFinish, HumanMessage
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_core.messages import BaseMessage, HumanMessage
from typing import TypedDict, Annotated, List, Union


import operator

from langchain.tools import tool
from langchain import hub
from langgraph.graph import StateGraph, END
from langgraph.prebuilt.tool_executor import ToolExecutor

from langchain.pydantic_v1 import BaseModel, Field
from langchain.tools import BaseTool, StructuredTool, tool

from typing import (
    List,
    Tuple
)
import re

from RdfGraphCustom import RdfGraph
from smile_resolver import smiles_to_inchikey
from chemical_resolver import ChemicalResolver
from target_resolver import target_name_to_target_id
from taxon_resolver import TaxonResolver

from sparql import GraphSparqlQAChain
import os

from langchain.agents import  Tool
# Tracing configuration, exported through environment variables so that the
# LangChain runs triggered below are reported to the endpoint configured here.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        # Change the project name to collect the traces in a separate project.
        "LANGCHAIN_PROJECT": "KGBot Testing - GPT4",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)

In [213]:
# Initialize model
# SPARQL endpoint of the ENPKG repository; handed to the RdfGraph wrapper below.
endpoint_url = 'https://enpkg.commons-lab.org/graphdb/repositories/ENPKG'

# Graph wrapper used by the SPARQL chain (project-local RdfGraphCustom module).
graph = RdfGraph(
    query_endpoint=endpoint_url,
    standard="rdf")

# Show the schema exposed by the graph wrapper; the printed output is verbose.
print(graph.get_schema)

# Sampling settings shared by every component that receives `llm`.
temperature = 0
model_id = "gpt-4" 

# https://api.python.langchain.com/en/latest/chat_models/langchain_community.chat_models.openai.ChatOpenAI.html?highlight=chatopenai#
llm = ChatOpenAI(temperature=temperature, 
                    model=model_id, # default is 'gpt-3.5-turbo'
                    max_retries=3,
                    verbose=True,
                    # Extra sampling parameters forwarded through model_kwargs.
                    model_kwargs={
                        "top_p": 0.95,
                        },
                        streaming=True # fixme: remove 
                    )

# Chain that answers questions against `graph` using `llm`
# (project-local `sparql` module); exposed below as SPARQL_QUERY_RUNNER.
sparql_chain = GraphSparqlQAChain.from_llm(
    llm, graph=graph, verbose=True
)

# Entity resolvers exposed as agent tools below: the chemical resolver is
# built from the LLM, the taxon resolver is constructed without arguments.
chem_res = ChemicalResolver.from_llm(llm=llm, verbose=True)
taxon_res = TaxonResolver()

class ChemicalInput(BaseModel):
    # Argument schema of the CHEMICAL_RESOLVER tool: one required string.
    # (A `#` comment is used on purpose so the generated schema is unchanged.)
    query: str = Field(..., description="natural product compound string")



class SparqlInput(BaseModel):
    # Argument schema of the SPARQL_QUERY_RUNNER tool: the user's question
    # plus the entities resolved by the other tools, both required strings.
    query: str = Field(
        ...,
        description="the original question from the user",
    )
    entities: str = Field(
        ...,
        description="strings containing for all entities, entity name and the corresponding entity identifier",
    )


# Tools offered to the agent. The order of this list is also the order in
# which the tool names are listed in the prompts built further down.
tools = [
    # Natural-product name -> InChIKey or NPC class (LLM-backed resolver).
    StructuredTool.from_function(
        name="CHEMICAL_RESOLVER",
        description="The function takes a natural product compound string as input and returns a InChIKey, if InChIKey not found, it returns the NPCClass, NPCPathway or NPCSuperClass.",
        func=chem_res.run,
        args_schema=ChemicalInput,
    ),
    # Taxon name -> Wikidata identifier (no explicit args_schema is passed).
    StructuredTool.from_function(
        name="TAXON_RESOLVER",
        description="The function takes a taxon string as input and returns the wikidata ID.",
        func=taxon_res.query_wikidata,
    ),
    # Target name -> ChEMBL target IRI.
    StructuredTool.from_function(
        name="TARGET_RESOLVER",
        description="The function takes a target string as input and returns the ChEMBLTarget IRI.",
        func=target_name_to_target_id,
    ),
    # SMILES string -> InChIKey.
    StructuredTool.from_function(
        name="SMILE_CONVERTER",
        description="The function takes a SMILES string as input and returns the InChIKey notation of the molecule.",
        func=smiles_to_inchikey,
    ),
    # Question + resolved entities -> answer from the knowledge graph.
    # (return_direct=True was tried here and is intentionally left disabled.)
    StructuredTool.from_function(
        name="SPARQL_QUERY_RUNNER",
        description="The agent resolve the user's question by querying the knowledge graph database. Input should be a question and the resolved entities in the question.",
        func=sparql_chain.run,
        args_schema=SparqlInput,
    ),
]

# Names only, in list order; interpolated into the prompt text later on.
tool_names = [registered_tool.name for registered_tool in tools]



list of namespaces []
identifier , Nf8012c900e2147338dff108a10eb7e99
query PREFIX brick: <https://brickschema.org/schema/Brick#>
PREFIX csvw: <http://www.w3.org/ns/csvw#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX dcmitype: <http://purl.org/dc/dcmitype/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX dcam: <http://purl.org/dc/dcam/>
PREFIX doap: <http://usefulinc.com/ns/doap#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
PREFIX odrl: <http://www.w3.org/ns/odrl/2/>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX prof: <http://www.w3.org/ns/dx/prof/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX qb: <http://purl.org/linked-data/cube#>
PREFIX schema: <https://schema.org/>
PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX ssn: <http://www.w3.org/ns/ssn/>
PREFIX time: <http://www.w3.org/2006/tim

(<http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt>, < does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt>, <http://www.w3.org/2000/01/rdf-schema#Container does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt>, <http://www.w3.org/2000/01/rdf-schema#Container does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt>, <http://www.w3.org/2000/01/rdf-schema#Container>) does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag>, < does not look like a valid URI, trying to serialize this will break.
(<http://www.w

query PREFIX brick: <https://brickschema.org/schema/Brick#>
PREFIX csvw: <http://www.w3.org/ns/csvw#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX dcmitype: <http://purl.org/dc/dcmitype/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX dcam: <http://purl.org/dc/dcam/>
PREFIX doap: <http://usefulinc.com/ns/doap#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
PREFIX odrl: <http://www.w3.org/ns/odrl/2/>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX prof: <http://www.w3.org/ns/dx/prof/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX qb: <http://purl.org/linked-data/cube#>
PREFIX schema: <https://schema.org/>
PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX ssn: <http://www.w3.org/ns/ssn/>
PREFIX time: <http://www.w3.org/2006/time#>
PREFIX vann: <http://purl.org/vocab/vann/>
PREFIX void: <http://r

(<http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral>, < does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral>, <http://www.w3.org/2000/01/rdf-schema#Literal does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral>, <http://www.w3.org/2000/01/rdf-schema#Literal does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral>, <http://www.w3.org/2000/01/rdf-schema#Literal>) does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/2000/01/rdf-schema#Datatype does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/2000/01/rdf-schema#Datatype>, < does not look like a valid URI, trying to serialize this will break.
(<http://www.w3.org/2000/01/rdf-schema#Datatype>, <http://www.w3.org/2000/01/rdf-schema#Class does not look like

The namespace prefixes are:
brick: https://brickschema.org/schema/Brick#, csvw: http://www.w3.org/ns/csvw#, dc: http://purl.org/dc/elements/1.1/, dcat: http://www.w3.org/ns/dcat#, dcmitype: http://purl.org/dc/dcmitype/, dcterms: http://purl.org/dc/terms/, dcam: http://purl.org/dc/dcam/, doap: http://usefulinc.com/ns/doap#, foaf: http://xmlns.com/foaf/0.1/, geo: http://www.opengis.net/ont/geosparql#, odrl: http://www.w3.org/ns/odrl/2/, org: http://www.w3.org/ns/org#, prof: http://www.w3.org/ns/dx/prof/, prov: http://www.w3.org/ns/prov#, qb: http://purl.org/linked-data/cube#, schema: https://schema.org/, sh: http://www.w3.org/ns/shacl#, skos: http://www.w3.org/2004/02/skos/core#, sosa: http://www.w3.org/ns/sosa/, ssn: http://www.w3.org/ns/ssn/, time: http://www.w3.org/2006/time#, vann: http://purl.org/vocab/vann/, void: http://rdfs.org/ns/void#, wgs: https://www.w3.org/2003/01/geo/wgs84_pos#, owl: http://www.w3.org/2002/07/owl#, rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#, rdfs: htt

In [214]:
# Ordered step-by-step instructions for the agent. Each key is rendered
# verbatim as the step label ("<key>: <text>") when the template is filled,
# so every key uses the same "Step N" spelling. The instruction texts are
# runtime prompt strings and are kept exactly as written.
instructions = {

    "Step 1": """
        If the question ask anything about any entities that could be natural product compound, find the relevant IRI to this chemical class using CHEMICAL_RESOLVER. Input is the chemical class name.
        """,

    "Step 2": """
        If a taxon is mentionned, find what is its wikidata IRI with TAXON_RESOLVER. Input is the taxon name.
        """,

    "Step 3": """
        If a target is mentionned, find the ChEMBLTarget IRI of the target with TARGET_RESOLVER. Input is the target name.
        """,

    "Step 4": """
        If a SMILE structure is mentionned, find what is the InChIKey notation of the molecule with SMILE_CONVERTER. Input is the SMILE structure.
                """,

    "Step 5": """
        Give me units relevant to numerical values in this question. Return nothing if units for value is not provided.
        Be sure to say that these are the units of the quantities found in the knowledge graph.
        Here is the list of units to find:
            "retention time": "minutes",
            "activity value": null, 
            "feature area": "absolute count or intensity", 
            "relative feature area": "normalized area in percentage", 
            "parent mass": "ppm (parts-per-million) for m/z",
            "mass difference": "delta m/z", 
            "cosine": "score from 0 to 1. 1 = identical spectra. 0 = completely different spectra"
        """,
    "Step 6": """
        Use SPARQL_QUERY_RUNNER tool to run a SPARQL query on the knowledge graph. Input contains the user question + all the resolved entities and units found in previous steps.
    """,
    "Step 7": """
        Repeat all steps from begining if no results, but try to self-correct first. Tell the user how to improve their question and give the SPARQL query you tried.
    """,
    "Step 8": """
        Give the answer to the user.
    """,
}
# ReAct-style prompt skeleton (Thought / Action / Action Input / Observation).
# It carries two str.format placeholders, {tool_names} (used twice) and
# {instructions}; both are filled in a later cell. In the cells shown here the
# filled template is not handed to the ChatPromptTemplate, which is built from
# `system_message` instead.
template = """You are trying to answer a question using the Experiment Natural Products Knowledge Graph (ENPKG).
You have access to the following tools:

{tool_names}

Here are your instructions:

{instructions}


Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

These were previous tasks you completed:



Begin!
"""

In [215]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# System prompt of the agent. {tool_names} is filled right here with
# str.format, one tool name per line. The resulting text is then used as the
# "system" message of the chat prompt, where `{...}` marks template variables
# (see the "{input}" message), so the text itself must not contain braces.
system_message = """You are an entity resolution agent for the SPARQL_QUERY_RUNNER.
You have access to the following tools:
{tool_names}

If the question ask anything about any entities that could be natural product compound, find the relevant IRI to this chemical class using CHEMICAL_RESOLVER. Input is the chemical class name.

If a taxon is mentionned, find what is its wikidata IRI with TAXON_RESOLVER. Input is the taxon name.

If a target is mentionned, find the ChEMBLTarget IRI of the target with TARGET_RESOLVER. Input is the target name.

If a SMILE structure is mentionned, find what is the InChIKey notation of the molecule with SMILE_CONVERTER. Input is the SMILE structure.
        
Give me units relevant to numerical values in this question. Return nothing if units for value is not provided.
Be sure to say that these are the units of the quantities found in the knowledge graph.
Here is the list of units to find:
    "retention time": "minutes",
    "activity value": null, 
    "feature area": "absolute count or intensity", 
    "relative feature area": "normalized area in percentage", 
    "parent mass": "ppm (parts-per-million) for m/z",
    "mass difference": "delta m/z", 
    "cosine": "score from 0 to 1. 1 = identical spectra. 0 = completely different spectra"


Use SPARQL_QUERY_RUNNER tool to answer the question. Input contains the user question + the list of tuples of strings of the resolved entities and units found in previous steps.

If no results tell the user how to improve their question and give the SPARQL query that have been returned by the SPARQL_QUERY_RUNNER.

Give the answer to the user.
        
""".format(tool_names="\n".join(tool_names))
# Minimal alternative kept for quick experiments:
# system_message = "You are a helpful assistant"

# Fill the ReAct-style template: every instruction becomes one
# "<label>: <text>" entry and the tool names are listed one per line.
template = template.format(
    instructions="\n".join(
        f"{label}: {text}" for label, text in instructions.items()
    ),
    tool_names="\n".join(tool_names),
)

# Chat prompt consumed by the agent: system text, optional prior
# conversation, the user's question, then the agent's scratchpad messages.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    MessagesPlaceholder("chat_history", optional=True),
    ("human", "{input}"),
    MessagesPlaceholder("agent_scratchpad"),
])

# Print the assembled prompt for a visual check.
prompt.pretty_print()


You are an entity resolution agent for the SPARQL_QUERY_RUNNER.
You have access to the following tools:
CHEMICAL_RESOLVER
TAXON_RESOLVER
TARGET_RESOLVER
SMILE_CONVERTER
SPARQL_QUERY_RUNNER

If the question ask anything about any entities that could be natural product compound, find the relevant IRI to this chemical class using CHEMICAL_RESOLVER. Input is the chemical class name.

If a taxon is mentionned, find what is its wikidata IRI with TAXON_RESOLVER. Input is the taxon name.

If a target is mentionned, find the ChEMBLTarget IRI of the target with TARGET_RESOLVER. Input is the target name.

If a SMILE structure is mentionned, find what is the InChIKey notation of the molecule with SMILE_CONVERTER. Input is the SMILE structure.
        
Give me units relevant to numerical values in this question. Return nothing if units for value is not provided.
Be sure to say that these are the units of the quantities found in the knowledge graph.
Here is the list of units to find:
    "retention

In [216]:
from langchain.agents import create_openai_functions_agent
# Agent runnable that decides, from the prompt and the available tools,
# which tool to call next or when to finish; invoked by run_agent().
agent_runnable = create_openai_functions_agent(
    llm=llm,
    tools=tools,
    prompt=prompt,
)
class AgentState(TypedDict):
   """State shared by the nodes of the agent graph (schema given to StateGraph)."""
   # The input string
   input: str
   # The list of previous messages in the conversation
   chat_history: list[BaseMessage]
   # The outcome of a given call to the agent.
   # Needs `None` as a valid type, since this is what this will start as.
   agent_outcome: Union[AgentAction, AgentFinish, None]
   # List of actions and corresponding observations
   # Here we annotate this with `operator.add` to indicate that operations to
   # this state should be ADDED to the existing values (not overwrite it)
   intermediate_steps: Annotated[list[tuple[AgentAction, str]], operator.add]


# Executor over the tool list; execute_tools() calls its .invoke() with the
# action chosen by the agent.
tool_executor = ToolExecutor(tools)

# Define the agent
def run_agent(data):
    """Graph node: ask the agent for its next move.

    Args:
        data: current graph state, passed unchanged to ``agent_runnable``.

    Returns:
        A state update whose ``agent_outcome`` entry holds whatever the agent
        runnable returned.
    """
    return {"agent_outcome": agent_runnable.invoke(data)}



In [217]:
# Define the function to execute tools
def execute_tools(data):
    """Graph node: run the tool call the agent just requested.

    Args:
        data: current graph state; its ``agent_outcome`` entry (written by the
            agent node) is the action to execute.

    Returns:
        A state update with a one-element ``intermediate_steps`` list holding
        the ``(action, observation)`` pair, the observation being the tool
        output converted to ``str``.
    """
    requested_action = data['agent_outcome']
    observation = str(tool_executor.invoke(requested_action))
    return {"intermediate_steps": [(requested_action, observation)]}
# Define logic that will be used to determine which conditional edge to go down
def should_continue(data):
    """Pick the conditional edge to follow after the agent node.

    Args:
        data: current graph state; only ``agent_outcome`` is inspected.

    Returns:
        ``"end"`` when the agent produced an ``AgentFinish``, otherwise
        ``"continue"`` (the outcome is then an action still to be executed).
        These strings are the keys of the mapping given to the graph's
        conditional edges.
    """
    is_finished = isinstance(data['agent_outcome'], AgentFinish)
    return "end" if is_finished else "continue"
    

# Define the graph

In [218]:
# Build the agent loop as a state graph over AgentState.
workflow = StateGraph(AgentState)

# Two nodes that alternate: "agent" decides, "action" executes the tool call.
workflow.add_node("agent", run_agent)
workflow.add_node("action", execute_tools)

# Execution always starts with the agent's decision.
workflow.set_entry_point("agent")

# After each "agent" step, should_continue() returns one of the keys below;
# the matching value names the node to run next. END is the special marker
# that stops the graph.
workflow.add_conditional_edges(
    "agent",
    should_continue,
    {
        "continue": "action",  # a tool call is pending -> execute it
        "end": END,            # the agent finished -> stop
    },
)

# Unconditional edge: once a tool has run, control goes back to the agent.
workflow.add_edge("action", "agent")

# Compile into a runnable that can be used like any other LangChain runnable.
app = workflow.compile()

In [219]:
# Inspect the channels the graph derived from AgentState. In the recorded
# output, `intermediate_steps` (annotated with operator.add) is the only
# aggregating channel; the other keys are last-value channels.
workflow.channels

{'input': <langgraph.channels.last_value.LastValue at 0x2e06cc690>,
 'chat_history': <langgraph.channels.last_value.LastValue at 0x2c0a909d0>,
 'agent_outcome': <langgraph.channels.last_value.LastValue at 0x2e06cc890>,
 'intermediate_steps': <langgraph.channels.binop.BinaryOperatorAggregate at 0x2e06cf050>}

In [220]:

# Benchmark questions for the agent. Long questions are split over several
# adjacent literals (implicit concatenation); the resulting strings are
# unchanged, including their original spelling and spacing.
q1 = (
    'How many features (pos ionization and neg ionization modes) have the same '
    'SIRIUS/CSI:FingerID and ISDB annotation by comparing the InCHIKey of the annotations?'
)
q1_bis = (
    'How many features (pos ionization and neg ionization modes) have the same '
    'SIRIUS/CSI:FingerID and ISDB annotation by comparing the InCHIKey2D of the annotations?'
)
q2 = (
    'Which extracts have features (pos ionization mode) annotated as the class, '
    'aspidosperma-type alkaloids, by CANOPUS with a probability score above 0.5, '
    'ordered by the decresing count of features as aspidosperma-type alkaloids? '
    'Group by extract.'
)
q3 = (
    'Among the structural annotations from the Tabernaemontana coffeoides (Apocynaceae) '
    'seeds extract taxon , which ones contain an aspidospermidine substructure, '
    'CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45?'
)
q4 = (
    'Among the SIRIUS structural annotations from the Tabernaemontana coffeoides '
    '(Apocynaceae) seeds extract taxon, which ones are reported in the '
    'Tabernaemontana genus in Wikidata?'
)
q5 = (
    'Which compounds have annotations with chembl assay results indicating reported '
    'activity against T. cruzi by looking at the cosmic, zodiac and taxo scores?'
)
q6 = (
    'Filter the pos ionization mode features of the Melochia umbellata taxon annotated '
    'as [M+H]+ by SIRIUS to keep the ones for which a feature in neg ionization mode '
    'is detected with the same retention time (+/- 3 seconds) and a mass corresponding '
    'to the [M-H]- adduct (+/- 5ppm).'
)
q7 = (
    'For features from the Melochia umbellata taxon in pos ionization mode with SIRIUS '
    'annotations, get the ones for which a feature in neg ionization mode with the same '
    'retention time (+/- 3 seconds) has the same SIRIUS annotation by comparing the '
    'InCHIKey 2D. Return the features, retention times, and InChIKey2D'
)
q8 = (
    "Which features were annotated as 'Tetraketide meroterpenoids' by SIRIUS, "
    "and how many such features were found for each species and plant part?"
)
q9 = "What are all distinct submitted taxons for the extracts in the knowledge graph?"
q10 = "What are the taxons, lab process and label (if one exists) for each sample? Sort by sample and then lab process"
q11 = "Count all the species per family in the collection"


In [221]:
# Run the graph on one benchmark question with an empty conversation history.
inputs = dict(input=q7, chat_history=[])

# Stream the run and print the first value of every emitted chunk, followed
# by a separator line. (Alternative kept from the original: app.invoke(inputs)
# runs the graph without streaming.)
for step in app.stream(inputs):
    print([*step.values()][0])
    print("----")


{'agent_outcome': AgentActionMessageLog(tool='TAXON_RESOLVER', tool_input={'taxon_name': 'Melochia umbellata'}, log="\nInvoking: `TAXON_RESOLVER` with `{'taxon_name': 'Melochia umbellata'}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\n  "taxon_name": "Melochia umbellata"\n}', 'name': 'TAXON_RESOLVER'}})])}
----
{'intermediate_steps': [(AgentActionMessageLog(tool='TAXON_RESOLVER', tool_input={'taxon_name': 'Melochia umbellata'}, log="\nInvoking: `TAXON_RESOLVER` with `{'taxon_name': 'Melochia umbellata'}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\n  "taxon_name": "Melochia umbellata"\n}', 'name': 'TAXON_RESOLVER'}})]), 'wikidata IRI is http://www.wikidata.org/entity/Q6813281')]}
----
{'agent_outcome': AgentActionMessageLog(tool='SPARQL_QUERY_RUNNER', tool_input={'query': 'For features from the Melochia umbellata taxon in pos ionization mode with SIRIUS annotations, get the ones for 