In [1]:
#remove
#from dotenv import load_dotenv
import os

# Load the environment variables from the .env file
#load_dotenv()

In [2]:
# langchain imports for agent and prompt handling
from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser 
from langchain.prompts import BaseChatPromptTemplate
from langchain.utilities import SerpAPIWrapper
from langchain.chains.llm import LLMChain
from langchain_openai import ChatOpenAI
from langchain.agents import create_openai_tools_agent
from langchain import hub

# langgraph imports for prebuilt tool invocation
from langgraph.prebuilt import ToolInvocation
from langgraph.graph import StateGraph, END

# langchain_core imports for message handling and action schema
from langchain_core.messages import BaseMessage, HumanMessage, FunctionMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder 
from langchain.schema import AgentAction, AgentFinish, HumanMessage

# langchain output parser for OpenAI functions
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

# typing imports for type hinting
from typing import Annotated, List, Tuple, Union,  Any, Dict, Optional, Sequence, TypedDict
import operator
import functools


# Standard library imports for JSON and regular expressions
import json
import re

# Custom imports for RDF graph manipulation and chemical, target, taxon, and SPARQL resolution
from RdfGraphCustom import RdfGraph
from smile_resolver import smiles_to_inchikey
from chemical_resolver import ChemicalResolver
from target_resolver import target_name_to_target_id
from taxon_resolver import TaxonResolver
from sparql import GraphSparqlQAChain
from custom_sqlite_streamlit import SqliteSaver

# langchain pydantic for base model definitions
from langchain.pydantic_v1 import BaseModel, Field

# langchain tools for base, structured tool definitions, and tool decorators
from langchain.tools import BaseTool, StructuredTool, tool

# Standard library import for object serialization
import pickle

In [3]:
#remove
# #Setting up the LangSmith
# #For now, all runs will be stored in the "KGBot Testing - GPT4"
# #If you want to separate the traces to have a better control of specific traces.
# #Metadata as llm version and temperature can be obtaneid from traces. 
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"KGBot Testing - GPT4" #Please update the name here if you want to create a new project for separating the traces. 
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
from langsmith import Client
client = Client()

# #Check if the client was initialized
print(client)

Client (API URL: https://api.smith.langchain.com)


In [4]:
####################### Instantiate the graph #######################

endpoint_url = 'https://enpkg.commons-lab.org/graphdb/repositories/ENPKG' #remove
#create function put to main
# init with the endpoint url
graph = RdfGraph(
    query_endpoint=endpoint_url,
    standard="rdf")

# #OR

# graph = RdfGraph(
#     query_endpoint=endpoint_url,
#     standard="rdf",
#     schema_file='../graphs/schema.ttl')


# save the graph on disk
with open('../graphs/graph.pkl', 'wb') as output_file:
    pickle.dump(graph, output_file)
    

####################### Load the graph from disk #######################
    
# # load the graph from disk
# with open('../graphs/graph.pkl', 'rb') as input_file:
#     graph = pickle.load(input_file)
    
print(graph.get_schema)


2024-03-07 15:13:39,071 - INFO - query 
    SELECT DISTINCT ?cls ?com ?label
        WHERE {
            ?cls a rdfs:Class . 
            OPTIONAL { ?cls rdfs:comment ?com }
            OPTIONAL { ?cls rdfs:label ?label }
        }
        GROUP BY ?cls ?com ?label
    
Adding classes to graph: 100%|████████████████████████████████████████████████████| 50/50 [00:16<00:00,  3.02it/s]
2024-03-07 15:13:55,880 - INFO - namespaces [('brick', 'https://brickschema.org/schema/Brick#'), ('csvw', 'http://www.w3.org/ns/csvw#'), ('dc', 'http://purl.org/dc/elements/1.1/'), ('dcat', 'http://www.w3.org/ns/dcat#'), ('dcmitype', 'http://purl.org/dc/dcmitype/'), ('dcterms', 'http://purl.org/dc/terms/'), ('dcam', 'http://purl.org/dc/dcam/'), ('doap', 'http://usefulinc.com/ns/doap#'), ('foaf', 'http://xmlns.com/foaf/0.1/'), ('geo', 'http://www.opengis.net/ont/geosparql#'), ('odrl', 'http://www.w3.org/ns/odrl/2/'), ('org', 'http://www.w3.org/ns/org#'), ('prof', 'http://www.w3.org/ns/dx/prof/'), ('prov', 'h

The namespace prefixes are: [('brick', 'https://brickschema.org/schema/Brick#'), ('csvw', 'http://www.w3.org/ns/csvw#'), ('dc', 'http://purl.org/dc/elements/1.1/'), ('dcat', 'http://www.w3.org/ns/dcat#'), ('dcmitype', 'http://purl.org/dc/dcmitype/'), ('dcterms', 'http://purl.org/dc/terms/'), ('dcam', 'http://purl.org/dc/dcam/'), ('doap', 'http://usefulinc.com/ns/doap#'), ('foaf', 'http://xmlns.com/foaf/0.1/'), ('geo', 'http://www.opengis.net/ont/geosparql#'), ('odrl', 'http://www.w3.org/ns/odrl/2/'), ('org', 'http://www.w3.org/ns/org#'), ('prof', 'http://www.w3.org/ns/dx/prof/'), ('prov', 'http://www.w3.org/ns/prov#'), ('qb', 'http://purl.org/linked-data/cube#'), ('schema', 'https://schema.org/'), ('sh', 'http://www.w3.org/ns/shacl#'), ('skos', 'http://www.w3.org/2004/02/skos/core#'), ('sosa', 'http://www.w3.org/ns/sosa/'), ('ssn', 'http://www.w3.org/ns/ssn/'), ('time', 'http://www.w3.org/2006/time#'), ('vann', 'http://purl.org/vocab/vann/'), ('void', 'http://rdfs.org/ns/void#'), ('wgs

In [5]:
# Initial setup: defining the temperature for the model and specifying model IDs for GPT-4 usage
temperature = 0 # Temperature setting for the GPT models, influencing randomness in responses.
model_id_gpt4 = "gpt-4" 
model_id = "gpt-4-0125-preview"

# Create instances of the ChatOpenAI class for interacting with the GPT-4 models.
# These instances are configured for specific model versions, with retries and verbose logging enabled.
#defining gpt4 llm for supervisor
llm_gpt4 = ChatOpenAI(temperature=temperature, 
                    model=model_id_gpt4, 
                    max_retries=3,
                    verbose=True
                    ) # Instance for default GPT-4 model

llm = ChatOpenAI(temperature=temperature, 
                    model=model_id, # This instance uses a specific GPT-4 turbo model.
                    max_retries=3,
                    verbose=True,
                    ) # Instance for GPT-4 0125-preview model.


# Setup a GraphSparqlQAChain instance for executing SPARQL queries against a knowledge graph.
# This instance utilizes the llm model for processing and the graph object for data querying.
sparql_chain = GraphSparqlQAChain.from_llm(
    llm_gpt4, graph=graph, verbose=True
)

# Initialize chemical and taxon resolver tools with the llm model for specialized query processing.
chem_res = ChemicalResolver.from_llm(llm=llm, verbose=True)
taxon_res = TaxonResolver()

# Pydantic models for structured input to the resolver tools.
class ChemicalInput(BaseModel):
    query: str = Field(description="natural product compound string")


class SparqlInput(BaseModel):
    question: str = Field(description="the original question from the user")
    entities: str = Field(description="strings containing for all entities, entity name and the corresponding entity identifier")


# Define a list of structured tools for chemical, taxon, target, and SMILES conversion resolution.
#def tools_resolver_creator():
tools_resolver = [
        StructuredTool.from_function(
            name = "CHEMICAL_RESOLVER",
            func = chem_res.run,
            description="The function takes a natural product compound string as input and returns a InChIKey, if InChIKey not found, it returns the NPCClass, NPCPathway or NPCSuperClass.",
            args_schema=ChemicalInput,
        ),
        StructuredTool.from_function(
            name = "TAXON_RESOLVER",
            func=taxon_res.query_wikidata,
            description="The function takes a taxon string as input and returns the wikidata ID.",
        ),
        StructuredTool.from_function(
            name = "TARGET_RESOLVER",
            func=target_name_to_target_id,
            description="The function takes a target string as input and returns the ChEMBLTarget IRI.",
        ),
        StructuredTool.from_function(
            name = "SMILE_CONVERTER",
            func=smiles_to_inchikey,
            description="The function takes a SMILES string as input and returns the InChIKey notation of the molecule.",
        )
        ]
   # return tools_resolver


# Define a list for the SPARQL query runner tool, used for executing knowledge graph queries.   
tool_sparql = [  
     StructuredTool.from_function(
        name = "SPARQL_QUERY_RUNNER",
        func=sparql_chain.run,
        description="The agent resolve the user's question by querying the knowledge graph database. Input should be a question and the resolved entities in the question.",
        args_schema=SparqlInput,
        # return_direct=True,
    )
]


# Generate a list of tool names for reference or display purposes.
tool_names = [tool.name for tool in tools_resolver] # List of tool names from the resolver tools.




#  Helper Utilities to create agent

In [6]:
def create_agent(llm: ChatOpenAI, tools: list, system_prompt: str):
    """
    Creates an AgentExecutor with LLM, set of tools, and system prompt.

    This function initializes a chat prompt template with a system message, placeholders for messages,
    and an agent scratchpad. It then creates an agent using the specified LLM and tools, 
    and wraps this agent in an AgentExecutor for execution.

    Parameters:
    - llm (ChatOpenAI): The language model to be used by the agent for generating responses.
    - tools (list): A list of tools (functions or utilities) that the agent can use to perform actions or generate responses.
    - system_prompt (str): A string that provides initial instructions or information to the agent. This is used to set up the context for the agent's operations.

    Returns:
    - AgentExecutor: An executor object that manages the execution of the agent, allowing the agent to process input and use tools as defined.
    """
    

    # Initialize a ChatPromptTemplate with a system message, placeholders for incoming messages, and an agent scratchpad.
    # This template structures the input to the language model, integrating static and dynamic content.
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                system_prompt,
            ),
            MessagesPlaceholder(variable_name="messages"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ]
    )

    # Create an agent using the provided language model, tools, and the structured prompt.
    # This agent can interact with users, process input, and use tools based on the prompt template.
    agent = create_openai_tools_agent(llm, tools, prompt)

    # Initialize an AgentExecutor to manage and execute the agent's operations.
    # The executor facilitates the interaction between the agent and the tools, handling execution logic.
    executor = AgentExecutor(agent=agent, tools=tools)
    return executor

# Create Agent Supervisor


In [7]:
# Define a list of agent names that will be part of the supervisor system.
members = ["ENPKG_agent", "Sparql_query_runner"]

# Define the system prompt that outlines the role and responsibilities of the supervisor agent,
# including instructions on how to delegate tasks to specialized agents based on the user's question.
system_prompt = """You are a supervisor. As the supervisor, your primary role is to coordinate the flow of information between agents and ensure the appropriate processing of the user question based on its content. You have access to a team of specialized agents: {members}.

Here is a list of steps to help you accomplish your role:

Analyse the user question and delegate functions to the specialized agents below if needed:
If the question mentions any of the following entities: natural product compound, chemical name, taxon name, target, SMILES structure, or numerical value delegate the question to the ENPKG_agent. ENPKG_agent would provide resolved entities needed to generate SPARQL query. For example if the question mentions either caffeine, or Desmodium heterophyllum call ENPKG_agent.
If you have answers from the agent mentioned above, you provide those answers with the user question to the Sparql_query_runner.

If the question does not mention chemical name, taxon name, target name, nor SMILES structure, delegate the question to the agent Sparql_query_runner. The Sparql_query_runner agent will perform further processing and answer the question.

Once the Sparql_query_runner has completed its task and provided the answer, mark the process as FINISH. Do not call the Sparql_query_runner again.

For example, the user provides the following question: For features from Melochia umbellata in PI mode with SIRIUS annotations, get the ones for which a feature in NI mode with the same retention time has the same SIRIUS annotation. Since the question mentions Melochia umbellata you should firstly delegate it to the ENPKG_agent which would provide wikidata IRI with TAXON_RESOLVER tool, and then, you should delegate the question together with the output generated by ENPKG_agent to the Sparql_query_runner agent.

Avoid calling the same agent if this agent has already been called previously and provided the answer. For example, if you have called ENPKG_agent and it provided InChIKey for chemical compound do not call this agent again.

Collect the answer from Sparql_query_runner and provide the final assembled response back to the user.
Always tell the user the SPARQL query that has been returned by the Sparql_query_runner.

If the agent does not provide the expected output mark the process as FINISH.

Remember, your efficiency in routing the questions accurately and collecting responses is crucial for the seamless operation of our system. If you don't know the answer to any of the steps, please say explicitly and help the user by providing a query that you think will be better interpreted.
"""
# Function to create a team supervisor agent that routes tasks based on user questions.
def create_team_supervisor(llm: ChatOpenAI, system_prompt, members) -> str:
    """
    Configures and returns a supervisor agent setup with decision-making logic for task routing.
    
    The supervisor uses a provided language model (llm) to analyze user questions and decides whether to delegate
    the question to specialized agents (members), or to mark the process as finished based on predefined criteria.
    
    Parameters:
    - llm (ChatOpenAI): The language model to be used for processing and routing decisions.
    - system_prompt (str): A detailed prompt describing the supervisor's role and decision-making guidelines.
    - members (list): A list of specialized agents available for task delegation.
    
    Returns:
    - str: A configured prompt or agent setup that integrates routing logic for processing user questions.
    """
    options = ["FINISH"] + members
    function_def = {
        "name": "route",
        "description": "Select the next role.",
        "parameters": {
            "title": "routeSchema",
            "type": "object",
            "properties": {
                "next": {
                    "title": "Next",
                    "anyOf": [
                        {"enum": options},
                    ],
                },
            },
            "required": ["next"],
        },
    }
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            MessagesPlaceholder(variable_name="messages"),
            (
                "system",
                "Given the conversation above, who should act next?"
                " Or should we FINISH? Select one of: {options}",
            ),
        ]
    ).partial(options=str(options), members=", ".join(members))
    return (
        prompt
        | llm.bind_functions(functions=[function_def], function_call="route")
        | JsonOutputFunctionsParser()
    )

## Defining separate agents

In [8]:
# Define the system message for the entity resolution agent (resolver) responsible for processing user questions.
# This message includes instructions for the agent on how to handle different types of entities mentioned in questions.
system_message_resolver = """You are an entity resolution agent for the Sparql_query_runner.
You have access to the following tools:
{tool_names}
You should analyze the question and provide resolved entities to the supervisor. Here is a list of steps to help you accomplish your role:
If the question ask anything about any entities that could be natural product compound, find the relevant IRI to this chemical class using CHEMICAL_RESOLVER. Input is the chemical class name. For example, if salicin is mentioned in the question, provide its IRI using CHEMICAL_RESOLVER, input is salicin. 

If a taxon is mentionned, find what is its wikidata IRI with TAXON_RESOLVER. Input is the taxon name. For example, if the question mentions acer saccharum, you should provide it's wikidata IRI using TAXON_RESOLVER tool.")

If a target is mentionned, find the ChEMBLTarget IRI of the target with TARGET_RESOLVER. Input is the target name.

If a SMILE structure is mentionned, find what is the InChIKey notation of the molecule with SMILE_CONVERTER. Input is the SMILE structure. For example, if there is a string with similar structure to CCC12CCCN3C1C4(CC3) in the question, provide it to SMILE_CONVERTER.
        
Give me units relevant to numerical values in this question. Return nothing if units for value is not provided.
Be sure to say that these are the units of the quantities found in the knowledge graph.
Here is the list of units to find:
    "retention time": "minutes",
    "activity value": null, 
    "feature area": "absolute count or intensity", 
    "relative feature area": "normalized area in percentage", 
    "parent mass": "ppm (parts-per-million) for m/z",
    "mass difference": "delta m/z", 
    "cosine": "score from 0 to 1. 1 = identical spectra. 0 = completely different spectra"


 You are required to submit only the final answer to the supervisor.
        
""".format(tool_names="\n".join(tool_names))


# Create an agent for entity resolution based on the instructions provided in `system_message_resolver`.
enpkg_agent=create_agent(llm, tools_resolver, system_message_resolver)


# Create an agent for running SPARQL queries based on user requests and resolved entities provided by other agents.
sparql_query_agent = create_agent(llm, tool_sparql, "You are sparql query runner, you take as input the user request and resolved entities provided by other agents, generate a SPARQL query, run it on the knowledge graph and answer the question using SPARQL_QUERY_RUNNER tool. Provide the answer to the question to the supervisor. You are required to submit only the final answer to the supervisor. If you could not get the answer, provide the SPARQL query generated.")





# Construct Graph

In [9]:
# The agent state is the input to each node in the graph
class AgentState(TypedDict):
    # The annotation tells the graph that new messages will always
    # be added to the current states
    messages: Annotated[Sequence[BaseMessage], operator.add]
    # The 'next' field indicates where to route to next
    next: str

#function to define nodes
def agent_node(state, agent, name):
    result = agent.invoke(state)
    return {"messages": [HumanMessage(content=result["output"], name=name)]}

#creating nodes for each agent
enpkg_node= functools.partial(agent_node, agent=enpkg_agent, name="ENPKG_agent")
sparql_query_node= functools.partial(agent_node, agent=sparql_query_agent, name="Sparql_query_runner")
supervisor_agent=create_team_supervisor(llm_gpt4, 
                                        system_prompt, members)

#creating the workflow and adding nodes to it
workflow = StateGraph(AgentState)
workflow.add_node("ENPKG_agent", enpkg_node)
workflow.add_node("Sparql_query_runner",sparql_query_node)
workflow.add_node("supervisor", supervisor_agent)
#connect all the edges in the graph
for member in members:
    # We want our workers to ALWAYS "report back" to the supervisor when done
    workflow.add_edge(member, "supervisor")

workflow.add_conditional_edges("supervisor",
    lambda x: x["next"],
    {"ENPKG_agent": "ENPKG_agent", "Sparql_query_runner": "Sparql_query_runner",  "FINISH": END}
)

memory = SqliteSaver()
workflow.set_entry_point("supervisor")
app= workflow.compile(checkpointer=memory)

In [10]:
q1 = 'How many features (pos ionization and neg ionization modes) have the same SIRIUS/CSI:FingerID and ISDB annotation by comparing the InCHIKey of the annotations?'
q1_bis = 'How many features (pos ionization and neg ionization modes) have the same SIRIUS/CSI:FingerID and ISDB annotation by comparing the InCHIKey2D of the annotations?'
q2 = 'Which extracts have features (pos ionization mode) annotated as the class, aspidosperma-type alkaloids, by CANOPUS with a probability score above 0.5, ordered by the decresing count of features as aspidosperma-type alkaloids? Group by extract.'
q3 = 'Among the structural annotations from the Tabernaemontana coffeoides (Apocynaceae) seeds extract taxon , which ones contain an aspidospermidine substructure, CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45?'
q4 = 'Among the SIRIUS structural annotations from the Tabernaemontana coffeoides (Apocynaceae) seeds extract taxon, which ones are reported in the Tabernaemontana genus in Wikidata?'
q5 = 'Which compounds have annotations with chembl assay results indicating reported activity against T. cruzi by looking at the cosmic, zodiac and taxo scores?'
q5_bis = 'Which compounds have annotations with chembl assay results indicating reported activity against Trypanosoma cruzi by looking at the cosmic, zodiac and taxo scores?'
q6 = 'Filter the pos ionization mode features of the Melochia umbellata taxon annotated as [M+H]+ by SIRIUS to keep the ones for which a feature in neg ionization mode is detected with the same retention time (+/- 3 seconds) and a mass corresponding to the [M-H]- adduct (+/- 5ppm).'
q7 = 'For features from the Melochia umbellata taxon in pos ionization mode with SIRIUS annotations, get the ones for which a feature in neg ionization mode with the same retention time (+/- 3 seconds) has the same SIRIUS annotation by comparing the InCHIKey 2D. Return the features, retention times, and InChIKey2D'
q8 = "Which features were annotated as 'Tetraketide meroterpenoids' by SIRIUS, and how many such features were found for each species and plant part?"
q9 = "What are all distinct submitted taxons for the extracts in the knowledge graph?"
q10 = "What are the taxons, lab process and label (if one exists) for each sample? Sort by sample and then lab process"
q11 = "Count all the species per family in the collection"

q12 = "Taxons can be found in enpkg:LabExtract. Find the best URI of the Taxon in the context of this question : \n Among the structural annotations from the Tabernaemontana coffeoides (Apocynaceae) seeds extract taxon , which ones contain an aspidospermidine substructure, CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45?"
q13 = "Which compounds annotated in the active extract of Melochia umbellata have activity against Trypanosoma cruzi reported (in ChEMBL)?"

In [11]:
import requests

def test_sparql_endpoint(endpoint):
    """
    Tests the validity of a SPARQL endpoint by sending a simple ASK query.
    Parameters:
    - endpoint (str): The URL of the SPARQL endpoint to test.
    Returns:
    - bool: True if the endpoint is valid and responsive, False otherwise.
    """
    test_query = {"query": "ASK WHERE { ?s ?p ?o } LIMIT 1"}
    headers = {"Accept": "application/sparql-results+json"}
    try:
        response = requests.get(endpoint, params=test_query, headers=headers)
        response.raise_for_status()  # Ensures HTTP request was successful
        # Validate response format
        if response.json() is not None:
            return True
        else:
            print("The endpoint did not return a valid SPARQL result.")
            return False
            
    except requests.RequestException as e:
        print(f"Failed to connect to the endpoint: {e}")
        return False

In [12]:
result=test_sparql_endpoint(endpoint_url)
print(result)

True


In [13]:
for s in app.stream(
    {
        "messages": [
            HumanMessage(content=q3)
        ]
    },
    {"configurable": {"thread_id": "2"}}
):
    if "__end__" in s:
        print(s)
        print("----")

2024-03-07 15:14:29,771 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:14:32,534 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




[1m> Entering new ChemicalResolver chain...[0m
InChIKey is aspidospermidine: InChIKey=YAAIPCQYJYPITK-UHFFFAOYSA-N

[1m> Finished chain.[0m


2024-03-07 15:14:36,346 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:14:39,191 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:14:42,912 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:15:02,767 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:15:04,008 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




[1m> Entering new GraphSparqlQAChain chain...[0m


  warn_deprecated(
2024-03-07 15:15:18,482 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generated SPARQL:
[32;1m[1;3mPREFIX ns2: <https://enpkg.commons-lab.org/kg/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?annotation
WHERE {
    ?taxon ns2:has_wd_id <http://www.wikidata.org/entity/Q15376858> .
    ?taxon ns2:has_lab_process ?extract .
    ?extract ns2:has_sirius_annotation ?annotation .
    ?annotation ns2:has_InChIkey2D ?inchikey .
    ?inchikey ns2:has_smiles "CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45"^^xsd:string .
}[0m

[1m> Finished chain.[0m


2024-03-07 15:15:19,776 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:15:23,072 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:15:24,406 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




[1m> Entering new GraphSparqlQAChain chain...[0m


2024-03-07 15:15:37,458 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generated SPARQL:
[32;1m[1;3mPREFIX ns2: <https://enpkg.commons-lab.org/kg/>
PREFIX wd: <http://www.wikidata.org/entity/>
SELECT ?annotation
WHERE {
    ?extract ns2:has_wd_id wd:Q15376858 .
    ?extract ns2:has_sirius_annotation ?annotation .
    ?annotation ns2:has_InChIkey2D ?inchikey .
    ?inchikey ns2:has_smiles "CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45" .
}[0m

[1m> Finished chain.[0m


2024-03-07 15:15:38,796 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:15:42,001 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:15:42,704 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:15:45,421 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:15:46,152 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-07 15:15:52,798 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'__end__': {'messages': [HumanMessage(content='Among the structural annotations from the Tabernaemontana coffeoides (Apocynaceae) seeds extract taxon , which ones contain an aspidospermidine substructure, CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45?'), HumanMessage(content='The structural annotations from the *Tabernaemontana coffeoides* (Apocynaceae) seeds extract that contain an aspidospermidine substructure, with the SMILES structure `CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45`, have been identified. The InChIKey for aspidospermidine is `YAAIPCQYJYPITK-UHFFFAOYSA-N`, and the taxon *Tabernaemontana coffeoides* is represented by the wikidata IRI http://www.wikidata.org/entity/Q15376858.', name='ENPKG_agent'), HumanMessage(content='There are no structural annotations from the *Tabernaemontana coffeoides* (Apocynaceae) seeds extract that contain an aspidospermidine substructure with the specified SMILES structure.', name='Sparql_query_runner'), HumanMessage(content='There are no structural anno

In [23]:
def process_stream(app, q2):
    try:
        # Iterate over the stream from app.stream()
        for s in app.stream(
            {
                "messages": [
                    HumanMessage(content=q2)  # Assuming q2 is the content of the message
                ]
            },
            {"recursion_limit": 100},  # Additional options for the stream
        ):
            # Check if "__end__" is not in the stream output
            if "__end__" not in s:
                print(s)  # Print the stream output
                print("----")  # Print the delimiter
    except Exception as e:
        print(f"An error occurred: {e}")

In [24]:
process_stream(app, q2)

2024-02-28 09:49:23,336 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'supervisor': {'next': 'ENPKG_agent'}}
----


2024-02-28 09:49:25,180 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




[1m> Entering new ChemicalResolver chain...[0m
InChIKey not found, trying NPC Classifier


2024-02-28 09:49:28,763 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-02-28 09:49:31,836 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-02-28 09:49:34,552 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-02-28 09:49:38,081 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



[1m> Finished chain.[0m


2024-02-28 09:49:38,902 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'ENPKG_agent': {'messages': [HumanMessage(content='The class "aspidosperma-type alkaloids" is annotated with the IRI: https://enpkg.commons-lab.org/kg/npc_Aspidosperma_type, under the category NPCClass.', name='ENPKG_agent')]}}
----


2024-02-28 09:49:42,485 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'supervisor': {'next': 'Sparql_query_runner'}}
----


2024-02-28 09:49:45,974 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




[1m> Entering new GraphSparqlQAChain chain...[0m


2024-02-28 09:50:07,776 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generated SPARQL:
[32;1m[1;3mPREFIX ns1: <https://enpkg.commons-lab.org/kg/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?extract (COUNT(?feature) AS ?countFeatures)
WHERE {
    ?feature ns1:has_canopus_annotation ?annotation .
    ?annotation ns1:has_canopus_npc_class <https://enpkg.commons-lab.org/kg/npc_Aspidosperma_type> .
    ?annotation ns1:has_canopus_npc_class_prob ?prob .
    FILTER(?prob > 0.5) .
    ?feature ns1:has_ionization "pos" .
    ?extract ns1:has_lcms_feature_list ?featureList .
    ?featureList ns1:has_lcms_feature ?feature .
}
GROUP BY ?extract
ORDER BY DESC(?countFeatures)[0m

[1m> Finished chain.[0m


2024-02-28 09:50:09,824 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'Sparql_query_runner': {'messages': [HumanMessage(content='The extracts with features annotated as the class "aspidosperma-type alkaloids" by CANOPUS with a probability score above 0.5, ordered by the decreasing count of features as aspidosperma-type alkaloids, are as follows:\n\n1. [VGF152_B02_pos.mzXML](https://enpkg.commons-lab.org/kg/VGF152_B02_pos.mzXML) with 74 features.\n2. [VGF157_D02_pos.mzXML](https://enpkg.commons-lab.org/kg/VGF157_D02_pos.mzXML) with 11 features.\n3. [VGF147_B11_pos.mzXML](https://enpkg.commons-lab.org/kg/VGF147_B11_pos.mzXML) with 10 features.\n4. [VGF153_C03_pos.mzXML](https://enpkg.commons-lab.org/kg/VGF153_C03_pos.mzXML) with 7 features.\n5. [VGF157_E02_pos.mzXML](https://enpkg.commons-lab.org/kg/VGF157_E02_pos.mzXML) with 2 features.\n6. [VGF154_D02_pos.mzXML](https://enpkg.commons-lab.org/kg/VGF154_D02_pos.mzXML) with 2 features.\n7. [VGF147_A10_pos.mzXML](https://enpkg.commons-lab.org/kg/VGF147_A10_pos.mzXML) with 2 features.\n8. [VGF140_F02_pos.mzX

2024-02-28 09:50:28,666 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'supervisor': {'next': 'FINISH'}}
----


In [15]:
def get_response_from_langchain(app, question):
    final_response = ""  # Initialize an empty string to hold the final response
    for s in app.stream(
        {
            "messages": [
                HumanMessage(content=question)
            ]
        },
        {"recursion_limit": 100},
    ):
        if "__end__" not in s:
            # Assuming 's' contains a response you want to display,
            # you can concatenate or process it as needed.
            # This example assumes 's' is a dictionary with a 'text' key containing the response.
            # Adjust the key according to your actual response structure.
            final_response += s.get('text', '') + "\n"
        else:
            break  # Exit the loop if an end condition is met
    return final_response.strip()  # Return the final response, removing any trailing newline

In [16]:
result=get_response_from_langchain(app, q2)


2024-02-26 15:41:18,222 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-26 15:41:20,095 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




[1m> Entering new ChemicalResolver chain...[0m
InChIKey not found, trying NPC Classifier


2024-02-26 15:41:21,743 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-02-26 15:41:24,382 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



[1m> Finished chain.[0m


2024-02-26 15:41:25,376 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-26 15:41:29,297 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-26 15:41:30,833 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




[1m> Entering new GraphSparqlQAChain chain...[0m


2024-02-26 15:41:52,952 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generated SPARQL:
[32;1m[1;3mPREFIX ns1: <https://enpkg.commons-lab.org/kg/>
PREFIX ns2: <https://enpkg.commons-lab.org/module/>
SELECT ?extract (COUNT(?feature) AS ?countFeatures)
WHERE {
  ?feature ns1:has_canopus_annotation ?annotation .
  ?annotation ns1:has_canopus_npc_class <https://enpkg.commons-lab.org/kg/npc_Aspidosperma_type> .
  ?annotation ns1:has_canopus_npc_class_prob ?prob .
  FILTER(?prob > 0.5) .
  ?feature ns1:has_ionization "pos" .
  ?extract ns1:has_lcms_feature_list ?featureList .
  ?featureList ns1:has_lcms_feature ?feature .
}
GROUP BY ?extract
ORDER BY DESC(?countFeatures)[0m

[1m> Finished chain.[0m


2024-02-26 15:41:54,694 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-26 15:43:26,955 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [17]:
print(result)




In [36]:
def process_stream(app, q2):
    results = []  # Initialize an empty list to store results
    try:
        # Iterate over the stream from app.stream()
        for s in app.stream(
            {
                "messages": [
                    HumanMessage(content=q2)  # Assuming q2 is the content of the message
                ]
            },
            {"recursion_limit": 100},  # Additional options for the stream
        ):
            # Check if "__end__" is not in the stream output
            if "__end__" not in s:
                results.append(s)  # Append the stream output to results list instead of printing
    except Exception as e:
        print(f"An error occurred: {e}")
    return results  # Return the list of results

In [37]:
result=process_stream(app, q13)
print(result)

2024-02-27 11:51:31,391 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:51:34,145 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:51:36,606 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:51:39,806 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:51:41,640 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




[1m> Entering new GraphSparqlQAChain chain...[0m


2024-02-27 11:51:55,147 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generated SPARQL:
[32;1m[1;3mPREFIX ns1: <https://enpkg.commons-lab.org/module/>
PREFIX ns2: <https://enpkg.commons-lab.org/kg/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?compound
WHERE {
  ?rawMaterial ns2:has_wd_id <http://www.wikidata.org/entity/Q6813281> .
  ?rawMaterial ns2:has_lab_process ?labExtract .
  ?labExtract ns1:has_bioassay_results ?bioAssayResults .
  ?bioAssayResults ns1:target_id ?target .
  ?target ns1:target_name "Trypanosoma cruzi" .
  ?bioAssayResults ns1:inhibition_percentage ?inhibition .
  FILTER(?inhibition > 0)
  ?labExtract ns2:has_sirius_annotation ?annotation .
  ?annotation ns2:has_InChIkey2D ?InChIkey2D .
  ?InChIkey2D ns2:is_InChIkey2D_of ?compound .
}[0m

[1m> Finished chain.[0m


2024-02-27 11:51:56,273 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:51:58,730 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[{'supervisor': {'next': 'ENPKG_agent'}}, {'ENPKG_agent': {'messages': [HumanMessage(content='The compounds annotated in the active extract of Melochia umbellata (http://www.wikidata.org/entity/Q6813281) with activity against Trypanosoma cruzi (https://www.ebi.ac.uk/chembl/target_report_card/CHEMBL368) reported in ChEMBL are being queried.', name='ENPKG_agent')]}}, {'supervisor': {'next': 'Sparql_query_runner'}}, {'Sparql_query_runner': {'messages': [HumanMessage(content='There are no compounds annotated in the active extract of Melochia umbellata with activity against Trypanosoma cruzi reported in ChEMBL.', name='Sparql_query_runner')]}}, {'supervisor': {'next': 'FINISH'}}]


In [24]:
result=process_stream(app, q5_bis)
print(result)

2024-02-27 11:42:53,559 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:42:55,259 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:42:56,836 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:43:00,032 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:43:01,740 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:43:04,709 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:43:07,781 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




[1m> Entering new GraphSparqlQAChain chain...[0m


  warn_deprecated(
2024-02-27 11:43:25,522 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generated SPARQL:
[32;1m[1;3mPREFIX ns1: <https://enpkg.commons-lab.org/kg/>
PREFIX ns2: <https://enpkg.commons-lab.org/module/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?compound
WHERE {
  ?assayResults ns2:target_id <https://www.ebi.ac.uk/chembl/target_report_card/CHEMBL368> ;
                ns2:activity_value ?activityValue .
  ?annotation ns1:has_cosmic_score ?cosmicScore ;
              ns1:has_zodiac_score ?zodiacScore ;
              ns1:has_taxo_score ?taxoScore ;
              ns1:has_InChIkey2D ?inChIkey2D .
  ?compound ns1:has_sirius_annotation ?annotation .
}[0m

[1m> Finished chain.[0m


2024-02-27 11:43:27,235 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-27 11:43:29,183 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[{'supervisor': {'next': 'ENPKG_agent'}}, {'ENPKG_agent': {'messages': [HumanMessage(content='The compounds that have annotations with ChEMBL assay results indicating reported activity against Trypanosoma cruzi can be identified by looking at the cosmic, zodiac, and taxo scores, with the relevant ChEMBL Target IRI for Trypanosoma cruzi being https://www.ebi.ac.uk/chembl/target_report_card/CHEMBL368.', name='ENPKG_agent')]}}, {'supervisor': {'next': 'ENPKG_agent'}}, {'ENPKG_agent': {'messages': [HumanMessage(content='The relevant ChEMBL Target IRI for Trypanosoma cruzi is https://www.ebi.ac.uk/chembl/target_report_card/CHEMBL368.', name='ENPKG_agent')]}}, {'supervisor': {'next': 'Sparql_query_runner'}}, {'Sparql_query_runner': {'messages': [HumanMessage(content='There are no compounds found that have annotations with ChEMBL assay results indicating reported activity against Trypanosoma cruzi by looking at the cosmic, zodiac, and taxo scores.', name='Sparql_query_runner')]}}, {'superviso

In [50]:
def process_stream(question):
    """
    Iterates over messages from app.stream(), printing each message until an "__end__" flag is encountered.

    :param question: The question to be sent as part of the initial stream request.
    """
    # Define the parameters for app.stream() as described
    stream_params = {
        "messages": [{"content": question}],
    }
    stream_options = {"recursion_limit": 100}

    try:
        for s in app.stream(stream_params, stream_options):
            if "__end__" not in s:
                print(s)
                print("----")
            else:
                break  # Exit the loop if "__end__" is encountered
    except Exception as e:
        print(f"An error occurred during streaming: {e}")

In [51]:
result=process_stream(q2)

An error occurred during streaming: Message dict must contain 'role' and 'content' keys, got {'content': 'Which extracts have features (pos ionization mode) annotated as the class, aspidosperma-type alkaloids, by CANOPUS with a probability score above 0.5, ordered by the decresing count of features as aspidosperma-type alkaloids? Group by extract.'}


In [14]:
for s in app.stream(
    {
        "messages": [
            HumanMessage(content=q1_bis)
        ]
    },
    {"recursion_limit": 100},
):
    if "__end__" not in s:
        print(s)
        print("----")

2024-02-26 10:07:39,506 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'supervisor': {'next': 'Sparql_query_runner'}}
----


2024-02-26 10:07:41,303 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




[1m> Entering new GraphSparqlQAChain chain...[0m


  warn_deprecated(
2024-02-26 10:08:19,979 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generated SPARQL:
[32;1m[1;3mPREFIX ns1: <https://enpkg.commons-lab.org/kg/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT (COUNT(DISTINCT ?feature) AS ?numberOfFeatures)
WHERE {
  ?featurePos ns1:has_ionization "pos" ;
              ns1:has_sirius_annotation ?siriusAnnotationPos ;
              ns1:has_isdb_annotation ?isdbAnnotationPos .
  ?siriusAnnotationPos ns1:has_InChIkey2D ?inChIkey2DPos .
  ?isdbAnnotationPos ns1:has_InChIkey2D ?inChIkey2DPos .

  ?featureNeg ns1:has_ionization "neg" ;
              ns1:has_sirius_annotation ?siriusAnnotationNeg ;
              ns1:has_isdb_annotation ?isdbAnnotationNeg .
  ?siriusAnnotationNeg ns1:has_InChIkey2D ?inChIkey2DNeg .
  ?isdbAnnotationNeg ns1:has_InChIkey2D ?inChIkey2DNeg .

  FILTER(?inChIkey2DPos = ?inChIkey2DNeg)
  BIND(CONCAT(STR(?featurePos), STR(?featureNeg)) AS ?feature)
}[0m


KeyboardInterrupt: 

In [None]:
agent_executor.invoke({"input": q8})

In [None]:
agent_executor.invoke({"input": q9})

In [None]:
agent_executor.invoke({"input": q10})

In [None]:
agent_executor.invoke({"input": q11})

In [None]:
agent_executor.invoke({"input": q12})