# ENPKG Notebook
### Workbook used while designing and debugging KGAI. No longer up to date

In [1]:
# general python libs
import json
import os

# langchain
import langchain
from langchain import LLMChain
from langchain.prompts.prompt import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool, AgentType, initialize_agent
from langchain.callbacks import get_openai_callback
from langchain.memory import ConversationBufferMemory
from langchain.chains.prompt_selector import ConditionalPromptSelector


In [2]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [6]:
import importlib
import tools_script
importlib.reload(tools_script)

from tools_script import RunSparql, QueryTool, nested_value, split_data, DBRetriever, SimpleAgent, count_tokens
import prompts
importlib.reload(prompts)
import prompts

import enpkg_agent
importlib.reload(enpkg_agent)
from enpkg_agent import run_agent

import enpkg_tools
importlib.reload(enpkg_tools)
from enpkg_tools import make_tools


In [99]:
endpoint_url = 'http://Emmas-MacBook-Pro-2019.local:7200/repositories/ENPKG_local'

# Tools

## Run SPARQL Tool

In [128]:
# Set the endpoint URLs for the SPARQL queries
endpoint_url = "https://enpkg.commons-lab.org/graphdb/repositories/ENPKG"

# Set the tool name and description for the SparqlQueryRunner
tool_name = "SparqlQueryRunner"
tool_desc = "Useful to run Sparql queries after keywords have been extracted and identifiers have been found."

# Create an instance of the RunSparql class
run_sparql_query = RunSparql(endpoint_url, tool_name, tool_desc)

# Create the tool using the specified endpoint URL, tool name, and description
run_sparql_query.make_tool()


## Taxon Tool

In [45]:
# Set the SPARQL query template for retrieving taxon information
taxon_template = """
PREFIX enpkg: <https://enpkg.commons-lab.org/kg/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX enpkgmodule: <https://enpkg.commons-lab.org/module/>
select * where {{ 
    ?extract rdf:type enpkg:LabExtract .
    ?process enpkg:has_lab_process ?extract ; 
             enpkg:submitted_taxon '{taxon}' ;
             enpkgmodule:has_broad_organe ?broad_organe ;
             enpkgmodule:has_organe ?organe ;
             enpkgmodule:has_subsystem ?subsystem ;
             enpkgmodule:has_tissue ?tissue ;
}} 
"""

# Set the tool name and description for the TaxonLookup tool
taxon_tool_name = 'TaxonLookup'
taxon_tool_desc = """Use tool when need to get extract ID for a species or taxon. 
                    Input should be taxon name.
                    Returns dictionary of possible extract URIs for the taxon of interest with other information."""

# Define a function to format the output data from the SPARQL query
def taxon_format(output):
    formatted_data = {}
    for item in output:
        entity_id = item['extract']['value']
        formatted_data[entity_id] = {x: item[x]['value'].split('/')[-1] for x in item if x != 'extract'}
        formatted_data[entity_id]['Type'] = 'https://enpkg.commons-lab.org/kg/LabExtract'

    if formatted_data == {}:
        return "No taxons were found. Either the specific taxon doesn't exist in the knowledge graph or the question is asking about all taxons. Either way tell the agent to precede."
    return formatted_data

# Create an instance of the QueryTool class for the TaxonLookup tool
taxon_class = QueryTool(taxon_template, taxon_tool_desc, taxon_tool_name, taxon_format, endpoint_url)

# Create the TaxonLookup tool using the specified template, name, description, formatting function, and endpoint URL
taxon_class.make_tool()

# Set the SPARQL query template for finding the best URI using available tools
taxon_agent_template = """
Find the best uri for {input} using the tools available: \
1. TaxonLookup

Return the best uri in context of this question ({question}) and say that it is an instance of the class – https://enpkg.commons-lab.org/kg/LabExtract.
"""
#the type (https://enpkg.commons-lab.org/kg/LabExtract)

# Create the TaxonTool using the specified template, list of tools (in this case only TaxonLookup), and name
taxon_agent_desc = 'Use when you need to know the uri for a taxon. Return best extract URI in the context of the question '
taxon_agent = SimpleAgent(taxon_agent_template.format(input="{input}", question="{question}"), 'TaxonTool', taxon_agent_desc, [taxon_class.tool],parser = 'keyword_question', model = 'gpt-4')
taxon_agent.make_tool()



## Chemical Class Tool

In [118]:
# Sorting function
def find_noids_word(s):
    words = s.split('_')
    for word in words:
        if "noids" in word or "loids" in word:
            return word.lower()
    return s.lower()

# Define a function to parse a file containing JSON data and extract values from nested keys
def class_parser(filepath):
    # Open the file and load the JSON data
    with open(filepath, 'r') as f:
        data = json.load(f)

    def remove_prefix(uri, prefix='https://enpkg.commons-lab.org/kg/'):
        return uri.split(prefix, 1)[-1]
    
    # Extract the values from the nested keys 'class' and 'value'
    class_list = [remove_prefix(nested_value(x, ['class', 'value'])) for x in data['results']['bindings']]
    class_list.sort(key=find_noids_word)
    return class_list #'enpkg:'+

# Set the template for the class tool
class_template = """
Give me npc_class uris relevant to this chemical class: {question}. Return nothing if the question don't ask anything about
any entities that could be classes. Return the uri + the prefix: 'enpkg:'
"""

# Create a PromptTemplate instance for the class tool, specifying input variables and the template
class_prompt = PromptTemplate(
    input_variables=['question'], #'input',
    template=class_template
)

# Set the description and name for the class tool
class_tool_desc = 'Use to retrieve URIs for chemical classes mentioned in the question.'
class_tool_name = 'ClassTool'

# Set the filepath of the JSON file containing the class data
class_filepath = './local_files/npc_classes.json'


In [15]:
# get embeddings for chemical class doc
ClassDB = split_data(class_parser(class_filepath), chunk_size=500)

Split the documents
You have 20 document(s) in your data


In [119]:
# Create an instance of the db_class using the provided parameters
NPCClass = DBRetriever(class_prompt, class_filepath, class_tool_name, class_tool_desc, class_parser)

# Load embeddings for the NPCClass instance using the provided ClassDB
NPCClass.load_embeddings(ClassDB)

# Make the tool ready for use by calling the make_tool() method on the NPCClass instance
NPCClass.make_tool(docs=6)


## Target Name Tool

In [17]:
target_prompt_template = """For this target: {input}, what is the most relevant target name? 
'Leishmania donovani', 'Trypanosoma cruzi', 'Trypanosoma brucei rhodesiense'
Answer with either one of those 3 options or say there is no relevant target name
""" 

target_prompt = PromptTemplate(input_variables=["input"],
                               template=target_prompt_template)

target_agent_desc = 'Use to get the correct string for the target name to use in SPARQL query'

target_agent = SimpleAgent(target_prompt_template, 'TargetTool', target_agent_desc, [], model='gpt-3.5-turbo')
target_agent.make_tool()


## Structure Tool

In [25]:
# Set the SPARQL query template for subquering structures
structure_template = """
PREFIX idsm: <https://idsm.elixir-czech.cz/sparql/endpoint/>
PREFIX sachem: <http://bioinfo.uochb.cas.cz/rdf/v1.0/sachem#>

?ik enpkg:has_wd_id ?wd_id .
    SERVICE idsm:wikidata {{
        VALUES ?SUBSTRUCTURE {{"{input}"}} 
        ?wd_id sachem:substructureSearch _:b16.
        _:b16 sachem:query ?SUBSTRUCTURE.
    }}
"""


# Create sparql template format tool
def format_structure_template(smiles):
    return structure_template.format(input=smiles)

format_structure_tool = Tool(name = 'StructureQueryTool',
                             description= 'Format the structure sparql template',
                             func = format_structure_template)

structure_agent_template = """
Instructions:
1. Check if input is a SMILES chemical structure. If yes continue, else say that the input is not a SMILES so need to ask user for the SMILES.
2. Run the tool StructureQueryTool with the input.
3. Return SPARQL subquery. 
Input: {input}
"""

# Create the TaxonTool using the specified template, list of tools (in this case only TaxonLookup), and name
structure_agent_desc = 'Use when you need to get the sparql subquery to retrieve structures.'
structure_agent = SimpleAgent(structure_agent_template, 'StructureTool', structure_agent_desc, [format_structure_tool])
structure_agent.make_tool()



## Tool: Unit Tool

In [116]:
# Define a function to parse a file containing JSON data and extract values from nested keys
def unit_parser(filepath):
    # Open the file and load the JSON data
    with open(filepath, 'r') as f:
        data = json.load(f)

    return data


# Set the template for the unit tool
unit_template = """
Give me units relevant to numerical values in this question: {question}. Return nothing if units for value is not provided.
"""

# Create a PromptTemplate instance for the unit tool, specifying input variables and the template
unit_prompt = PromptTemplate(
    input_variables=['question'], 
    template=unit_template
)

# Set the description and name for the unit tool
unit_tool_desc = 'Use to retrieve units if question mentions a numerical measure.'
unit_tool_name = 'UnitTool'

# Set the filepath of the JSON file containing the unit data
unit_filepath = './local_files/ENPKG_units.json'


In [72]:
UnitDB = split_data(unit_parser(unit_filepath), chunk_size=500)

Split the documents
You have 1 document(s) in your data


In [117]:
# Create an instance of the db_class using the provided parameters
Unit = DBRetriever(unit_prompt, unit_filepath, unit_tool_name, unit_tool_desc, unit_parser)

# Load embeddings for the NPCClass instance using the provided ClassDB
Unit.load_embeddings(UnitDB)

# Make the tool ready for use by calling the make_tool() method on the NPCClass instance
Unit.make_tool(docs=1)


## Tool: Identifier Lookup (NO LONGER USED)

In [6]:
def identifier_lookup(search, entity_type = "entity",
                 limit = 50, 
                 info_type = ['label', 'description'] # information to include with id
                ):    

    search = f"'{search}'"
    SPARQL_template = """
    SELECT DISTINCT ?entity ?entityLabel ?entityDescription
    WHERE {{
    {{
        ?entity rdfs:label {searchstr}@en .
        ?entity rdfs:label ?entityLabel
        OPTIONAL {{
          ?entity rdfs:comment ?entityDescription .
        }}
      }}
    }}
    """ 
    
    try:
        output = run_sparql(SPARQL_template.format(searchstr=search))


        formatted_data = {}

        for item in output:
            entity_id = item['entity']['value'].split('/')[-1]
            label = item['entityLabel']['value']
            description = item.get('entityDescription', {}).get('value', '')
            item_type = 'property' if entity_id[0] == 'P' else 'entity'
            formatted_data[entity_id] = {'label': label, 'description': description}
        return formatted_data
    except:
        return "Sorry, I got an error. Please try again."

In [328]:
item_template = """
Find the best uri for {input} using the tools available: \
1. ItemLookup

Return the best uri with the label and description in context of this question: {question}
"""

property_template = """
Find the best p-number for {input} using the tools available: \
1. PropertyLookup

Return the best p-number with the label and description in context of this question: {question}. 

Note: don't answer the question.
"""

item_tool = [Tool(
    name = 'ItemLookup',
    description = 'Useful when you need to know the uri for an item in order to generate a sparql request. \
        Provides a list of possible identifiers.',
    func = lambda item: identifier_lookup(item, entity_type='entity')
)]
    
property_tool = [Tool(
    name = 'PropertyLookup',
    description = 'Useful when you need to know the p-number for a property in order to generate a sparql request. \
        Provides a list of possible p-numbers.',
    func = lambda item: identifier_lookup(item, entity_type='property')
)]

def make_tool(template, tools, tool_name):
    def parsing(string):
        split = string.split(":")
        keyword, question = split[0], ':'.join(split[1:])
        id_prompt = PromptTemplate(
            input_variables=['input', 'question'],
            template = template
        )
        llm = ChatOpenAI(temperature=0.8, model="gpt-4", verbose=True)
        llm_chain = LLMChain(prompt=id_prompt, llm=llm, verbose = True)
        id_agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose = True)

        return id_agent.run(id_prompt.format(input=keyword,question=question))
    
    tool_desc = 'Use when you need to know the uri for a taxon. \
        Return best extract URI in the context of the question '
    
    id_tool = Tool(
        name = tool_name,
        description = tool_desc,
        func = parsing
    )
    
    return id_tool

    


In [14]:
all_id_tools = make_tool(item_template, item_tool+property_tool, 'ItemTool')

## Tool : load schema (NO LONGER USED)

In [89]:
def remove_prefix(uri, prefix='https://enpkg.commons-lab.org/'):
    return uri.split(prefix, 1)[-1]

def condense_parse(entities):

    with open(entities, 'r') as e:
        entity_dict = json.load(e)
    parsed = {x:{y:entity_dict[x][y][0]['value'] for y in entity_dict[x]} for x in entity_dict}

    with open('./chemistry_schema/enpkg_predicates_clean.json', 'r') as p:
        pred_dict = json.load(p)

    new_dict = {}

    for key, value in parsed.items():
        new_value = {}
        try:
            if value.get('http://www.w3.org/1999/02/22-rdf-syntax-ns#type') == 'http://www.w3.org/2000/01/rdf-schema#Class':
                new_value['Type'] = 'Entity'
            else:
                new_value['Type'] = value.get('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
            new_value['Label'] = value.get('http://www.w3.org/2000/01/rdf-schema#label')
            new_value['Description'] = value.get('http://www.w3.org/2000/01/rdf-schema#comment')
        except:
            continue
        new_dict[remove_prefix(key)] = new_value

    new_dict = {x:y for x,y in new_dict.items() if y['Type']=='Entity'}

    for key, value in pred_dict.items():
        for pred in value:
            if 'https://enpkg.commons-lab.org/' in pred:
                new_dict[remove_prefix(pred)] = {'Type':'Predicate'}

    return new_dict

def condense_parse_local(entities):
    with open(entities, 'r') as e:
        entity_dict = json.load(e)['results']['bindings']
    parsed = {nested_value(x, ['o','value']):{'label': nested_value(x, ['label', 'value']), 'comment': nested_value(x, ['comment', 'value'])} for x in entity_dict}
    with open('./chemistry_schema/enpkg_local_predicates.json', 'r') as p:
        pred_dict = json.load(p)

    new_dict = {}

    for key, value in parsed.items():
        new_value = {}
        new_value['Type'] = 'Entity'
        try:
            new_value['Label'] = value.get('label')
            new_value['Description'] = value.get('comment')
        except:
            continue

        new_dict[remove_prefix(key)] = new_value


    new_dict = {x:y for x,y in new_dict.items() if y['Type']=='Entity'}

    for key, value in pred_dict.items():
        for pred in value:
            if 'https://enpkg.commons-lab.org/' in pred:
                new_dict[remove_prefix(pred)] = {'Type':'Predicate'}

    return new_dict

In [91]:
CondenseDB = split_data(condense_parse_local(schema_filepath))

Split the documents
You have 7 document(s) in your data


In [511]:
CondenseChunkDB = split_data(condense_parse_local(schema_filepath), chunk_size=100, chunk_overlap=10)

Split the documents
You have 15 document(s) in your data


In [537]:
CondenseClass = DBRetriever(schema_prompt, schema_filepath, schema_tool_name, schema_tool_desc, condense_parse)
CondenseClass.load_embeddings(CondenseDB)
CondenseClass.make_tool()

In [538]:
CondenseChunkClass = DBRetriever(schema_prompt, schema_filepath, schema_tool_name, schema_tool_desc, condense_parse)
CondenseChunkClass.load_embeddings(CondenseChunkDB)
CondenseChunkClass.make_tool(docs = 7)

In [63]:
def schema_parser(entities):
    with open(entities, 'r') as e:
        entity_dict = json.load(e)['results']['bindings']
    #parsed = {x:{y:entity_dict[x][y][0]['value'] for y in entity_dict[x]} for x in entity_dict}
    
    parsed = {nested_value(x, ['o','value']):{'label': nested_value(x, ['label', 'value']), 'comment': nested_value(x, ['comment', 'value'])} for x in entity_dict}
    with open('./chemistry_schema/enpkg_local_predicates.json', 'r') as p:
        pred_dict = json.load(p)


    new_dict = {}

    for key, value in parsed.items():
        new_value = {}
        try:
            if value.get('http://www.w3.org/1999/02/22-rdf-syntax-ns#type') == 'http://www.w3.org/2000/01/rdf-schema#Class':
                new_value['Type'] = 'Entity'
            else:
                new_value['Type'] = value.get('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
            new_value['Label'] = value.get('http://www.w3.org/2000/01/rdf-schema#label')
            new_value['Description'] = value.get('http://www.w3.org/2000/01/rdf-schema#comment')
        except:
            continue
        new_dict[remove_prefix(key)] = new_value

    new_dict = {x:y for x,y in new_dict.items() if y['Type']=='Entity'}

    for key, value in pred_dict.items():
        for pred in value:
            if 'https://enpkg.commons-lab.org/' in pred:
                new_pred = remove_prefix(pred)
                try:
                    new_dict[new_pred]['Property of Classes'].append(remove_prefix(key))
                except:
                    new_dict[new_pred] = {'Type':'Predicate', 'Property of Classes':[remove_prefix(key)]}
        
        try:
            new_dict[remove_prefix(key)]['Has predicates'] = [remove_prefix(pred) for pred in value]
        except:
            pass
    return new_dict

schema_template = """
Give me a list possible uris for entities AND predicates (not just predicates) relevant to this question: {question}. Only include if relevant.
The format of the document is a URI followed by a dictionary of information (the type, label, description)
"""

schema_prompt = PromptTemplate(
    input_variables=['question'], #'input', 
    template = schema_template
)

schema_tool_desc = 'Use when you need to know the relevant URIs for a question. \
        Example input: "what is the age of Obama?" '
schema_tool_name = 'IDTool'
schema_filepath = './chemistry_schema/enpkg_local_schema.json' #change for non-local version

In [522]:
LongDB = split_data(schema_parser(schema_filepath))

Split the documents
You have 7 document(s) in your data


In [523]:
LongChunkDB = split_data(schema_parser(schema_filepath), chunk_size=100, chunk_overlap=10)

Split the documents
You have 36 document(s) in your data


In [535]:
SchemaClass = DBRetriever(schema_prompt, schema_filepath, schema_tool_name, schema_tool_desc, schema_parser)
SchemaClass.load_embeddings(LongDB)
SchemaClass.make_tool()


In [536]:
SchemaChunkClass = DBRetriever(schema_prompt, schema_filepath, schema_tool_name, schema_tool_desc, schema_parser)
SchemaChunkClass.load_embeddings(LongChunkDB)
SchemaChunkClass.make_tool(docs=15)

In [526]:
import time

def get_uris(question, Class, doc_list):
    print(question)
    for d in doc_list:
        start_time = time.time()

        Class.make_tool(docs=d)

        print('Results for get documents number:', d)
        print(Class.retriever.run(schema_prompt.format(question=question)))
        end_time = time.time()
        execution_time = end_time - start_time
        print('Execution time:', execution_time, 'seconds')
        print('')


def get_uris2(question, Class_list, doc_list, filename):
    with open(filename, 'a') as f:
        f.write(question + '\n')
        for Class, doc in zip(Class_list, doc_list):
            start_time = time.time()

            Class.make_tool(docs=doc)

            f.write('Results for class with doc number: {}\n'.format(doc))
            f.write(str(Class.retriever.run(schema_prompt.format(question=question))) + '\n')
            end_time = time.time()
            execution_time = end_time - start_time
            f.write('Execution time: {} seconds\n'.format(execution_time))
            f.write('\n')



In [None]:
for q in [q1, q2, q3, q4, q5, q6, q7, q8]:
    get_uris(q, SchemaClass, [2,4,7])

In [690]:
for q in [q1, q2, q3, q4, q5, q6, q7, q8]:
    get_uris(q, CondenseChunkClass, [2,4,7,10])

## Tool: Schema Retriever

In [120]:
def retriever_schema(arg):
    with open('./local_files/merged.ttl', 'r') as file:
        ttl_schema = file.read()
    return ttl_schema

SchemaRetrieverTool = Tool(name = 'SchemaRetrieverTool',
                           description = 'Useful to get all triples of knowledge graph schema',
                           func = retriever_schema)

# Agent

## Prompt Selector

In [114]:
def asks_structure(question) -> bool:
    return 'structure' in question.lower() 

prompt_selector = ConditionalPromptSelector(
    default_prompt=prompts.default_prompt2, conditionals=[(lambda x: False, prompts.structure_prompt)]
)

def return_prompt(question, schema=ttl_schema):
    return prompt_selector.get_prompt(question).format(question = question) #, schema=schema

def run_agent_simple(question, schema=ttl_schema):
    prompts_dict = {'default':prompts.default_prompt2, 'structure':prompts.structure_prompt}
    if asks_structure(question):
        prompt = prompts_dict['structure']
        tools = [run_sparql_query.tool, taxon_agent.tool, target_agent.tool, structure_agent.tool, NPCClass.tool, SchemaRetrieverTool, Unit.tool]
    else:
        prompt = prompts_dict['default']
        tools = [run_sparql_query.tool, taxon_agent.tool, target_agent.tool, NPCClass.tool, SchemaRetrieverTool, Unit.tool]
    llm = ChatOpenAI(temperature=0.3, model="gpt-4", verbose=True)
    agent = initialize_agent(tools, llm, prompt = prompt_selector, agent=AgentType.OPENAI_FUNCTIONS, verbose = True)
    agent.run(prompt.format(question=question))

## Prompt with ttl file

In [20]:
prefix_map = {'http://schema.org/':'schema:',
              'https://enpkg.commons-lab.org/module/':'em:',
              'http://purl.org/pav/':'pav:',
              'http://example.org/':'example:',
              'https://enpkg.commons-lab.org/kg/':'e:',
              'http://purl.org/dc/terms/':'dcterms:',
              'http://xmlns.com/foaf/0.1/': 'foaf:',
              'http://proton.semanticweb.org/protonsys#':'proton:',
              'http://www.w3.org/2001/XMLSchema#': 'xsd:',
              'http://www.w3.org/2000/01/rdf-schema#':'rdfs:',
              'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf:',
              'http://www.w3.org/2002/07/owl#': 'owl:',
              'http://purl.org/vocab/vann/': 'vann:'}

def get_prefix(url):
    for prefix, substitution in prefix_map.items():
        if url.startswith(prefix):
            substituted_url = url.replace(prefix, substitution)
            return substituted_url
    return url


In [21]:
from rdflib import Graph


# load schema
with open('schema_local_ttl/merged.ttl', 'r') as file:
    ttl_schema = file.read()

# Create an empty RDF graph
graph = Graph()

# Parse the TTL schema string
graph.parse(data=ttl_schema, format='turtle')

# Retrieve the explicit triples as a list
triples = [(get_prefix(str(s)), get_prefix(str(p)), get_prefix(str(o))) for s, p, o in graph if get_prefix(str(o)) != '']

#### 1 Agent generate sparql

Prompt: simple_agent_prompt

Input: schema, question

Tools: simple_agent_tools

In [19]:
simple_agent_prompt = """
You are trying to generate a SPARQL query for the Experiment Natural Products Knowledge Graph (ENPKG) based on a natural language question. Here are your instructions:

1. Generate sparql query
2. Run sparql query with SparqlQueryRunner Tool


Note:
* DO NOT assume any identifiers 
* This is NOT the wikidata knowledge graph

Use the schema below for identifying relavent URIs and to understand instances of what classes or related to others. Here is the schema of ENPKG is .ttl format. The entities listed indicate the type of objects that the predicates relate.
{schema}

You can use this prefix (replace the start of the URI with the prefix):
PREFIX enpkg: <https://enpkg.commons-lab.org/kg/>
PREFIX enpkg_module: <https://enpkg.commons-lab.org/module/>

Question: {question}
"""

simple_agent_tools = [run_sparql_query.tool]

#### 1 Agent with Taxon and Chemical Class Tools

Prompt: option2_prompt

Input: schema, question

Tools: option2_tools

In [67]:
option2_tools = [run_sparql_query.tool, taxon_agent.tool, target_agent.tool, structure_agent.tool, NPCClass.tool, SchemaRetrieverTool]

#### Nested Agents with only generate sparql

Prompt: outer_prompt

Input: question

Tools: nested_agent_tools

In [21]:
nested_prompt = """
You are trying to generate a SPARQL query for the Experiment Natural Products Knowledge Graph (ENPKG) based on a natural language question. Here are your instructions:

Return generate SPARQL query with nothing else.


Note:
* DO NOT assume any identifiers 
* This is NOT the wikidata knowledge graph

Use the schema below for identifying relavent URIs and to understand instances of what classes or related to others. Here is the schema of ENPKG is .ttl format. The entities listed indicate the type of objects that the predicates relate.
{schema}

You can use this prefix (replace the start of the URI with the prefix):
PREFIX enpkg: <https://enpkg.commons-lab.org/kg/>
PREFIX enpkg_module: <https://enpkg.commons-lab.org/module/>

Question: {question}"""
nested_prompt = nested_prompt.format(schema=ttl_schema, question="{question}")
nested_prompt_template = PromptTemplate(
    template=nested_prompt,
    input_variables=["question"]
)

llm = ChatOpenAI(temperature=0, model="gpt-4")
llm_chain = LLMChain(prompt=nested_prompt_template, llm=llm)
generate_sparql = Tool(name = 'GenerateSPARQL',
                       description = 'Generate a sparql query given a question',
                       func=llm_chain)



In [22]:
outer_prompt = """Use the following two tools to generate a sparql query and run the query for the inputted question.
Tools:

1. GenerateSPARQL
2. SparqlQueryRunner

Question: {question}
"""

nested_agent_tools = [run_sparql_query.tool, generate_sparql]

#### Nested Agent with Taxon and Chemical tools outside generate sparql tool

Prompt: option3_outer_prompt

Input: question

Tools: option3_tools

In [23]:
option3_outer_prompt = """
You are trying to answer a question using the Experiment Natural Products Knowledge Graph (ENPKG). Here are your instructions:

1. ONLY IF a taxon is mentioned, use TaxonTool
2. ONLY IF a chemical class is mentioned used ClassTool
3. Use GenerateSPARQL tool after use of first two tools
4. Run sparql query with SparqlQueryRunner Tool

Question: {question}"""

option3_inner_prompt = """
Use the schema below for identifying relavent URIs and to understand instances of what classes or related to others. Here is the schema of ENPKG is .ttl format. The entities listed indicate the type of objects that the predicates relate.
{schema}

Use chat memory for taxon and chemical class information:
{chat_history} 

You can use this prefix (replace the start of the URI with the prefix):
PREFIX enpkg: <https://enpkg.commons-lab.org/kg/>
PREFIX enpkg_module: <https://enpkg.commons-lab.org/module/>

Question: {question}
"""

In [34]:
option3_prompt = option3_inner_prompt.format(schema=ttl_schema, question="{question}", chat_history="{chat_history}")
option3_prompt_template = PromptTemplate(
    template=option3_prompt,
    input_variables=["question", "chat_history"]
)

llm = ChatOpenAI(temperature=0, model="gpt-4")
memory = ConversationBufferMemory(memory_key="chat_history")
llm_chain = LLMChain(prompt=option3_prompt_template, llm=llm, memory=memory)
generate_sparql_memory = Tool(name = 'GenerateSPARQL',
                       description = 'Generate a sparql query given a question',
                       func=lambda x: llm_chain.predict(question=x))

option3_tools = [run_sparql_query.tool, generate_sparql_memory, taxon_class.tool, NPCClass.tool]


### Create final agent

In [513]:
all_tools = [generate_sparql, run_sparql_query.tool] #+ [taxon_agent] + [NPCClass.tool]
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
prompt = PromptTemplate(
    template=prompts.option2_prompt_template,
    input_variables=["question", "schema"]
)
llm = ChatOpenAI(temperature=0, model="gpt-4", verbose=True)
agent = initialize_agent(option2_tools, llm, prompt = prompt_selector, agent=AgentType.OPENAI_FUNCTIONS, verbose = True)

In [121]:
langchain.debug=False
with get_openai_callback() as cb:
    print(q1)
    run_agent_simple(q1)
    print(cb)

How many features (pos ionization and neg ionization modes) have the same SIRIUS/CSI:FingerID and ISDB annotation by comparing the InCHIKey of the annotations?


[1m> Entering new  chain...[0m
[32;1m[1;3m
Invoking: `SchemaRetrieverTool` with `How many features (pos ionization and neg ionization modes) have the same SIRIUS/CSI:FingerID and ISDB annotation by comparing the InCHIKey of the annotations?`


[0m[33;1m[1;3m@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix enpkg: <https://enpkg.commons-lab.org/kg/> .
@prefix enpkg_module: <https://enpkg.commons-lab.org/module/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix ns1: <http://proton.semanticweb.org/protonsys#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix vann: <http://purl.org/vocab/vann/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

owl:Ontology dcterms:contributor "" ;
    d

# Old Agent

In [540]:
all_tools = [run_sparql_query.tool] + [taxon_agent] + [SchemaClass.tool, NPCClass.tool]#+ tool_extract_keywords

In [542]:
general_template = """
You are trying to generate a SPARQL query for the Experiment Natural Products Knowledge Graph (ENPKG)based on a natural language question. Here are your instructions:

1. Use IDTool for identifying pertinent URIs by inputting the question.
2. Use ClassTool only if the question references a chemical class
3. Use TaxonTool only if a taxon is mentioned. Input is a colon seperated list of length 2 with taxon:question to get URIs if any chemical classes are mentioned. 
4. Run sparql query with SparqlQueryRunner
5. If no results first think about if the unknowns are of the expected rdfs:Class, otherwise tell the user how to improve their question 

Note:
* IMPORTANT: pass a colon seperated list of length 2 into TaxonTool
* DO NOT assume any identifiers 
* This is NOT the wikidata knowledge graph
* Pay attention to whether URI is type 'entity' or 'predicate' when generating the sparql

You can use this prefix (replace the start of the URI with the prefix):
PREFIX enpkg: <https://enpkg.commons-lab.org/kg/>
PREFIX enpkg_module: <https://enpkg.commons-lab.org/module/>

Question: {question}
"""

In [543]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
prompt = PromptTemplate(
    template=general_template,
    input_variables=["question"]
)
llm = ChatOpenAI(temperature=0, model="gpt-4")
agent = initialize_agent(all_tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose = True)

In [4]:
q1 = 'How many features (pos ionization and neg ionization modes) have the same SIRIUS/CSI:FingerID and ISDB annotation by comparing the InCHIKey of the annotations?'
q2 = 'Which samples have features (pos ionization mode) annotated as the class, aspidosperma-type alkaloids, by CANOPUS with a probability score above 0.5, ordered by the decresing count of features as aspidosperma-type alkaloids? Group by extract sample.'
q3 = 'Among the structural annotations from the Tabernaemontana coffeoides (Apocynaceae) seeds extract taxon , which ones contain an aspidospermidine substructure, CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45?'
q3_og = 'Among the structural annotations from the Tabernaemontana coffeoides (Apocynaceae) seeds extract taxon , which ones contain an aspidospermidine substructure?'
q4 = 'Among the SIRIUS structural annotations from the Tabernaemontana coffeoides (Apocynaceae) seeds extract taxon, which ones are reported in the Tabernaemontana genus in Wikidata? Can use service <https://query.wikidata.org/sparql> to run a subquery to wikidata within the sparql query'
q5 = 'Which compounds annotated in the active extract of the Melochia umbellata taxon have activity against T. cruzi reported (in ChEMBL) by looking at the cosmic, zdiac and taxo scores?'
q6 = 'Which features have the most fragments and neutral losses in common with feature #1 from the aerial part in PI mode (the [M+H] ion of walterione G in this extract)?'
q7 = 'Filter the pos ionization mode features of the Melochia umbellata taxon annotated as [M+H]+ by SIRIUS to keep the ones for which a feature in neg ionization mode is detected with the same retention time (+/- 3 seconds) and a mass corresponding to the [M-H]- adduct (+/- 5ppm).'
q8 = 'For features from the Melochia umbellata taxon in pos ionization mode with SIRIUS annotations, get the ones for which a feature in neg ionization mode with the same retention time (+/- 3 seconds) has the same SIRIUS annotation by comparing the InCHIKey 2D.'

In [5]:
q9 = "How many extracts exist for each taxon (submitted taxon), and among these nodes, how many have cross-references to Wikidata (WD)?"
q10 = "Which features were annotated as 'Tetraketide meroterpenoids' by SIRIUS, and how many such features were found for each species and plant part?"
q11 = "What are all distinct taxons for the extracts in the knowledge graph?"
q12 = "What are the taxons, lab process and label (if one exists) for each sample?"
q13 = None
q14 = "What compounds from the Tabernaemontana coffeoides taxon or its siblings in Wikidata have sirius annotations and other features in our knowledge graph? Match based on molecular formula."
q15 = "Given a specific taxon, can we identify all the chemical compounds found in this taxon or its siblings in Wikidata, that also have associated mass spectrometry data and molecular formula annotations from SIRIUS? Returning only those that have matching molecular formulas."
q16 = "Count all the species per family in the collection"