In [1]:
# general python libs
import os
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

# langchain
import langchain
from langchain import LLMChain
from langchain.prompts.prompt import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool, AgentType, initialize_agent

In [2]:
# Wikidata SPARQL endpoint and the User-Agent string sent with each request.
endpoint_url = "https://query.wikidata.org/sparql"
user_agent = "SPARQL_AI/%s.%s" % (sys.version_info[0], sys.version_info[1])


def run_sparql(query,
               url=endpoint_url,
               wikidata_user_agent=user_agent):
    """Run a SPARQL query and return its result bindings.

    Args:
        query: SPARQL query text.
        url: SPARQL endpoint URL; defaults to ``endpoint_url``.
        wikidata_user_agent: User-Agent string passed to ``SPARQLWrapper``;
            defaults to ``user_agent``.

    Returns:
        The value found under ``results -> bindings`` in the converted JSON
        response (``None`` if that path is missing), or the string
        ``'Your query is invalid'`` if running or converting the query raised.
    """
    sparql = SPARQLWrapper(url, agent=wikidata_user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    try:
        return nested_value(sparql.query().convert(), ['results', 'bindings'])
    except Exception:
        # Failures are reported as text rather than raised so that a caller
        # using this as an agent tool receives an observation it can react to.
        return 'Your query is invalid'

# Expose run_sparql to the agent as a tool named "SparqlQueryRunner".
run_sparql_tool = Tool(
    name="SparqlQueryRunner",
    description="Useful to run Sparql queries after keywords have been extracted and identifiers have been found.",
    func=run_sparql,
)

def nested_value(data, path):
    """Follow ``path`` through nested containers and return the value found.

    Args:
        data: Nested structure supporting ``[]`` lookup (e.g. dicts and lists).
        path: Iterable of keys/indices applied one after another.

    Returns:
        The value reached after applying every key in ``path``, or ``None`` as
        soon as one lookup fails. An empty ``path`` returns ``data`` itself.
    """
    current = data
    for key in path:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            # Missing key, out-of-range index, or a value that cannot be
            # subscripted. Anything else (e.g. KeyboardInterrupt) propagates.
            return None
    return current

In [3]:
def identifier_lookup(search, entity_type = "entity",
                 limit = 50, # max number for Wikidata API entity search
                 info_type = ['label', 'description'] # information to include with id
                ):
    """Find Wikidata identifiers whose English label or alias equals ``search``.

    Args:
        search: Text to match exactly against English ``rdfs:label`` or
            ``skos:altLabel`` values.
        entity_type: Accepted for interface compatibility; not used by the
            current query.
        limit: Accepted for interface compatibility; not used by the current
            query.
        info_type: Accepted for interface compatibility; not used by the
            current query. Never mutated.

    Returns:
        A dict mapping each identifier (the last path segment of the entity
        URI) to ``{'label': ..., 'description': ...}``; the description is
        ``''`` when the result row has none. If the query fails or a result
        row cannot be read, the string
        ``"Sorry, I got an error. Please try again."`` is returned instead.
    """
    error_message = "Sorry, I got an error. Please try again."

    # The term is embedded in a single-quoted SPARQL literal, so backslashes,
    # single quotes and line breaks must be escaped or the query is malformed
    # (e.g. for a search term such as "children's book").
    escaped = (
        str(search)
        .replace("\\", "\\\\")
        .replace("'", "\\'")
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    search_literal = f"'{escaped}'"

    SPARQL_template = """
    SELECT DISTINCT ?entity ?entityLabel ?entityDescription
    WHERE {{
    {{
        {{?entity skos:altLabel {searchstr}@en .}}
        UNION
        {{?entity rdfs:label {searchstr}@en .}}  
        OPTIONAL {{
          ?entity schema:description ?entityDescription .
          FILTER (lang(?entityDescription) = "en")
        }}
        SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "en" .
       }}
      }}
    }}
    """ 

    try:
        output = run_sparql(SPARQL_template.format(searchstr=search_literal))

        # run_sparql signals failure with an error string, and yields None when
        # the response lacks the expected structure; neither holds result rows.
        if output is None or isinstance(output, str):
            return error_message

        formatted_data = {}
        for item in output:
            entity_id = item['entity']['value'].split('/')[-1]
            label = item['entityLabel']['value']
            description = item.get('entityDescription', {}).get('value', '')
            formatted_data[entity_id] = {'label': label, 'description': description}
        return formatted_data

    except Exception:
        # Best effort: the result is consumed by an agent, so failures are
        # reported as text instead of being raised.
        return error_message

In [4]:
# Prompt text with {input} and {question} placeholders that tells a lookup
# agent to resolve a keyword to a q-number or p-number using ItemLookup.
item_template = """
Find the best q-number or p-number for {input} using the tools available: \
1. ItemLookup

Return the best q/p-number with the label and description in context of this question: {question}
"""

# Prompt text with {input} and {question} placeholders that tells a lookup
# agent to resolve a keyword to a p-number using PropertyLookup, without
# answering the question itself.
property_template = """
Find the best p-number for {input} using the tools available: \
1. PropertyLookup

Return the best p-number with the label and description in context of this question: {question}. 

Note: don't answer the question.
"""

def _lookup_item(item):
    """Call identifier_lookup for ``item`` with entity_type='entity'."""
    return identifier_lookup(item, entity_type='entity')


def _lookup_property(item):
    """Call identifier_lookup for ``item`` with entity_type='property'."""
    return identifier_lookup(item, entity_type='property')


# Each lookup is wrapped in a one-element list so the lists can be
# concatenated when building a tool set.
item_tool = [
    Tool(
        name='ItemLookup',
        func=_lookup_item,
        description='Useful when you need to know the q-number for an item in order to generate a sparql request. \
        Provides a list of possible q-numbers.',
    )
]

property_tool = [
    Tool(
        name='PropertyLookup',
        func=_lookup_property,
        description='Useful when you need to know the p-number for a property in order to generate a sparql request. \
        Provides a list of possible p-numbers.',
    )
]

def make_tool(template, tools, tool_name):
    """Build a ``Tool`` that resolves a keyword to an identifier via a sub-agent.

    Args:
        template: Prompt template text with ``{input}`` and ``{question}``
            placeholders.
        tools: Tools made available to the sub-agent.
        tool_name: ``'ItemTool'`` or ``'PropertyTool'``; becomes the tool's name
            and selects its description.

    Returns:
        A ``Tool`` named ``tool_name`` whose function expects a single string of
        the form ``'keyword:question'``.

    Raises:
        ValueError: If ``tool_name`` is not one of the supported names.
    """
    # NOTE(review): the example inputs in these descriptions show a JSON-style
    # list, while the tool function parses 'keyword:question' - confirm which
    # format the descriptions should advertise.
    tool_descriptions = {
        'ItemTool': 'Use when you need to know the q-number for an item. \
        Example input: ["age", "what is the age of Obama?"].',
        'PropertyTool': 'Use when you need to know the p-number for an item. \
        Example input: ["age", "what is the age of Obama?"].',
    }
    # Validate first so an unsupported name fails with a clear error instead of
    # an unbound-variable error further down.
    if tool_name not in tool_descriptions:
        raise ValueError(
            "Unsupported tool_name %r; expected one of %s"
            % (tool_name, sorted(tool_descriptions))
        )
    tool_desc = tool_descriptions[tool_name]

    def parsing(string):
        """Split 'keyword:question' and run the lookup sub-agent on it."""
        # Split on the first colon only, so colons inside the question do not
        # break the parse.
        keyword, separator, question = string.partition(":")
        if not separator:
            # Report malformed input as text so the calling agent can retry
            # rather than aborting the whole run.
            return "Invalid input. Pass a single string of the form 'keyword:question'."

        id_prompt = PromptTemplate(
            input_variables=['input', 'question'],
            template=template
        )
        llm = ChatOpenAI(temperature=0.2, model="gpt-4", verbose=True)
        id_agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True)

        return id_agent.run(id_prompt.format(input=keyword, question=question))

    id_tool = Tool(
        name=tool_name,
        description=tool_desc,
        func=parsing
    )

    return id_tool
    


In [5]:
# Build one sub-agent tool per lookup kind: ItemTool uses only ItemLookup,
# PropertyTool uses only PropertyLookup.
item_lookup = make_tool(item_template, item_tool, 'ItemTool')
property_lookup = make_tool(property_template, property_tool, 'PropertyTool')
# NOTE(review): id_tools is not referenced by the other cells visible in this
# notebook - confirm whether it is still needed.
id_tools = [item_lookup, property_lookup]

In [6]:
# A single tool named 'ItemTool' whose sub-agent can use both ItemLookup and
# PropertyLookup, driven by the item prompt template.
all_id_tools = make_tool(item_template, item_tool+property_tool, 'ItemTool')

# Tool : extract keywords

In [8]:
# Prompt asking the model to list entity and property keywords for a question.
# Doubled braces are literal braces; {question} is the only placeholder.
template = """
You are an ontologist and your job is to identify keywords related to entities and properties for a given question.
Format: {{ "entities": ["<entity_1>", ...], "properties": ["<property_1>", ...] }} 
Note that all nouns should be singular and verbs in the present tense.

Example: 

Question: "Can you retrieve the title of all the journal articles published in Webology in 2021?" 
{{ "entities": ["journal article", "journal"], "properties": ["title", "publication date"] }}

Question: {question}
"""

# Settings for the keyword-extraction tool and its model.
tool_name = 'ExtractKeywords'
tool_desc = 'Useful for extracting entity and property keywords based on a question. Input to tool should be a question.'
model_name = "gpt-4"
temperature = 0.8
max_tokens = 1500

prompt_template = PromptTemplate(input_variables=['question'], template=template)

# The API key is read from the environment rather than hardcoded.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# initiate llm
llm = ChatOpenAI(
    model_name=model_name,
    temperature=temperature,
    max_tokens=max_tokens,
    openai_api_key=OPENAI_API_KEY,
)

llm_chain = LLMChain(llm=llm, prompt=prompt_template)

tool_extract_keywords = [
    Tool(
        # A tool function takes a string and should return a string.
        # ``llm_chain.run`` does that; calling the chain object itself returns
        # a dict of inputs and outputs instead.
        func=llm_chain.run,
        description=tool_desc,
        name=tool_name,
    )]

# Build agent

In [9]:
# Tool set for the top-level agent: the identifier sub-agent tool and the
# SPARQL query runner.
all_tools = [all_id_tools, run_sparql_tool]

In [10]:
# Instruction prompt for the top-level agent; {question} is the only
# placeholder.
# NOTE(review): the prompt mentions PropertyTool, but all_tools contains only
# a tool named 'ItemTool' plus SparqlQueryRunner - confirm this is intended.
general_template = """
You are trying to generate a SPARQL query based on a natural language question. 
First extract keywords. Then for each keyword pass the string 'keyword:question' - where question is the original question
I pose - as input to ItemTool or PropertyTool. Then run sparql query with SparqlQueryRunner.If query is invalid or you
need to find more ids use ItemTool again


Note:
* IMPORTANT: pass a colon seperated list of length 2 into ItemTool
* DO NOT assume any q-numbers or p-numbers!!



Question: {question}
"""

In [11]:
# Prompt object built from the top-level template, with `question` as its
# single input variable.
prompt = PromptTemplate(input_variables=["question"], template=general_template)

# Top-level agent: GPT-4 chat model driving the tools in all_tools.
llm = ChatOpenAI(model="gpt-4", temperature=0.2)
agent = initialize_agent(
    all_tools,
    llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
)

In [13]:
# Set LangChain's global debug flag to False, then fill the top-level prompt
# with a question and run the agent on it.
langchain.debug=False
format_prompt = general_template.format(question='how many children does Bach?')
agent.run(format_prompt)

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "\nYou are trying to generate a SPARQL query based on a natural language question. \nFirst extract keywords. Then for each keyword pass the string 'keyword:question' - where question is the original question\nI pose - as input to ItemTool or PropertyTool. Then run sparql query with SparqlQueryRunner.If query is invalid or you\nneed to find more ids use ItemTool again\n\n\nNote:\n* IMPORTANT: pass a colon seperated list of length 2 into ItemTool\n* DO NOT assume any q-numbers or p-numbers!!\n\n\n\nQuestion: how many children does Bach?\n"
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are a helpful AI assistant.\nHuman: \nYou are trying to generate a SPARQL query based on a natural language question. \nFirst extract keywords. Then for each keyword pass the string 'keyword:question' - w

'Johann Sebastian Bach had 20 children.'

In [27]:
# Run the agent on a question that names the property by its full label.
format_prompt = general_template.format(question='What is the Basketball-Reference.com NBA player ID of Hakeem Olajuwon?')
agent.run(format_prompt)



[1m> Entering new  chain...[0m
[32;1m[1;3m
Invoking: `ItemTool` with `Basketball-Reference.com NBA player ID:What is the Basketball-Reference.com NBA player ID of Hakeem Olajuwon?`


[0m

[1m> Entering new  chain...[0m
[32;1m[1;3m
Invoking: `PropertyLookup` with `Basketball-Reference.com NBA player ID`


[0m[33;1m[1;3m{'P2685': {'label': 'Basketball-Reference.com NBA player ID', 'description': 'identifier for a NBA player on the Basketball Reference website'}}[0m[32;1m[1;3mThe best p-number for Basketball-Reference.com NBA player ID is P2685. It is used as an identifier for a NBA player on the Basketball Reference website. This would be the appropriate property to use when looking for the Basketball-Reference.com NBA player ID of Hakeem Olajuwon.[0m

[1m> Finished chain.[0m
[36;1m[1;3mThe best p-number for Basketball-Reference.com NBA player ID is P2685. It is used as an identifier for a NBA player on the Basketball Reference website. This would be the appropriate

'The Basketball-Reference.com NBA player ID of Hakeem Olajuwon is "o/olajuha01".'

In [28]:
# Same question with a shortened property name ("NBA player ID").
format_prompt = general_template.format(question='What is the NBA player ID of Hakeem Olajuwon?')
agent.run(format_prompt)




[1m> Entering new  chain...[0m
[32;1m[1;3m
Invoking: `ItemTool` with `NBA player ID:What is the NBA player ID of Hakeem Olajuwon?`


[0m

[1m> Entering new  chain...[0m
[32;1m[1;3m
Invoking: `PropertyLookup` with `NBA player ID`


[0m[33;1m[1;3m{}[0m[32;1m[1;3mI'm sorry, but I couldn't find a specific p-number for "NBA player ID". It might be under a different label or not exist in the current database.[0m

[1m> Finished chain.[0m
[36;1m[1;3mI'm sorry, but I couldn't find a specific p-number for "NBA player ID". It might be under a different label or not exist in the current database.[0m[32;1m[1;3m
Invoking: `ItemTool` with `Hakeem Olajuwon:What is the NBA player ID of Hakeem Olajuwon?`


[0m

[1m> Entering new  chain...[0m
[32;1m[1;3m
Invoking: `ItemLookup` with `Hakeem Olajuwon`


[0m[36;1m[1;3m{'Q273256': {'label': 'Hakeem Olajuwon', 'description': 'Nigerian-American basketball player'}}[0m[32;1m[1;3mThe best q-number for Hakeem Olajuwon is Q27325

"I'm sorry, but I couldn't find the NBA player ID for Hakeem Olajuwon in the current database. It might not be available or listed under a different label."

In [47]:
# Run the agent on a question that asks for a list of results.
format_prompt = general_template.format(question='What are 10 country capitals in Africa?')
agent.run(format_prompt)




[1m> Entering new  chain...[0m
[32;1m[1;3m
Invoking: `ItemTool` with `country capitals:What are 10 country capitals in Africa?`


[0m

[1m> Entering new  chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: ItemLookup
Action Input: country capitals[0m
Observation: [36;1m[1;3m{}[0m
Thought:[32;1m[1;3mThe ItemLookup tool did not return any results for "country capitals". It might be more effective to look up "capital" instead, as this is likely to be a more common term used in the database. 
Do I need to use a tool? Yes
Action: ItemLookup
Action Input: capital[0m
Observation: [36;1m[1;3m{'Q51929311': {'label': 'largest city', 'description': 'city which has the greatest population in a country'}, 'P36': {'label': 'capital', 'description': 'seat of government of a country, province, state or other type of administrative territorial entity'}, 'Q3916071': {'label': 'capital', 'description': 'Wikimedia disambiguation page'}, 'Q59154781': {'label': 'capital'

'Here are 10 country capitals in Africa:\n\n1. [Country Q3780537](http://www.wikidata.org/entity/Q3780537) - Capital [Q3492539](http://www.wikidata.org/entity/Q3492539)\n2. [Country Q79](http://www.wikidata.org/entity/Q79) - Capital [Q85](http://www.wikidata.org/entity/Q85)\n3. [Country Q114](http://www.wikidata.org/entity/Q114) - Capital [Q3870](http://www.wikidata.org/entity/Q3870)\n4. [Country Q115](http://www.wikidata.org/entity/Q115) - Capital [Q3624](http://www.wikidata.org/entity/Q3624)\n5. [Country Q117](http://www.wikidata.org/entity/Q117) - Capital [Q3761](http://www.wikidata.org/entity/Q3761)\n6. [Country Q258](http://www.wikidata.org/entity/Q258) - Capital [Q3926](http://www.wikidata.org/entity/Q3926)\n7. [Country Q258](http://www.wikidata.org/entity/Q258) - Capital [Q5465](http://www.wikidata.org/entity/Q5465)\n8. [Country Q258](http://www.wikidata.org/entity/Q258) - Capital [Q37701](http://www.wikidata.org/entity/Q37701)\n9. [Country Q262](http://www.wikidata.org/entity/Q

In [30]:
# Run the agent on a question about the age of a show titled "Africa".
format_prompt = general_template.format(question='How old is the show, Africa?')
agent.run(format_prompt)



[1m> Entering new  chain...[0m
[32;1m[1;3m
Invoking: `ItemTool` with `old:How old is the show, Africa?`


[0m

[1m> Entering new  chain...[0m
[32;1m[1;3m
Invoking: `ItemLookup` with `old`


[0m[36;1m[1;3m{'Q29719345': {'label': 'so', 'description': 'protein-coding gene in the species Drosophila melanogaster'}}[0m[32;1m[1;3m
Invoking: `PropertyLookup` with `duration`
responded: {content}

[0m[33;1m[1;3m{'P2047': {'label': 'duration', 'description': 'length of time of an event or process'}, 'Q2199864': {'label': 'duration', 'description': 'physical quantity for describing the temporal distance between events'}, 'Q116951346': {'label': 'duration', 'description': 'JavaScript library'}, 'Q5316263': {'label': 'duration', 'description': 'the number of calendar periods from a project element execution starts to it is completed'}, 'Q2341537': {'label': 'duration', 'description': 'theory of time and consciousness posited by the French philosopher Henri Bergson'}, 'Q16038819':

'The show "Africa" is 10 years old.'