In [41]:
%pip install opensearch-dsl

[0mNote: you may need to restart the kernel to use updated packages.


In [42]:
import os
import json

from dotenv import load_dotenv

from langchain.prompts.prompt import PromptTemplate
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate

from langchain.output_parsers.structured import StructuredOutputParser, ResponseSchema

from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain_core.runnables import RunnablePassthrough

from pydantic import BaseModel, Field

load_dotenv()

index_name = "unflattened_social_listening"

username = os.environ["OPENSEARCH_USERNAME"]
password = os.environ["OPENSEARCH_PASSWORD"]
domain_endpoint = os.environ["OPENSEARCH_DOMAIN_ENDPOINT"]

In [43]:
with open('mapping.json', 'r') as file:
    data = json.load(file)

mapping_string = json.dumps(data, indent=4)

In [44]:
from pydantic import BaseModel, Field
from typing import List, Optional

class Trait(BaseModel):
    Name: str = Field(description="Name of the trait, indicating the nature of the trait, like 'SYMPTOM', 'DIAGNOSIS', etc.")
    Score: float = Field(description="A score representing the confidence or relevance of the trait.")

class Attribute(BaseModel):
    Type: str = Field(description="Type of the attribute, e.g., 'QUALITY'.")
    Score: float = Field(description="A score representing the confidence or relevance of the attribute.")
    RelationshipScore: float = Field(description="Score representing the strength of the relationship with the main entity.")
    RelationshipType: str = Field(description="Type of the relationship, e.g., 'QUALITY'.")
    Id: int = Field(description="Unique identifier of the attribute.")
    BeginOffset: int = Field(description="Starting position of the attribute text in the original text.")
    EndOffset: int = Field(description="Ending position of the attribute text in the original text.")
    Text: str = Field(description="Text of the attribute.")
    Category: str = Field(description="Category of the attribute, e.g., 'MEDICAL_CONDITION'.")
    Traits: List[Trait] = Field(default_factory=list, description="List of traits associated with the attribute.")

class Entity(BaseModel):
    Id: int = Field(description="Unique identifier of the entity.")
    BeginOffset: int = Field(description="Starting position of the entity text in the original text.")
    EndOffset: int = Field(description="Ending position of the entity text in the original text.")
    Score: float = Field(description="A score representing the confidence or relevance of the entity.")
    Text: str = Field(description="Text of the entity.")
    Category: str = Field(description="Category of the entity, e.g., 'MEDICAL_CONDITION'.")
    Type: str = Field(description="Type of the entity, e.g., 'DX_NAME'.")
    Traits: List[Trait] = Field(default_factory=list, description="List of traits associated with the entity.")
    Attributes: Optional[List[Attribute]] = Field(default_factory=list, description="List of attributes associated with the entity, if any.")

class Source(BaseModel):
    text: str = Field(description="Original text of the source data.")
    sentiment: str = Field(description="Sentiment expressed in the text, e.g., 'NEGATIVE'.")
    entities: List[Entity] = Field(default_factory=list, description="List of entities detected in the text.")
    tag: str = Field(description="Tag associated with the text, e.g., a condition like 'asthma'.")
    does_it_switch: bool = Field(description="Boolean flag indicating if there is a switch in the context or state.")
    age_on_set_match: bool = Field(description="Boolean flag indicating if the age onset matches certain criteria.")
    has_side_effects: bool = Field(description="Boolean flag indicating if there are any side effects mentioned.")


In [46]:
response_schema = [
    ResponseSchema(
        name="query",
        description="""Extract and paste the query in the response. Only the query should be in the response. This output will be used to search the database.
          """,
        type="str"
    ),
    ResponseSchema(
        name="explanation",
        description="Enter any additional context that you would like to add to the query.",
        type="str"
    )
]

output_parser = StructuredOutputParser.from_response_schemas(response_schema)
format_instructions = output_parser.get_format_instructions()

In [47]:
from langchain.output_parsers.pydantic import PydanticOutputParser

mapping_parser = PydanticOutputParser(pydantic_object=Source)
mapping_instructions = mapping_parser.get_format_instructions()

In [50]:
chat_prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template(""""The GPT is designed to generate queries for retrieving data from elastic search based on the given Elasticsearch index schema. 
                                                 It interprets the provided schema, understands the structure and types of data stored within,
                                                  and crafts precise queries to efficiently extract the relevant information.
                                                  The GPT asks clarifying questions if the schema details are ambiguous or incomplete, 
                                                 aiming to ensure the generated queries are accurate and tailored to the user's needs. 
                                                 It prioritizes clear and concise communication,
                                                  explaining the logic behind each query and offering tips for optimizing search results within the constraints of
                                                  OpenSearch's capabilities.

                                                   The index contains the following fields in the following format {mapping_instructions}

                                                   The query is: {query}
                                                 
                                                 output in the format instructions.
                                                 {format_instructions}

                                                   """)
    ],
    input_variables=["query"],
    partial_variables={"mapping_instructions": mapping_instructions, "format_instructions": format_instructions}
)




In [52]:
chat_model = ChatOpenAI()

chain = (
    {'query': RunnablePassthrough()} |
    chat_prompt | chat_model | output_parser
)

result = chain.invoke("Give me all negative posts related to asthma")


In [57]:
result['query']

{'bool': {'must': [{'match': {'sentiment': 'NEGATIVE'}},
   {'match': {'tag': 'asthma'}}]}}

In [94]:
import os
from opensearchpy import OpenSearch, RequestsHttpConnection
from langchain.agents import tool

username = os.getenv("OPENSEARCH_USERNAME", "admin")
password = os.getenv("OPENSEARCH_PASSWORD", "admin")
domain_endpoint = os.getenv("OPENSEARCH_DOMAIN_ENDPOINT", "https://search-socialbi-poc-yurgfqanwzg4tiu4rrbo3avxn4.us-east-2.es.amazonaws.com")

class OpenSearchClient:
    def __init__(self, index_name: str, username: str = username, password: str = password, domain_endpoint: str=domain_endpoint):
        self.client = OpenSearch(
            hosts=[domain_endpoint],
            http_auth=(username, password),
            connection_class=RequestsHttpConnection,
            use_ssl=True,
            verify_certs=True,
        )
        self.index_name = index_name

    @tool
    def execute_query_tool(self, query: str) -> dict:
        """Run a query on the OpenSearch index, and returns the result."""
        return self.client.search(index=self.index_name, body=query)
    
    def execute_query(self, query):
        return self.client.search(index=self.index_name, body=query)
    


In [95]:
es_client = OpenSearchClient(index_name="unflattened_social_listening")

In [96]:
es_results = es_client.execute_query({'query': result['query']})

In [97]:
es_results['hits']

{'total': {'value': 94, 'relation': 'eq'},
 'max_score': 2.449619,
 'hits': [{'_index': 'unflattened_social_listening',
   '_id': 't_6crIsBuUee1_Sg747E',
   '_score': 2.449619,
   '_source': {'text': 'It wasn\'t like that for me.   It just made my dreams vivid and fragmented. There was no narrative to them, they were a "mash-up" of things that didn\'t go together.   I still knew they were dreams - no way did a squirrel make itself a sandwich in real life, I don\'t think - and they weren\'t upsetting.   Zero side effects now.   I cannot say that for my other asthma medicines.',
    'sentiment': 'NEGATIVE',
    'entities': [{'Id': 2,
      'BeginOffset': 308,
      'EndOffset': 320,
      'Score': 0.6257612705230713,
      'Text': 'side effects',
      'Category': 'MEDICAL_CONDITION',
      'Type': 'DX_NAME',
      'Traits': [{'Name': 'SYMPTOM', 'Score': 0.5281373262405396}]},
     {'Id': 1,
      'BeginOffset': 359,
      'EndOffset': 375,
      'Score': 0.7353901267051697,
      'Text'

In [109]:
from langchain.agents import AgentExecutor, tool
from langchain.agents.output_parsers import XMLAgentOutputParser

@tool
def es_tool(query: str) -> dict:
    """Run a query on the OpenSearch index, and returns the result."""
    return es_client.execute_query(query)

tool_list = [es_tool]

def convert_intermediate_steps(intermediate_steps):
    log = ""
    for action, observation in intermediate_steps:
        log += (
            f"<tool>{action.tool}</tool><tool_input>{action.tool_input}</tool_input><observation>{observation}</observation>"
        )
    return log

def convert_tools(tools):
    print(tools)
    return "\n".join([f"{tool.name}: {tool.description}" for tool in tools])

convert_tools(tool_list)

[StructuredTool(name='es_tool', description='es_tool(query: str) -> dict - Run a query on the OpenSearch index, and returns the result.', args_schema=<class 'pydantic.v1.main.es_toolSchemaSchema'>, func=<function es_tool at 0x15007f560>)]


'es_tool: es_tool(query: str) -> dict - Run a query on the OpenSearch index, and returns the result.'

In [110]:
from langchain import hub

prompt = ChatPromptTemplate(
    input_variables=['agent_scratchpad', 'input', 'tools', 'opensearch_schema'], 
    partial_variables={'chat_history': ''}, 
    messages=
    [HumanMessagePromptTemplate
     (prompt=PromptTemplate(
         input_variables=['agent_scratchpad', 'chat_history', 'input', 'tools'], 
         template="""The GPT is designed to generate queries for retrieving data from elastic search based on the given Elasticsearch index schema. 
                                                 It interprets the provided schema, understands the structure and types of data stored within,
                                                  and crafts precise queries to efficiently extract the relevant information.
                                                  The GPT asks clarifying questions if the schema details are ambiguous or incomplete, 
                                                 aiming to ensure the generated queries are accurate and tailored to the user's needs. 
                                                 It prioritizes clear and concise communication,
                                                  explaining the logic behind each query and offering tips for optimizing search results within the constraints of
                                                  OpenSearch's capabilities.

                                                   The index contains the following fields in the following format {opensearch_schema}
.
         \n\nYou have access to the following tools:
         \n\n{tools}\n\nIn order to use a tool, you can use <tool></tool> and <tool_input></tool_input> tags.
           You will then get back a response in the form <observation></observation>\n
           For example, if you have a tool called 'search' that could run a google search, in order to search for the weather in SF you would respond:
           \n\n<tool>search</tool><tool_input>weather in SF</tool_input>\n<observation>64 degrees</observation>\n\n
           When you are done, respond with a final answer between <final_answer></final_answer>. For example:\n\n<final_answer>
           The weather in SF is 64 degrees</final_answer>\n\nBegin!\n\nPrevious Conversation:\n{chat_history}\n\nQuestion: {input}\n{agent_scratchpad}
           addi
           """))])

In [111]:
agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: convert_intermediate_steps(
            x["intermediate_steps"]
        )
    }
    | prompt.partial(tools=convert_tools(tool_list), opensearch_schema=mapping_instructions)
    | chat_model.bind(stop=["</tool_input>", "</final_answer>"])
    | XMLAgentOutputParser()
)

[StructuredTool(name='es_tool', description='es_tool(query: str) -> dict - Run a query on the OpenSearch index, and returns the result.', args_schema=<class 'pydantic.v1.main.es_toolSchemaSchema'>, func=<function es_tool at 0x15007f560>)]


In [112]:
agent_executor = AgentExecutor(agent=agent, tools=tool_list, verbose=True)

In [114]:
agent_executor.invoke({"input":"Give me all negative posts related to asthma"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m<tool>es_tool</tool><tool_input>{"query": {"bool": {"must": [{"match": {"sentiment": "NEGATIVE"}}, {"match": {"tag": "asthma"}}]}}}[0m[36;1m[1;3m{'took': 154, 'timed_out': False, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 94, 'relation': 'eq'}, 'max_score': 2.449619, 'hits': [{'_index': 'unflattened_social_listening', '_id': 't_6crIsBuUee1_Sg747E', '_score': 2.449619, '_source': {'text': 'It wasn\'t like that for me.   It just made my dreams vivid and fragmented. There was no narrative to them, they were a "mash-up" of things that didn\'t go together.   I still knew they were dreams - no way did a squirrel make itself a sandwich in real life, I don\'t think - and they weren\'t upsetting.   Zero side effects now.   I cannot say that for my other asthma medicines.', 'sentiment': 'NEGATIVE', 'entities': [{'Id': 2, 'BeginOffset': 308, 'EndOffset': 320, 'Score': 0.6257612705

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens. However, your messages resulted in 7772 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}