In [1]:
# Setup for semantic table search: imports, model wrappers and vector collections.
import sys
# NOTE(review): hard-coded, machine-specific absolute path. The project imports
# below only resolve when the repository lives at exactly this location.
sys.path.append("C:\\Users\\lauth\\OneDrive\\Desktop\\sql_assistant_v3")
from experiments.experiments_settings.settings import Experiments_Settings
from src.components.models.embeddings.embeddings import HF_MultilingualE5_Embeddings
from src.components.models.llms.llms import HF_Llama38b_LLM

# Wrapper objects shared by the cells below (passed around as `llm` / `embeddings`).
llm = HF_Llama38b_LLM()
embeddings = HF_MultilingualE5_Embeddings()

# Presumably loads the underlying models so the wrappers can be queried - TODO confirm.
llm.init_model()
embeddings.init_model()

# Collections obtained from the experiment settings; presumably Chroma
# collections holding term definitions (one per LLM flavour) - TODO confirm.
openai_collection = Experiments_Settings.Chroma.get_experiments_with_openai_collection()
llama_collection = Experiments_Settings.Chroma.get_experiments_with_llama_collection()

In [11]:
# Llama 3 prompt template


# Llama 3 chat template
def apply_llama3_chat_template(instruction, instruction_suffix):
    """Wrap an instruction in the Llama 3 instruct prompt format.

    The prompt consists of a fixed system turn, a user turn containing
    ``instruction`` and an opened assistant turn that is pre-filled with
    ``instruction_suffix`` so the model continues right after it.

    Each role header is written as
    ``<|start_header_id|>{role}<|end_header_id|>`` followed by a blank line,
    as in the published Llama 3 prompt format. The role name must sit directly
    between the two header tokens: a line break before ``assistant`` would
    become part of the role name.

    Args:
        instruction: Text of the user turn.
        instruction_suffix: Text placed at the start of the assistant turn
            (for example ``"terms:"``).

    Returns:
        The complete prompt string.
    """
    # NOTE(review): the system prompt wording (typos included) is kept verbatim
    # so that only the template structure changes.
    system_prompt = "Your are a very helpfull assistant"
    return (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n\n"
        f"{system_prompt}<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{instruction}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{instruction_suffix}"
    )


# Prompt: identify technical terms

def get_technical_terms_template(user_request):
    """Build the prompt asking the model to list the technical terms of a request.

    Args:
        user_request: Sentence to analyse; substituted into the template.

    Returns:
        The prompt in Llama 3 chat format, with the assistant turn pre-filled
        with ``"terms:"``.
    """
    template = """I want to create a dictionary but I need your help to find all possible technical, ambiguous or unknowing terms in the sentence, follow carefully the next steps:
First, take your time and read carefully the next sentence:
sentence: '''{user_request}'''

Second, extract technical, ambiguous or unknowing terms.

Third, look at the sentence again and find possible rare names or tags.

Fourth, create a list with extracted terms and erase known common terms that are not related to systems measuring.

Note: Do not create new terms, use only the terms in sentence. You can also use compound words. Do not cut or separate a single word into several words.

Output format response:
The output should be formatted with the key format below. Do not add anything beyond the key format.
Start Key format:
key: "terms"
content: Comma separated list of terms.
End of Key format"""

    filled_instruction = template.format(user_request=user_request)
    return apply_llama3_chat_template(filled_instruction, "terms:")


# Prompt: classify whether the request is ambiguous given multiple definitions
def complete_multi_definition_prompt_with_llama3(user_request, terms_arr):
    """Build the prompt that classifies a request as clear or unclear.

    Args:
        user_request: Sentence to classify.
        terms_arr: Iterable of dicts, each with an ``"original_term"`` and a
            ``"definitions"`` iterable whose entries carry a ``"definition"``.

    Returns:
        The prompt in Llama 3 chat format, with the assistant turn pre-filled
        with ``"class:"``.
    """
    template = """Your task is to classify the sentence into clear or unclear, follow carefully the next steps:

First, look up the technical terms found in the sentence:
Technical terms: '''{technical_terms}'''

Second, look up the next definitions and evaluate if there are multi definitions for each of technical terms shown before.
Definitions:
{definitions}
Third, evaluation. To classify pick every term and answer the question: what is human refering with this term?
If there is single definition for a term, then is clear, but if there are many definitions and with sentence context still making ambiguity, then is unclear. On the other hand, if there are many definitions and with sentence context human refers to specific definition, then is clear.

Output format response:
The output should be formatted with the key format below. Do not add anything beyond the key format.
Start Key format:
key: "class"
content: clear/unclear.
key: "analysis"
content: brief analysis.
End of Key format
Begin!
sentence: '''{sentence}'''"""

    term_names = []
    definition_sections = []
    for entry in terms_arr:
        name = entry["original_term"]
        term_names.append(name)
        # One "- ..." bullet per definition, each terminated by a newline.
        bullets = "".join(
            f"""- {candidate["definition"]}\n""" for candidate in entry["definitions"]
        )
        # Header line, bullets, then a blank line separating the terms.
        definition_sections.append(f"""For term '{name}'\n{bullets}\n""")

    filled_instruction = template.format(
        technical_terms=", ".join(term_names),
        definitions="".join(definition_sections),
        sentence=user_request,
    )
    return apply_llama3_chat_template(filled_instruction, "class:")


# Prompt: replace technical terms with wording that is easier for semantic search
def replace_terms_prompt_with_llama3(user_request, terms_arr):
    """Build the prompt that rewrites a request using suggested replacements.

    Only terms that have at least one definition are listed; for each of them
    every definition's ``"replace_instruction"`` is added as a bullet.

    Args:
        user_request: Sentence to rewrite.
        terms_arr: Iterable of dicts, each with an ``"original_term"`` and a
            ``"definitions"`` sequence whose entries carry a
            ``"replace_instruction"``.

    Returns:
        The prompt in Llama 3 chat format, with the assistant turn pre-filled
        with ``"modified_sentence:"``.
    """
    # The template is split in two literals so that the trailing space after
    # "comprehension." (present in the prompt) cannot be lost to whitespace
    # trimming by an editor.
    template = (
        """The next is a request sentence:
sentence: '''{sentence}'''

Instructions:
Your task is to find the next list of technical terms in sentence and replace them if it is necessary with correct words for a better comprehension. """
        """

Technical terms: '''{technical_terms}'''

Suggestions for replacing:
{replace_instructions}
Note: Pay attention to the recommendations. When you find a term that needs to be replaced use the suggested one.

Output format response:
The output should be formatted with the key format below. Do not add anything beyond the key format.
Start Key format:
key: "modified_sentence"
content: Modified sentence.
End of Key format

Begin!"""
    )

    term_names = []
    suggestion_sections = []
    for entry in terms_arr:
        candidates = entry["definitions"]
        if len(candidates) == 0:
            # Terms without definitions have nothing to suggest.
            continue
        name = entry["original_term"]
        term_names.append(name)
        bullets = "".join(
            f"""- {candidate["replace_instruction"]}\n""" for candidate in candidates
        )
        suggestion_sections.append(f"""For term '{name}'\n{bullets}\n""")

    filled_instruction = template.format(
        technical_terms=", ".join(term_names),
        replace_instructions="".join(suggestion_sections),
        sentence=user_request,
    )

    return apply_llama3_chat_template(filled_instruction, "modified_sentence:")


# Prompt: generate a question asking the user for the missing details
def get_multi_definition_question_prompt(user_request, analysis):
    """Build the prompt that asks the model for a clarifying question.

    Args:
        user_request: Request that was classified as unclear.
        analysis: Explanation of why the request is unclear.

    Returns:
        The prompt in Llama 3 chat format, with the assistant turn pre-filled
        with ``"question:"``.
    """
    template = """Your task is to generate a question about what should be given by user to make his request clear and reduce ambiguity, follow carefully the next steps:

First, look up the user request:
user_request: '''{user_request}'''

Second, read carefully the next analysis why is this request determinated as "unclear":
analysis: '''{analysis}'''

Third, evaluation. Before you generate a question, evaluate what is necessary to ask the user and what needs to be given by user to be a clear sentence based on the previous analysis.

Note: Don't forget to provide some options so the user can easily choose.

Output format response:
The output should be formatted with the key format below. Do not add anything beyond the key format.
Start Key format:
key: "question"
content: Generated question that will be asked to the user.
key: "analysis"
content: brief analysis in one line.
End of Key format

Begin!"""

    filled_instruction = template.format(user_request=user_request, analysis=analysis)
    return apply_llama3_chat_template(filled_instruction, "question:")


# Prompt: complete the user request with the user's answer
def complete_question_prompt_with_llama3(user_request, ai_question, user_response):
    """Build the prompt that merges the user's answer into the original request.

    Args:
        user_request: Request to be improved.
        ai_question: Clarifying question that was shown to the user.
        user_response: Answer the user gave to that question.

    Returns:
        The prompt in Llama 3 chat format, with the assistant turn pre-filled
        with ``"modified_sentence:"``.
    """
    template = """Your task is to improve the user request according to a selected option:

First, look up the sentence request (This is what you will modify):
sentence_request: '''{user_request}'''

Second, read carefully the next messages between AI and User:
ai_question: '''{ai_question}'''
user_response: '''{user_response}'''

Third, evaluation. Improve the previous request according to the user information given in his response.

Note: Try not to change all the sentence request, use user response information to complete the request.

Output format response:
The output should be formatted with the key format below. Do not add anything beyond the key format.
Start Key format:
key: "modified_sentence"
content: Modified sentence request, should start with: "The human is ...".
key: "analysis"
content: brief analysis in one line.
End of Key format

Begin!"""

    filled_instruction = template.format(
        user_request=user_request,
        ai_question=ai_question,
        user_response=user_response,
    )
    return apply_llama3_chat_template(filled_instruction, "modified_sentence:")
    

In [3]:
# Para buscar los terminos tecnicos en bases de datos vectorial

from src.components.models.models_interfaces import Base_Embeddings
from experiments.db.handlers.handlers import query_by_vector_embedding
from src.utils.utils import clean_technical_term, clean_sentence

# Tables searched by default; each term is queried once per table.
_DEFAULT_DEFINITION_TABLES = (
    "teq_clasificacion",
    "teq_tipo_equipo",
    "equ_equipo",
    "med_tag",
    "pla_plataforma",
    "med_sistema_medicion",
    "fcs_computador_medidor",
    "fcs_firmware",
    "var_tipo_variable",
    "var_variable_datos",
    "flu_tipo_fluido",
    "fcs_computadores",
    "fcs_tipo_computador",
    "med_tipo_medicion",
)


def find_technical_terms_definitions(
    embeddings: Base_Embeddings,
    technical_terms_arr,
    collection,
    tables=None,
    score_threshold=0.85,
):
    """Look up definitions for technical terms in a vector collection.

    Every term is cleaned, embedded and then queried once per table, keeping
    the single best match of each table (``n=1``) filtered by
    ``meta_table_name``. Matches whose definition text is blank are ignored,
    and terms without any definition are left out of the result.

    Args:
        embeddings: Embeddings wrapper; ``get_embeddings`` is called on each
            cleaned term.
        technical_terms_arr: Iterable of raw term strings.
        collection: Collection handed to ``query_by_vector_embedding``.
        tables: Optional iterable of table names to search. Defaults to the
            built-in list of measurement-related tables.
        score_threshold: Value forwarded as ``score_threshold`` to
            ``query_by_vector_embedding``. Defaults to ``0.85``.

    Returns:
        A tuple ``(terms_arr, has_replacement_definitions, sql_instructions)``:
        ``terms_arr`` is a list of dicts with ``"original_term"``,
        ``"cleaned_term"`` and ``"definitions"``;
        ``has_replacement_definitions`` is True when at least one kept match
        has a non-blank replace instruction; ``sql_instructions`` is always
        False (nothing in this function sets it) and is kept so callers can
        continue to unpack three values.
    """
    if tables is None:
        tables = _DEFAULT_DEFINITION_TABLES

    terms_arr = []
    has_replacement_definitions = False
    sql_instructions = False

    for original_term in technical_terms_arr:
        cleaned_term = clean_technical_term(clean_sentence(original_term))
        vector = embeddings.get_embeddings(cleaned_term)

        definitions = []
        for table in tables:
            matches = query_by_vector_embedding(
                collection=collection,
                vector_embedding=vector,
                n=1,
                score_threshold=score_threshold,
                metadata_filters={"meta_table_name": {"$eq": table}},
            )
            for item in matches:
                # Presumably each result is a sequence of (name, value) pairs
                # with the standard term at index 1, the definition at index 2
                # and the replace instruction at index 3 - confirm against
                # query_by_vector_embedding.
                definition = item[2][1]
                if not definition.strip():
                    continue
                standard_term = item[1][1]
                replace_instruction = item[3][1]
                definitions.append(
                    {
                        "standard_term": standard_term,
                        "definition": definition,
                        "replace_instruction": replace_instruction,
                    }
                )
                if replace_instruction.strip():
                    has_replacement_definitions = True

        if definitions:
            terms_arr.append(
                {
                    "original_term": original_term,
                    "cleaned_term": cleaned_term,
                    "definitions": definitions,
                }
            )

    return terms_arr, has_replacement_definitions, sql_instructions

In [12]:
# Procesando el output de multidefinition
from src.components.models.models_interfaces import Base_LLM
from src.utils.utils import txt_2_Json

def process_multidefinition_class(llm: Base_LLM, user_request, llama_response):
    """Resolve an ambiguous request by asking the user a clarifying question.

    When the classification in ``llama_response`` is "unclear", the model is
    asked to generate a question, the question is shown to the user through
    ``input()`` (this blocks until the user answers), and the model then
    rewrites the request using that answer. Otherwise the request is returned
    unchanged.

    Args:
        llm: LLM wrapper; ``query_llm`` is called with each prompt.
        user_request: Request sentence to clarify.
        llama_response: Mapping with the classification under ``"class"`` and,
            when the class is unclear, the reasoning under ``"analysis"``.

    Returns:
        The rewritten request when clarification was needed, otherwise
        ``user_request`` as given.

    Raises:
        KeyError: If ``"class"`` is missing, or if a key needed on the unclear
            path (``"analysis"``, ``"question"``, ``"modified_sentence"``) is
            missing.
    """
    # The classification prompt shows the label as "clear/unclear.", so the
    # model may echo surrounding whitespace, quotes or a trailing period.
    # Normalise before comparing so such answers are not treated as "clear".
    label = str(llama_response["class"]).strip(" \t\r\n\"'.").lower()
    if label != "unclear":
        return user_request

    # The analysis is only needed (and only required) on the unclear path.
    analysis = llama_response["analysis"]

    question_output = txt_2_Json(
        llm.query_llm(get_multi_definition_question_prompt(user_request, analysis))
    )
    question = question_output["question"]

    user_response = input(question)

    completed_output = txt_2_Json(
        llm.query_llm(
            complete_question_prompt_with_llama3(user_request, question, user_response)
        )
    )
    return completed_output["modified_sentence"]

In [None]:
# Manual check of the clarification flow with a hand-written "unclear"
# classification; the call prompts for an answer through input().
user_request = "The human is requesting information about EMED-321-2.1-A"

process_multidefinition_class(llm, user_request, {"class": "unclear", "analysis": "The term 'EMED-321-2.1-A' has multiple definitions, which are related to measurement systems, flow computers, and firmwares/types. Without more context, it\'s unclear what the human is referring to with this term, as it could be any one of those definitions. The request for information about it only adds to the ambiguity, making it uncertain what the human is looking for."})

In [18]:
from src.utils.utils import txt_2_Json, string_2_array
from src.components.models.models_interfaces import Base_LLM, Base_Embeddings

def modified_sentence_for_semantic_search(llm: Base_LLM, embeddings: Base_Embeddings, user_request, collection):
    """Rewrite a request so its technical terms are friendlier to semantic search.

    Steps: extract the technical terms with the LLM, look their definitions up
    in the vector collection and, only when at least one replacement
    instruction was found, classify the request for ambiguity (asking the user
    if needed) and let the LLM replace the terms. Intermediate results are
    printed.

    Args:
        llm: LLM wrapper; ``query_llm`` is called with each prompt.
        embeddings: Embeddings wrapper forwarded to the definition lookup.
        user_request: Request sentence to process.
        collection: Vector collection forwarded to the definition lookup.

    Returns:
        The modified sentence, or ``user_request`` unchanged when there are no
        replacement definitions.
    """
    # Step 1: identify the technical terms.
    terms_output = txt_2_Json(llm.query_llm(get_technical_terms_template(user_request)))
    terms_output["terms"] = string_2_array(terms_output["terms"])

    print("Technical terms")
    print(terms_output)

    # Step 2: look the terms up in the vector database.
    found_terms, can_replace, _ = find_technical_terms_definitions(
        embeddings, terms_output["terms"], collection
    )
    print(found_terms, "\n\n")

    # Nothing to replace: hand the request back untouched.
    if not can_replace:
        return user_request

    # Step 3: detect multiple definitions and judge how clear the request is.
    classification = txt_2_Json(
        llm.query_llm(complete_multi_definition_prompt_with_llama3(user_request, found_terms))
    )
    print("\n\nMulti definitions")
    print(classification)

    # Step 4: if the request is still ambiguous, ask the user and refine it.
    user_request = process_multidefinition_class(llm, user_request, classification)

    # Step 5: replace the special terms with their standard wording.
    replacement = txt_2_Json(
        llm.query_llm(replace_terms_prompt_with_llama3(user_request, found_terms))
    )
    return replacement["modified_sentence"]
   

In [None]:
# Sample requests for manual experiments. Each assignment overwrites the
# previous one, so only the last `user_request` is actually processed.
user_request = "The human is requesting information about measurement system with tag 'EMED-321-2.1-A'"

user_request = "The human is requesting a list of measurement systems with the tag 'EMED' that use Natural Gas, have 1 meter each, and are associated with flow computers."
user_request = "The human is asking for information about the platforms in the database."
user_request = 'The human is asking about the measurement systems that are capable of reading diferential gas.'

# Run the full pipeline against the llama collection and show before/after.
new_sentence = modified_sentence_for_semantic_search(llm, embeddings, user_request, llama_collection)
print("\n\n previous request:\n", user_request)
print("\n\n modified request:\n", new_sentence)

In [None]:
# Hand-written sample in the same shape as the first value returned by
# find_technical_terms_definitions, used to preview the multi-definition prompt.
asd = [{'original_term': 'measurement systems', 'cleaned_term': 'measurement system', 'definitions': [{'standard_term': 'equipment type classification', 'definition': "The term 'terciary meter' commonly refers to an equipment type classification.", 'replace_instruction': "Look up the term 'terciary meter' and replace with 'equipment type classificaton' instead."}, {'standard_term': 'measurement system', 'definition': 'It is a set of measurement units, meters and flow computers that is responsible for measuring variables such as temperature, pressure of fluids.', 'replace_instruction': 'No replace term for this term.'}]}, {'original_term': 'orifice plate', 'cleaned_term': 'orifice plate', 'definitions': [{'standard_term': 'equipment type', 'definition': "The term 'orifice plate' commonly refers to an equipment type.", 'replace_instruction': "Look up the term 'orifice plate' or similar term and replace with 'equipment type' instead."}]}, {'original_term': 'Cexis platform', 'cleaned_term': 'cexis platform', 'definitions': [{'standard_term': 'platform', 'definition': "The term 'Cexis', refers to the to the name of the location of a platform, in a platform you can find meters, measurement systems, flow computers, platform clients, storages.", 'replace_instruction': "The term 'Cexis' refers to a platform, use 'platform' instead of Cexis."}]}]
# Bare expression: it is not the last statement of the cell, so its value is discarded.
asd[0]
# NOTE(review): relies on `user_request` still being defined by an earlier cell.
multi_definitions_instruction = complete_multi_definition_prompt_with_llama3(user_request, asd)
print(multi_definitions_instruction)


In [None]:
# Hand-written table-selection prompt: table descriptions, relationships, a
# sample user request and extra term definitions. The text is sent to the
# model as-is, so its wording (typos included) is part of the experiment.
p = '''According to this tables descriptions:
Platforms/installations table: This table contains information about platforms.
Database table name: pla_plataforma

Equipment type table: This table contains the different equipment types. When a equipment is registered a equipment type is asigned. Equipment types as: orifice plate, differencial pressure transmiter, static pressure transmiter, temperature transmiter, ultrasonic meter, coriolis meter, cone meter.
Database table name: teq_tipo_equipo

Equipments table: This table contains information of equipments registered as their serial, equipment type id, manufacturer id or registered platform.
Database table name: equ_equipo

Measurement system table: This table contains information about measuremets systems as their names, status, read fluid type, the read sub fluid type, location and measurement systems tag.
Database table name: med_sistema_medicion

Assigned equipments table: This table contains information for every equipment assigned to a measurement system. When an equipment is registered, it could be assigned to a measurement system or dropped in storage.
Database table name: med_tag

Equipment Type Classification table: This table contains the classification information for equipment types. Those can be classificated in: Primary meter, secundary meter, thrid meter or valves.
Database table name: teq_clasificacion

Use this relationships descriptions to find the most related tables to the next user request:
 - Every measurement system is located in Platform/installments
 - Equipment types have classification
 - Equipment are assigned to an measurement system
 - Equipments use equipment types

Your task is to pick the most related tables according to the next user request:
user_request: The human is requesting a list of measurement systems that have at least one orifice plate in the Cexis platform.

Extra definitions, use this extra definitions to have a context of some terms:
- The term 'terciary meter' commonly refers to an equipment type classification.
- It is a set of measurement units, meters and flow computers that is responsible for measuring variables such as temperature, pressure of fluids.
- The term 'orifice plate' commonly refers to an equipment type.
- The term 'Cexis', refers to the to the name of the location of a platform, in a platform you can find meters, measurement systems, flow computers, platform clients, storages.

Note: Yoy have to answer with database table name instead of the name of the table.

Follow the next key format to answer:
tables: Comma separated list of selected data table names.

Begin!'''
# Wrap the table-selection prompt in the Llama 3 chat format, pre-filling the
# assistant turn with the expected answer key.
a = apply_llama3_chat_template(p, "tables: ")
# `query_llama3` is not defined in any cell of this notebook; query the model
# through the shared `llm` wrapper, as every other cell does.
r = llm.query_llm(a)
print(r)