# SQLQueryChain - Mondial - GPT 4


In [None]:

from langchain.chat_models import ChatOpenAI
from langchain.chains import create_sql_query_chain
from urllib.parse import quote  
from langchain.callbacks import get_openai_callback

from dotenv import load_dotenv
import os
import sys
import json
import time
load_dotenv()

# Make the shared helper modules importable from this notebook by adding the
# "functions" directory, located two levels above the working directory, to
# sys.path (presumably where sqldatabase_langchain_utils lives).
experiment_path = '../..'
path = os.path.abspath('')
module_path = os.path.join(path, experiment_path)
print(module_path)

# Test for the entry that is actually appended. The previous guard checked
# `module_path` itself, which never matched, so every re-run of the cell
# appended a duplicate sys.path entry.
functions_path = module_path + "/functions"
if functions_path not in sys.path:
    sys.path.append(functions_path)


from sqldatabase_langchain_utils import SQLDatabaseLangchainUtils


# Schema

In [None]:
# Alternative configuration (Mondial); swap with the active one below to use it.
# SCHEMA = 'mondial_gpt'
# PREFIX = 'mondial'

# FILE_NAME_RESULT = f"results/12_sql_queries_chatgpt4_{SCHEMA}_fk.json"

# Active configuration.
# SCHEMA selects the "<SCHEMA>_db_connection.json" settings file and PREFIX
# the "queries_<PREFIX>.json" question set used further down.
SCHEMA = 'shipment'
PREFIX = 'shipment'
# Derived from SCHEMA so the result file name cannot drift from the schema
# actually used (evaluates to "results/12_sql_queries_gpt4_shipment.json").
FILE_NAME_RESULT = f"results/12_sql_queries_gpt4_{SCHEMA}.json"


In [None]:
def save_queries(queries):
    """Write ``queries`` to ``FILE_NAME_RESULT`` as ``{"queries": [...]}``.

    The file is rewritten in full on every call, so the latest call wins.

    Args:
        queries: JSON-serializable list of query records.

    Raises:
        OSError: if the directory or file cannot be created or written.
        TypeError: if ``queries`` contains values ``json`` cannot serialize.
    """
    target_dir = os.path.dirname(FILE_NAME_RESULT)
    if target_dir:
        # Create the output directory on first use instead of failing with
        # FileNotFoundError when it does not exist yet.
        os.makedirs(target_dir, exist_ok=True)
    # Explicit utf-8 matches the encoding read_queries() uses to load the file.
    with open(FILE_NAME_RESULT, "w", encoding="utf-8") as arquivo_json:
        json.dump({"queries": queries}, arquivo_json, indent=4)
        
def read_queries():
    """Load ``FILE_NAME_RESULT`` and return the list stored under "queries".

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
        KeyError: if the document has no "queries" entry.
    """
    # errors='ignore' drops undecodable bytes, and strict=False lets the JSON
    # parser accept raw control characters inside strings.
    with open(FILE_NAME_RESULT, encoding='utf-8', errors='ignore') as result_file:
        payload = json.load(result_file, strict=False)
    return payload["queries"]



## Conexão

In [None]:
# Load the connection settings for the selected schema from the datasets
# directory. Undecodable bytes are dropped (errors='ignore') and control
# characters inside JSON strings are tolerated (strict=False).
json_file_path = "../../datasets/{}_db_connection.json".format(SCHEMA)
with open(json_file_path, encoding='utf-8', errors='ignore') as json_data:
    db_connection = json.loads(json_data.read(), strict=False)

# Notebook display of the loaded settings.
# NOTE(review): this may show credentials in the cell output - confirm before sharing.
db_connection

### Utilizando o SQLDatabase para pegar todas as informações do database

In [None]:
# First pass: build the wrapper without a table filter so the available table
# names can be listed.
db = SQLDatabaseLangchainUtils(db_connection=db_connection)

# Optional exclusion list, only used by the commented-out filter further down.
# exclusao = [
#     f"{SCHEMA}_tmdp",
#     f"{SCHEMA}_tmdpmap",
#     f"{SCHEMA}_tmds",
#     f"{SCHEMA}_tmjmap",
#     f"{SCHEMA}_tpv",
#     f"{SCHEMA}_tmdc",
#     f"{SCHEMA}_tmdcmap",
#     f"{SCHEMA}_tmdej",
#     f"{SCHEMA}_log_action",
#     f"{SCHEMA}_log_error",
#     f"{SCHEMA}_favorite_item", 
#     f"{SCHEMA}_favorite_query",
#     f"{SCHEMA}_favorite_tag",
#     f"{SCHEMA}_favorite_tag_item",
#     f"{SCHEMA}_favorite_visualization",
#     f"{SCHEMA}_dashboard",
#     f"{SCHEMA}_history",
#     "teste_cliente",
#     "teste_fornecedor",
#     "teste_funcionario"
# ]

# With the filter below disabled, every name reported by the first pass is kept.
include_tables = db.get_table_names()

# Alternative: drop tables starting with PREFIX and those listed in `exclusao`.
# include_tables = [s for s in db.get_table_names() if not s.startswith(PREFIX) and s not in exclusao]
# Second pass: rebuild `db` with the chosen table list (presumably restricting
# the schema the wrapper exposes - confirm in SQLDatabaseLangchainUtils).
db = SQLDatabaseLangchainUtils(db_connection=db_connection, include_tables=include_tables)
# Notebook display of the resulting table names.
db.get_table_names()

In [None]:
# Notebook display: how many table names `db` reports.
len(db.get_table_names())

## Criando o prompt

In [None]:
from langchain.prompts.prompt import PromptTemplate

# Read the prompt text from disk. The context manager closes the file even if
# read() raises, and the explicit utf-8 encoding matches the other file reads
# in this notebook instead of depending on the platform default.
with open("prompts/prompt_template_sql_query_chain.txt", "r", encoding="utf-8") as f:
    prompt_template = f.read()

# The template text must contain the placeholders {input}, {table_info} and
# {top_k} declared here.
PROMPT = PromptTemplate(
    input_variables=["input", "table_info", "top_k"], template=prompt_template
)

print(PROMPT)

## Criando o Chain para gerar SQL

In [None]:
# Chat model used to generate the SQL: GPT-4 with temperature 0.
llm = ChatOpenAI(model_name='gpt-4', temperature=0)

# Chain that turns a natural-language question into a SQL query string, using
# the custom PROMPT and the database object held by the `db` wrapper.
query_chain = create_sql_query_chain(llm, db.db, prompt=PROMPT)
# Notebook display of the assembled chain.
query_chain



## Preparando as consultas em linguagem natural para rodar no LLM

In [None]:

# Load the natural-language questions for the selected dataset and keep only
# the list stored under the "queries" key.
json_file_path = "../../datasets/{0}/queries_{0}.json".format(PREFIX)
with open(json_file_path, encoding='utf-8', errors='ignore') as json_data:
    queries = json.loads(json_data.read(), strict=False)['queries']
# Notebook display of the loaded questions.
queries

# Rodando as consultas no LLM para gerar SQL

In [None]:


# Generate one SQL query per question, recording token usage, cost and latency
# on each record. A 2-second pause follows every request to avoid API blockage.

for instance in queries:
    with get_openai_callback() as cb:
        # perf_counter is a monotonic clock, so the measured duration is not
        # affected by system clock adjustments.
        start_time = time.perf_counter()
        # The chain fills the PROMPT template (input, table_info, top_k), sends
        # it to the model and returns the generated SQL as a string.
        sql_query = query_chain.invoke({"question": instance["question"]})
        end_time = time.perf_counter()
        # Store the SQL on a single line.
        instance["query_string"] = sql_query.replace('\n', ' ').strip()
        instance['total_tokens'] = cb.total_tokens
        instance['prompt_tokens'] = cb.prompt_tokens
        instance['completion_tokens'] = cb.completion_tokens
        instance['total_cost'] = cb.total_cost
        instance['time'] = end_time - start_time
        print(instance['id'], instance['question'], sql_query, instance['time'], instance['total_cost'])
    # Persist after every question so finished work survives an interruption.
    save_queries(queries)
    time.sleep(2)
# Notebook display of the enriched records.
queries

## Prompt Gerado pelo Langchain

In [None]:
# Earlier approach, kept for reference:
# sql_query_chain_prompt = query_chain.middle[0].template.format(table_info=db.get_table_info(), top_k=0, input="{input}")
# Newer LangChain versions return a RunnableSequence instead of a Chain object,
# which doesn't allow the approach above.

# Render the template directly to get an idea of the prompt sent to GPT-4.
# The question slot is left as the literal "{input}" placeholder.
prompt_values = {
    "table_info": db.get_table_info(),
    "top_k": 0,
    "input": "{input}",
}
sql_query_chain_prompt = PROMPT.format(**prompt_values)
print(sql_query_chain_prompt)

#### Fixing queries

In [None]:
# Re-generate the SQL for selected entries and patch them into the saved
# result file. `to_fix` holds list positions used to index both the in-memory
# `queries` and the list loaded from the result file.
to_fix = [40,59,62,72,85,99]

# Only this loop writes the result file, so loading it once is equivalent to
# re-reading it on every iteration.
q = read_queries()
for pos in to_fix:
    instance = queries[pos]
    with get_openai_callback() as cb:
        # perf_counter is a monotonic clock, so the measured duration is not
        # affected by system clock adjustments.
        start_time = time.perf_counter()
        sql_query = query_chain.invoke({"question": instance["question"]})
        end_time = time.perf_counter()
        # Store the SQL on a single line, exactly as the main generation loop
        # does, so re-generated entries have the same format as the rest.
        instance["query_string"] = sql_query.replace('\n', ' ').strip()
        instance['total_tokens'] = cb.total_tokens
        instance['prompt_tokens'] = cb.prompt_tokens
        instance['completion_tokens'] = cb.completion_tokens
        instance['total_cost'] = cb.total_cost
        instance['time'] = end_time - start_time
        q[pos] = instance
        print(instance['id'], instance['question'], instance["query_string"], instance['time'], instance['total_cost'])
        # Persist after every fix so finished work survives an interruption.
        save_queries(q)