<a href="https://colab.research.google.com/github/MbohBless/Playing_with_groq/blob/main/NL2SQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install -q groq
%pip install -q duckdb
%pip install -q sqlparse

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.5/103.5 kB[0m [31m954.1 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import json
import logging
import os
import textwrap
from pathlib import Path
from typing import Any, Dict, List, Optional

import duckdb
import groq
import pandas as pd
import sqlparse

In [None]:
# Configure logging: show records at INFO level and above, and create the
# logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:

def chat_with_groq(client: groq.Groq,
                   prompt: str,
                   model: str,
                   response_format: Optional[Dict[str, str]] = None
                   ) -> Any:
    """
    Send a single-turn user prompt to a Groq chat model and return its reply.

    Args:
        client: Groq client used to call the chat completions endpoint.
        prompt: Full prompt text; sent as the only message, with role "user".
        model: Name of the model to query, e.g. "llama3-70b-8192" or
            "llama3-8b-8192" (see https://console.groq.com/docs/models).
        response_format: Optional response-format specification forwarded to
            the API, e.g. {"type": "json_object"} to request JSON output.
            When None, the argument is left out of the request entirely.

    Returns:
        The content of the first choice's message in the completion.
    """
    request_kwargs: Dict[str, Any] = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
    }
    # Callers that want plain text pass None; omit the key in that case rather
    # than sending an explicit null to the API.
    if response_format is not None:
        request_kwargs["response_format"] = response_format

    completion = client.chat.completions.create(**request_kwargs)
    return completion.choices[0].message.content



In [None]:
def execute_duckdb_query(query: str, data_dir: str = 'data') -> pd.DataFrame:
    """
    Execute a DuckDB query and return the result as a pandas DataFrame.

    The query runs against a fresh in-memory DuckDB database while the process
    working directory is temporarily switched to ``data_dir``, so relative file
    references inside the SQL (e.g. ``FROM employees.csv``) resolve against
    that directory. The previous working directory is always restored.

    Args:
        query (str): The DuckDB query to execute.
        data_dir (str): Directory to run the query from. Defaults to 'data'.

    Returns:
        pd.DataFrame: The result of the query after ``reset_index()``, i.e.
        with the default row index also present as a leading ``index`` column.

    Raises:
        OSError: If ``data_dir`` cannot be entered (for example
            FileNotFoundError when it does not exist).
    """
    original_cwd = os.getcwd()
    os.chdir(data_dir)

    try:
        conn = duckdb.connect(database=":memory:", read_only=False)
        try:
            query_result = conn.execute(query).fetch_df().reset_index()
        finally:
            # Release the connection even when the query fails; otherwise every
            # call leaves an open in-memory database behind.
            conn.close()
    finally:
        # chdir is process-global, so always undo it.
        os.chdir(original_cwd)

    return query_result

In [None]:
def get_summarization(client: groq.Groq,
                      use_question: str,
                      df: pd.DataFrame,
                      model: str) -> Any:
    """
    Ask the model for a short natural-language summary of a query result.

    Args:
        client: Groq client used to send the summarization prompt.
        use_question: The user's original question; inserted into the prompt.
        df: DataFrame holding the query result; its string rendering is
            embedded in the prompt.
        model: Name of the model to query, e.g. "llama3-70b-8192" or
            "llama3-8b-8192" (see https://console.groq.com/docs/models).

    Returns:
        The model's reply as returned by ``chat_with_groq``; no response
        format is requested for this call.
    """
    # Dedent the template before substituting values. Left indented, the
    # source indentation leaks into the prompt and shifts only the first line
    # of the multi-line DataFrame rendering, misaligning its header row
    # against the data rows.
    template = textwrap.dedent('''
        A user asked the following question pertaining to local database tables:

        {user_question}

        To answer the question, a dataframe was returned:

        Dataframe:
        {df}

        In a few sentences, summarize the data in the table as it pertains to the original user question. Avoid qualifiers like "based on the data" and do not comment on the structure or metadata of the table itself
        ''')
    prompt = template.format(user_question=use_question, df=df)
    return chat_with_groq(client, prompt, model, None)


In [None]:
# data download
def download_file(url: str, file_path: str, timeout: float = 30.0) -> None:
    """
    Download ``url`` and write the response body to ``file_path``.

    Args:
        url: Address of the file to fetch with an HTTP GET.
        file_path: Destination path; opened in binary write mode, so an
            existing file at that path is overwritten.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    import requests

    logger.info(f"Downloading {url} to {file_path}")
    # requests applies no timeout by default; without one a stalled server
    # would block this call indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(response.content)
    # Use the module logger here as well, matching the message above.
    logger.info(f"Downloaded {url} to {file_path}")

In [None]:
def create_dir_if_not_available(path):
    """
    Ensure the directory ``path`` exists and return ``path`` unchanged.

    Missing parent directories are created as well. Nothing is logged when the
    directory is already present.

    Args:
        path: Directory path to create if it is missing.

    Returns:
        The ``path`` argument, as given.
    """
    directory = Path(path)
    if not directory.is_dir():
        # Routine setup step: report it at INFO through the module logger,
        # and only when a directory is actually about to be created.
        logger.info(f"Creating directory {path}")
    # Called unconditionally so a concurrent creation between the check and
    # this call is still tolerated (exist_ok=True).
    directory.mkdir(parents=True, exist_ok=True)
    return path

In [None]:
# Fetch the sample CSV tables into the local "data" directory.
data_path = create_dir_if_not_available("data")
file_names = [
    "employees.csv",
    "purchases.csv",
]
file_locations = [
    "https://raw.githubusercontent.com/groq/groq-api-cookbook/main/replit-examples/text-to-sql-json-mode/data/employees.csv",
    "https://raw.githubusercontent.com/groq/groq-api-cookbook/main/replit-examples/text-to-sql-json-mode/data/purchases.csv",
]

# file_names and file_locations are parallel lists: entry i of one belongs to
# entry i of the other.
for file_location, file_name in zip(file_locations, file_names):
    download_file(file_location, os.path.join(data_path, file_name))




In [None]:
# Fetch the base prompt template into the local "prompts" directory.
prompt_path = create_dir_if_not_available("prompts")
prompt_file_names = ["base_prompt.txt"]
prompt_file_locations = ["https://raw.githubusercontent.com/groq/groq-api-cookbook/main/replit-examples/text-to-sql-json-mode/prompts/base_prompt.txt"]

for file_name, file_location in zip(prompt_file_names, prompt_file_locations):
    # Save under prompt_path, not data_path: the template is read back from
    # 'prompts/base_prompt.txt' further down.
    download_file(file_location, os.path.join(prompt_path, file_name))




In [None]:
from google.colab import userdata

In [None]:
def main_func(base_prompt:str,model:str = "llama3-70b-8192",api_key:str=userdata.get("GROQ_API_KEY")):
    """
    Run an interactive text-to-SQL session on the console.

    For each question typed by the user, the model is asked (in JSON mode) for
    a reply. If the reply is a JSON object with an "sql" key, that query is
    executed with ``execute_duckdb_query``, then the formatted SQL, the result
    table and a model-written summary are printed. Any other reply is printed
    as-is. A failed API call, an unparsable reply or a failing query is
    reported and the loop continues with the next question.

    Args:
        base_prompt: Prompt template; must contain a ``{user_question}``
            placeholder, which is filled via ``str.format``.
        model: Name of the Groq model used for SQL generation and summary.
        api_key: Groq API key. NOTE: the default is looked up from the Colab
            secrets once, when this function is defined, not on each call.

    Returns:
        None. The loop ends when the user types 'exit'.
    """
    client = groq.Groq(api_key=api_key)
    print("Welcome to Groq Text to SQL")
    print("You can as questions about the data employee.csv and purchases.csv files")

    while True:
        # Strip so that " exit " still quits and whitespace-only input is
        # treated as empty instead of being sent to the model.
        user_question = input("Enter your question or type 'exit' to quit: ").strip()
        if user_question.lower() == 'exit':
            break
        if not user_question:
            continue

        full_prompt = base_prompt.format(user_question=user_question)
        try:
            raw_response = chat_with_groq(client, full_prompt, model, {
                "type": "json_object"
            })
            response = json.loads(raw_response)
        except groq.APIError as err:
            print(f"Groq request failed: {err}")
            continue
        except json.JSONDecodeError as err:
            print(f"Could not parse the model response as JSON: {err}")
            continue

        # Valid JSON is not necessarily an object; only a dict can carry "sql".
        if not (isinstance(response, dict) and "sql" in response):
            print(response)
            continue

        print(response)
        sql_query = response["sql"]
        # NOTE(review): this SQL is model-generated and executed unchecked;
        # acceptable for a local demo, not for untrusted deployments.
        try:
            results_df = execute_duckdb_query(sql_query)
        except duckdb.Error as err:
            print(f"SQL execution failed: {err}")
            continue

        formatted_sql_query = sqlparse.format(sql_query, reindent=True, keyword_case='upper')
        print(f"SQL Query: {formatted_sql_query}")
        print(results_df.to_markdown())
        try:
            summarization = get_summarization(client, user_question, results_df, model)
        except groq.APIError as err:
            print(f"Groq request failed: {err}")
            continue
        print(summarization)


In [None]:
 # Load the prompt template. The encoding is given explicitly so the result
 # does not depend on the platform default (assumes the file is UTF-8 - TODO confirm).
 with open('prompts/base_prompt.txt', 'r', encoding='utf-8') as file:
        base_prompt = file.read()

In [None]:
# Start the interactive question loop with the loaded prompt template; the
# model and API key fall back to main_func's defaults.
main_func(base_prompt=base_prompt)


Welcome to Groq Text to SQL
You can as questions about the data employee.csv and purchases.csv files
Enter your question or type 'exit' to quit: get all employees
{'sql': 'SELECT name, email FROM employees.csv AS employees'}
SQL Query: SELECT name,
       email
FROM employees.csv AS employees
|    |   index | name              | email                  |
|---:|--------:|:------------------|:-----------------------|
|  0 |       0 | Richard Hendricks | richard@piedpiper.com  |
|  1 |       1 | Erlich Bachman    | erlich@aviato.com      |
|  2 |       2 | Dinesh Chugtai    | dinesh@piedpiper.com   |
|  3 |       3 | Bertram Gilfoyle  | gilfoyle@piedpiper.com |
|  4 |       4 | Jared Dunn        | jared@piedpiper.com    |
|  5 |       5 | Monica Hall       | monica@raviga.com      |
|  6 |       6 | Gavin Belson      | gavin@hooli.com        |
The data shows a list of 7 employees, including Richard Hendricks, Erlich Bachman, and Dinesh Chugtai, all of whom have email addresses associated w