# Inference with OpenAI and RAG

In [None]:
!pip install openai

Collecting openai
  Downloading openai-1.36.1-py3-none-any.whl (328 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.8/328.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00

In [None]:
import json

# Evaluation items. The generation code below reads each item's 'question' and 'db_id'
# keys, plus 'similar_q' in the few-shot section.
similar_test_path = "/content/drive/MyDrive/spider/test_similar_questions/sampled_similar_questions_v2.json"
# Explicit UTF-8 keeps decoding independent of the platform's default locale encoding
# and matches the encoding used when results are written.
with open(similar_test_path, 'r', encoding='utf-8') as f:
    similar_data = json.load(f)

In [None]:
# Table descriptions keyed by database id; the generation code looks up db_data[db_id]
# and joins the resulting strings with newlines to build the prompt.
db_test_path = "/content/drive/MyDrive/spider/db_tables_str.json"
# Explicit UTF-8 keeps decoding independent of the platform's default locale encoding.
with open(db_test_path, 'r', encoding='utf-8') as f:
    db_data = json.load(f)

In [None]:
## Prompt template with similar samples and table info.
## It has four positional `{}` placeholders, filled in this order by str.format:
##   1. formatted similar question/SQL examples
##   2. the question to answer
##   3. the database id
##   4. the database tables, one per line
## NOTE(review): "recieve" below is misspelled; it is part of the text sent to the
## model, so it is left unchanged here to keep results comparable.
alpaca_prompt = """
### Instruction:
You are a helpful assistant that produces SQL Queries from the given question.
You should output a SQL query that is syntactically correct and can be executed without error.
In the input, you will recieve the question and similar text-to-sql pairs in addition to database tables for question.
Please output in the following format so it would be easy to extract SQL query. You should put sql code between
```sql SQL ```:
```sql
SELECT count(*) FROM head WHERE age  >  56;
```


### Input:
{}

Question: {}
Database: {}
Database tables:
{}

SQL:
"""

In [None]:
## Prompt template with just table info (no similar examples).
## It has three positional `{}` placeholders, filled in this order by str.format:
##   1. the question to answer
##   2. the database id
##   3. the database tables, one per line
## NOTE(review): "recieve" below is misspelled; it is part of the text sent to the
## model, so it is left unchanged here to keep results comparable.
alpaca_prompt_v2 = """
### Instruction:
You are a helpful assistant that produces SQL Queries from the given question.
You should output a SQL query that is syntactically correct and can be executed without error.
In the input, you will recieve the question and  database tables for question.
Please output in the following format so it would be easy to extract SQL query. You should put sql code between
```sql SQL ```:
```sql
SELECT count(*) FROM head WHERE age  >  56;
```


### Input:

Question: {}
Database: {}
Database tables:
{}

SQL:
"""

In [None]:
import os
from openai import OpenAI

# The API key is read from the OPENAI_API_KEY environment variable rather than being
# written into the notebook, so the secret never ends up in a saved or shared copy.
# Set the variable before running this cell.
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [None]:
import openai
import re
import time

def extract_sql_statements(text):
    """Return the body of every ```sql ... ``` fenced block found in `text`.

    Blocks are returned in order of appearance with surrounding whitespace
    stripped. An empty list is returned when `text` contains no such block.
    """
    fenced_sql = re.compile(r'```sql(.*?)```', re.DOTALL)
    return [found.group(1).strip() for found in fenced_sql.finditer(text)]

# Function to generate SQL query using OpenAI's API
def generate_sql_query(similar_data, db_data, alpaca_prompt, model="gpt-3.5-turbo", request_delay=2):
    """Ask the chat model for one SQL query per item, using a table-only prompt.

    Args:
        similar_data: iterable of dicts; each must provide 'question' and 'db_id'.
        db_data: mapping from database id to an iterable of table description strings.
        alpaca_prompt: template with three positional `{}` placeholders, filled with
            the question, the database id and the newline-joined tables.
        model: chat model name sent with each request.
        request_delay: seconds to sleep before each request.

    Returns:
        (predicted_queries, full_results): the first statement of the extracted SQL
        (always terminated with ';') and the raw model reply, one entry per item and
        in input order.

    Raises:
        KeyError: if an item lacks a required key or its db_id is not in db_data.
        Errors raised by the API client are not caught.
    """
    predicted_queries = []
    full_results = []

    for item in similar_data:
        time.sleep(request_delay)
        question = item['question']
        db_id = item['db_id']
        tabs = db_data[db_id]
        prompt = alpaca_prompt.format(question, db_id, "\n".join(tabs))
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that produces SQL Queries from the given question."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=2000,
            temperature=0
        )
        # The reply content can be missing; treat that as an empty reply.
        output_text = response.choices[0].message.content or ""
        statements = extract_sql_statements(output_text)
        # If the model ignored the ```sql fence, fall back to the raw reply instead of
        # raising IndexError: one bad reply must not discard every earlier result, and
        # each input item must still yield exactly one prediction.
        sql_query = statements[-1] if statements else output_text.strip()
        # Keep only the first statement and make sure it is terminated.
        sql = sql_query.split(";")[0] + ";"
        predicted_queries.append(sql)
        print(sql)
        full_results.append(output_text)

    return predicted_queries, full_results

In [None]:
# Table-only run: alpaca_prompt_v2 has no slot for similar examples.
# One API request is made per item in similar_data, and each extracted query is printed.
predicted_queries, full_results = generate_sql_query(similar_data, db_data, alpaca_prompt_v2)

SELECT s.name
FROM sailors s
JOIN reserves r ON s.sid = r.sid
WHERE r.bid = 103;
SELECT s.name
FROM sailors s
JOIN reserves r ON s.sid = r.sid
JOIN boats b ON r.bid = b.bid
WHERE b.name = 'Melon';
SELECT AVG(Units_sold_Millions) AS avg_units_sold
FROM game
JOIN game_player ON game.Game_ID = game_player.Game_ID
JOIN player ON game_player.Player_ID = player.Player_ID
WHERE player.Position = 'Guard';
SELECT Collection_Subset_Name
FROM collections
WHERE Parent_Collection_ID = (SELECT Collection_ID FROM collections WHERE Collection_Name = 'Best');
SELECT Analytical_Layer_Type_Code, COUNT(*) AS appearance_count
FROM analytical_layer
GROUP BY Analytical_Layer_Type_Code
ORDER BY appearance_count DESC
LIMIT 1;
SELECT Nationality, COUNT(Customer_ID) AS Number_of_Customers
FROM customer
GROUP BY Nationality;
SELECT c.customer_id, c.customer_name, c.customer_phone, c.customer_email
FROM customers c
JOIN customer_orders co ON c.customer_id = co.customer_id
GROUP BY c.customer_id, c.customer_name, c

In [None]:
# Display the extracted predictions to inspect them before post-processing.
predicted_queries

['SELECT s.name\nFROM sailors s\nJOIN reserves r ON s.sid = r.sid\nWHERE r.bid = 103;',
 "SELECT s.name\nFROM sailors s\nJOIN reserves r ON s.sid = r.sid\nJOIN boats b ON r.bid = b.bid\nWHERE b.name = 'Melon';",
 "SELECT AVG(Units_sold_Millions) AS avg_units_sold\nFROM game\nJOIN game_player ON game.Game_ID = game_player.Game_ID\nJOIN player ON game_player.Player_ID = player.Player_ID\nWHERE player.Position = 'Guard';",
 "SELECT Collection_Subset_Name\nFROM collections\nWHERE Parent_Collection_ID = (SELECT Collection_ID FROM collections WHERE Collection_Name = 'Best');",
 'SELECT Analytical_Layer_Type_Code, COUNT(*) AS appearance_count\nFROM analytical_layer\nGROUP BY Analytical_Layer_Type_Code\nORDER BY appearance_count DESC\nLIMIT 1;',
 'SELECT Nationality, COUNT(Customer_ID) AS Number_of_Customers\nFROM customer\nGROUP BY Nationality;',
 'SELECT c.customer_id, c.customer_name, c.customer_phone, c.customer_email\nFROM customers c\nJOIN customer_orders co ON c.customer_id = co.custome

In [None]:
# Put every predicted query on a single line and make sure it ends with ';', because
# the results file stores one query per line.
revised_filtered_queries = []
for pred in predicted_queries:
    pred = pred.replace("\n", " ")
    # endswith() is also safe for an empty prediction, where pred[-1] would raise IndexError.
    if not pred.endswith(';'):
        pred += ';'
    revised_filtered_queries.append(pred)

revised_filtered_queries

['SELECT s.name FROM sailors s JOIN reserves r ON s.sid = r.sid WHERE r.bid = 103;',
 "SELECT s.name FROM sailors s JOIN reserves r ON s.sid = r.sid JOIN boats b ON r.bid = b.bid WHERE b.name = 'Melon';",
 "SELECT AVG(Units_sold_Millions) AS avg_units_sold FROM game JOIN game_player ON game.Game_ID = game_player.Game_ID JOIN player ON game_player.Player_ID = player.Player_ID WHERE player.Position = 'Guard';",
 "SELECT Collection_Subset_Name FROM collections WHERE Parent_Collection_ID = (SELECT Collection_ID FROM collections WHERE Collection_Name = 'Best');",
 'SELECT Analytical_Layer_Type_Code, COUNT(*) AS appearance_count FROM analytical_layer GROUP BY Analytical_Layer_Type_Code ORDER BY appearance_count DESC LIMIT 1;',
 'SELECT Nationality, COUNT(Customer_ID) AS Number_of_Customers FROM customer GROUP BY Nationality;',
 'SELECT c.customer_id, c.customer_name, c.customer_phone, c.customer_email FROM customers c JOIN customer_orders co ON c.customer_id = co.customer_id GROUP BY c.custo

In [None]:
def substitute_consecutive_spaces(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) in `text` into one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

In [None]:
def substitute_percent(text):
    """Return `text` with every '%' character removed."""
    percent_sign = re.compile(r'%')
    return percent_sign.sub('', text)

In [None]:
# Normalise whitespace in every query (runs of whitespace become a single space).
# NOTE(review): this cell previously also called substitute_percent() but discarded its
# return value, so '%' was never actually removed from the saved queries. That behaviour
# is kept: stripping '%' would change LIKE patterns. Confirm whether removal was intended.
refined = [substitute_consecutive_spaces(item) for item in revised_filtered_queries]

In [None]:
# Sanity check: prints "yes" once for each refined query whose last character is not ';'.
# No output means every query is terminated.
for query in refined:
    last_char = query[-1]
    if last_char != ";":
        print("yes")

In [None]:
# Save the table-only predictions, one query per line (no trailing newline).
results_path = '/content/drive/MyDrive/spider/results/sampled_tables_wo_samples_gpt3.5.txt'
with open(results_path, mode='wt', encoding='utf-8') as out_file:
    out_file.write('\n'.join(refined))

# Both samples and tables

In [None]:
import openai
import re
import time

def extract_sql_statements(text):
    """Return the body of every ```sql ... ``` fenced block found in `text`.

    Blocks are returned in order of appearance with surrounding whitespace
    stripped. An empty list is returned when `text` contains no such block.
    """
    fenced_sql = re.compile(r'```sql(.*?)```', re.DOTALL)
    return [found.group(1).strip() for found in fenced_sql.finditer(text)]

def questions_to_str(q_list):
    """Render similar question/SQL examples as one text block for the prompt.

    Each entry of `q_list` must provide 'original_question', 'db_id' and
    'sql_query'. The result starts with a single space followed by one
    formatted section per entry, in input order.
    """
    # Same text as the indented triple-quoted template, spelled with explicit escapes.
    example_template = "\n        Question: {}\n        Database: {}\n        SQL:{}\n\n    "
    rendered = [
        example_template.format(entry['original_question'], entry['db_id'], entry['sql_query'])
        for entry in q_list
    ]
    return " " + "".join(rendered)

# Function to generate SQL query using OpenAI's API
def generate_sql_query(similar_data, db_data, alpaca_prompt, model="gpt-3.5-turbo", request_delay=2):
    """Ask the chat model for one SQL query per item, using a few-shot prompt.

    Args:
        similar_data: iterable of dicts; each must provide 'question', 'db_id' and
            'similar_q' (the examples passed to questions_to_str).
        db_data: mapping from database id to an iterable of table description strings.
        alpaca_prompt: template with four positional `{}` placeholders, filled with
            the formatted similar examples, the question, the database id and the
            newline-joined tables. A template with fewer placeholders is filled
            without error but with the values shifted into the wrong slots.
        model: chat model name sent with each request.
        request_delay: seconds to sleep before each request.

    Returns:
        (predicted_queries, full_results): the first statement of the extracted SQL
        (always terminated with ';') and the raw model reply, one entry per item and
        in input order.

    Raises:
        KeyError: if an item lacks a required key or its db_id is not in db_data.
        Errors raised by the API client are not caught.
    """
    predicted_queries = []
    full_results = []

    for item in similar_data:
        time.sleep(request_delay)
        question = item['question']
        similar_questions = item['similar_q']
        db_id = item['db_id']
        tabs = db_data[db_id]
        similar_formatted = questions_to_str(similar_questions)
        prompt = alpaca_prompt.format(similar_formatted, question, db_id, "\n".join(tabs))
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that produces SQL Queries from the given question."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=2000,
            temperature=0
        )
        # The reply content can be missing; treat that as an empty reply.
        output_text = response.choices[0].message.content or ""
        statements = extract_sql_statements(output_text)
        # If the model ignored the ```sql fence, fall back to the raw reply instead of
        # raising IndexError: one bad reply must not discard every earlier result, and
        # each input item must still yield exactly one prediction.
        sql_query = statements[-1] if statements else output_text.strip()
        # Keep only the first statement and make sure it is terminated.
        sql = sql_query.split(";")[0] + ";"
        predicted_queries.append(sql)
        print(sql)
        full_results.append(output_text)

    return predicted_queries, full_results

In [None]:
# Few-shot run: use alpaca_prompt, the four-placeholder template with a slot for the
# similar examples. Passing the three-placeholder alpaca_prompt_v2 here would not raise
# (str.format ignores surplus positional arguments) but would put the examples in the
# "Question" slot, the question in "Database" and drop the table schema entirely.
predicted_queries, full_results = generate_sql_query(similar_data, db_data, alpaca_prompt)

In [None]:
# Put every predicted query on a single line and make sure it ends with ';', because
# the results file stores one query per line.
revised_filtered_queries = []
for pred in predicted_queries:
    pred = pred.replace("\n", " ")
    # endswith() is also safe for an empty prediction, where pred[-1] would raise IndexError.
    if not pred.endswith(';'):
        pred += ';'
    revised_filtered_queries.append(pred)

revised_filtered_queries

In [None]:
# Normalise whitespace in every query (runs of whitespace become a single space).
# NOTE(review): this cell previously also called substitute_percent() but discarded its
# return value, so '%' was never actually removed from the saved queries. That behaviour
# is kept: stripping '%' would change LIKE patterns. Confirm whether removal was intended.
refined = [substitute_consecutive_spaces(item) for item in revised_filtered_queries]

In [None]:
# Save the few-shot (samples + tables) predictions, one query per line (no trailing newline).
results_path = '/content/drive/MyDrive/spider/results/sampled_tables_samples_gpt3.5.txt'
with open(results_path, mode='wt', encoding='utf-8') as out_file:
    out_file.write('\n'.join(refined))