In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Model card: https://huggingface.co/gaussalgo/T5-LM-Large-text2sql-spider
model_path = 'gaussalgo/T5-LM-Large-text2sql-spider'
# Load the seq2seq model and its tokenizer from the same identifier so they stay in sync.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

## Utils

In [9]:
def text_2_sql(input_text, max_length=512):
    """Generate SQL for a prompt that already contains the question and the schema.

    Args:
        input_text: A single prompt string. The notebook builds it as
            "Question: <question> Schema: <schema>" before calling this function.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        The output of ``tokenizer.batch_decode`` with special tokens skipped.
        This is a list of decoded strings, not a bare string (the notebook
        outputs show a one-element list for a single prompt).
    """
    # Encode the prompt as PyTorch tensors
    model_inputs = tokenizer(input_text, return_tensors="pt")
    # Run generation on the encoded prompt
    generated_ids = model.generate(**model_inputs, max_length=max_length)
    # Turn the generated ids back into text
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

### Example 1

In [2]:
# Example question asked against the stadium/singer/concert schema below.
question = "What is the average, minimum, and maximum age for all French musicians?"
# Schema serialized as quoted table and column names with their types; tables are separated by [SEP].
schema = """
   "stadium" "Stadium_ID" int , "Location" text , "Name" text , "Capacity" int , "Highest" int , "Lowest" int , "Average" int , foreign_key:  primary key: "Stadium_ID" [SEP] "singer" "Singer_ID" int , "Name" text , "Country" text , "Song_Name" text , "Song_release_year" text , "Age" int , "Is_male" bool , foreign_key:  primary key: "Singer_ID" [SEP] "concert" "concert_ID" int , "concert_Name" text , "Theme" text , "Year" text , foreign_key: "Stadium_ID" text from "stadium" "Stadium_ID" , primary key: "concert_ID" [SEP] "singer_in_concert"  foreign_key: "concert_ID" int from "concert" "concert_ID" , "Singer_ID" text from "singer" "Singer_ID" , primary key: "concert_ID" "Singer_ID"
"""

# Assemble the prompt as "Question: <question> Schema: <schema>" and show it.
input_text = " ".join(["Question: ",question, "Schema:", schema])
print(input_text)


Question:  What is the average, minimum, and maximum age for all French musicians? Schema: 
   "stadium" "Stadium_ID" int , "Location" text , "Name" text , "Capacity" int , "Highest" int , "Lowest" int , "Average" int , foreign_key:  primary key: "Stadium_ID" [SEP] "singer" "Singer_ID" int , "Name" text , "Country" text , "Song_Name" text , "Song_release_year" text , "Age" int , "Is_male" bool , foreign_key:  primary key: "Singer_ID" [SEP] "concert" "concert_ID" int , "concert_Name" text , "Theme" text , "Year" text , foreign_key: "Stadium_ID" text from "stadium" "Stadium_ID" , primary key: "concert_ID" [SEP] "singer_in_concert"  foreign_key: "concert_ID" int from "concert" "concert_ID" , "Singer_ID" text from "singer" "Singer_ID" , primary key: "concert_ID" "Singer_ID"



In [4]:
# encode the input text
# Tokenize the prompt and return PyTorch tensors (return_tensors="pt")
model_inputs = tokenizer(input_text, return_tensors="pt")
#print(model_inputs)

In [5]:
#Invoke the model
# Generate output token ids for the encoded prompt (max_length=512)
outputs = model.generate(**model_inputs, max_length=512)
#print(outputs) # Latency: 6.1s

In [6]:
# Decode the model outputs
# Decode the generated ids back into text, dropping special tokens; the result is a list of strings
output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print("SQL Query:")
print(output_text)

SQL Query:
["SELECT avg(age), min(age), max(age) FROM singer WHERE country = 'France'"]


Result: OK. Latency: 6.1 s

### Example 2

In [8]:
# Example question asked against a single-table schema given as a CREATE TABLE statement.
question = "What is the avg_lead_time_days  for the item with the id 1009AA ?"
# NOTE(review): Example 1 passes a quoted, [SEP]-separated schema serialization, while this is raw DDL.
# Confirm which format the model expects.
schema = """
   
CREATE TABLE "Stock" (
	id TEXT, 
	current_stock_quantity FLOAT, 
	units TEXT, 
	avg_lead_time_days BIGINT, 
	maximum_lead_time_days BIGINT, 
	unit_price FLOAT
)
"""

# Assemble the prompt as "Question: <question> Schema: <schema>" and show it.
input_text = " ".join(["Question: ",question, "Schema:", schema])
print(input_text)

Question:  What is the avg_lead_time_days  for the item with the id 1009AA ? Schema: 
   
CREATE TABLE "Stock" (
	id TEXT, 
	current_stock_quantity FLOAT, 
	units TEXT, 
	avg_lead_time_days BIGINT, 
	maximum_lead_time_days BIGINT, 
	unit_price FLOAT
)



In [10]:
# Run the helper on the Example 2 prompt; the returned list is shown as the cell output
text_2_sql(input_text)

['SELECT avg_lead_time_days FROM stock WHERE id = 1009AA']

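Result: OK (the right column and filter are selected; note that the literal `1009AA` is generated without quotes although `id` is a TEXT column, so it must be quoted before the query can be executed). Latency: 3.3 s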

### Benchmark

In [38]:
import json
import glob

def load_benchmark(benchmark_path):
    """Load the benchmark questions and their reference SQL queries.

    The file is expected to contain one JSON object that maps each
    natural-language question (key) to its expected SQL query (value).

    Args:
        benchmark_path: Path to the UTF-8 encoded JSON benchmark file.

    Returns:
        A ``(questions, expected_sql)`` tuple of two lists, in file order.
        Both lists are empty when the content cannot be read or parsed, or is
        not a JSON object. A message naming the file is printed in that case.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    with open(benchmark_path, 'r', encoding='utf-8') as benchmark_file:
        try:
            benchmark = json.load(benchmark_file)
        except (OSError, ValueError) as error:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
            # Stay best-effort: report which file failed and return empty results.
            print(f"Could not load benchmark file {benchmark_path!r}: {error}")
            return [], []

    if not isinstance(benchmark, dict):
        # Valid JSON, but not the question -> SQL mapping this loader needs.
        print(
            f"Benchmark file {benchmark_path!r} must contain a JSON object, "
            f"got {type(benchmark).__name__}"
        )
        return [], []

    return list(benchmark.keys()), list(benchmark.values())

# Load the English benchmark: questions and their reference SQL, as two parallel lists
questions , expected_sql = load_benchmark("../tests/dataset_queries_en.json")

In [39]:
# Inspect the loaded benchmark questions
questions

['Get all records from the Stock table',
 'List the IDs of products in stock with their current quantity',
 'Get the average unit price of all products in stock',
 'Find products with more than 10,000 units in stock',
 'Get the total quantity of products in stock',
 'List orders placed in the year 2023',
 'Get the product with the highest maximum lead time',
 'List products whose unit price is above the average',
 'Get the total number of orders per product',
 'List products in stock that have been ordered at least once']

In [40]:
# Inspect the reference SQL queries (same order as the questions)
expected_sql

['SELECT * FROM Stock;',
 'SELECT id, current_stock_quantity FROM Stock;',
 'SELECT AVG(unit_price) AS avg_unit_price FROM Stock;',
 'SELECT id, current_stock_quantity FROM Stock WHERE current_stock_quantity > 10000;',
 'SELECT SUM(current_stock_quantity) AS total_stock FROM Stock;',
 "SELECT * FROM Orders WHERE strftime('%Y', date) = '2023';",
 'SELECT id, maximum_lead_time_days FROM Stock ORDER BY maximum_lead_time_days DESC LIMIT 1;',
 'SELECT id, unit_price FROM Stock WHERE unit_price > (SELECT AVG(unit_price) FROM Stock);',
 'SELECT id, COUNT(*) AS total_orders FROM Orders GROUP BY id;',
 'SELECT DISTINCT s.id, s.current_stock_quantity FROM Stock s JOIN Orders o ON s.id = o.id;']

* Test cases

In [44]:
import pandas as pd
import time

def eval_benchmark(schema: str, benchmark_path: str) -> pd.DataFrame:
    """Run the text-to-SQL model on every benchmark question and time each call.

    Args:
        schema: Schema text appended to every question when building the prompt.
        benchmark_path: Path passed to ``load_benchmark``.

    Returns:
        A DataFrame with one row per question and these columns:
        ``question``, ``expected_sql``, ``predicted_sql`` (whatever
        ``text_2_sql`` returned, or the string "ERROR" if the call failed) and
        ``latency`` (seconds spent in ``text_2_sql``, or -1 if the call failed).
    """
    # Load the benchmark dataset
    questions, expected_sql = load_benchmark(benchmark_path)
    df = pd.DataFrame({"question": questions, "expected_sql": expected_sql})

    answers = []
    latencies = []
    for question in questions:
        try:
            # Build the prompt from the question and the schema
            input_text = " ".join(["Question: ", question, "Schema:", schema])
            # perf_counter is monotonic, so the interval is not affected by
            # system clock adjustments the way time.time() is
            start_time = time.perf_counter()
            predicted_sql = text_2_sql(input_text)
            latency = time.perf_counter() - start_time
        except Exception as error:
            # Best-effort: keep evaluating the remaining questions, but report
            # why this one failed instead of discarding the exception
            predicted_sql = "ERROR"
            latency = -1
            print(f"Error processing question: {question} ({type(error).__name__}: {error})")
        answers.append(predicted_sql)
        latencies.append(latency)

    # Store the results in the dataframe
    df["predicted_sql"] = answers
    df["latency"] = latencies
    return df

### Evaluation



In [45]:
# Schema used for the benchmark run: CREATE TABLE statements for the Stock and Orders tables.
# NOTE(review): Example 1 passes a quoted, [SEP]-separated schema serialization, while this is raw DDL.
# The benchmark predictions reference columns that do not exist here (e.g. product_id).
# Confirm which schema format the model expects.
schema = """
   
CREATE TABLE "Stock" (
	id TEXT, 
	current_stock_quantity FLOAT, 
	units TEXT, 
	avg_lead_time_days BIGINT, 
	maximum_lead_time_days BIGINT, 
	unit_price FLOAT
)

CREATE TABLE "Orders" (
	date DATETIME, 
	id TEXT, 
	quantity FLOAT
)
"""

In [46]:
# Run the model over the whole English benchmark with the Stock/Orders schema
df_eval = eval_benchmark(schema, "../tests/dataset_queries_en.json")

In [48]:
# Show the first 10 result rows: question, expected vs. predicted SQL, latency
df_eval.head(10)

Unnamed: 0,question,expected_sql,predicted_sql,latency
0,Get all records from the Stock table,SELECT * FROM Stock;,[SELECT * FROM Stock],1.266961
1,List the IDs of products in stock with their c...,"SELECT id, current_stock_quantity FROM Stock;",[SELECT id FROM stock AND current_stock_quanti...,2.375918
2,Get the average unit price of all products in ...,SELECT AVG(unit_price) AS avg_unit_price FROM ...,[SELECT avg(unit_price) FROM stock],2.060799
3,"Find products with more than 10,000 units in s...","SELECT id, current_stock_quantity FROM Stock W...","[SELECT product FROM stock WHERE units > 10,000]",1.884214
4,Get the total quantity of products in stock,SELECT SUM(current_stock_quantity) AS total_st...,[SELECT sum(units) FROM stock],1.629552
5,List orders placed in the year 2023,"SELECT * FROM Orders WHERE strftime('%Y', date...","[SELECT order_id FROM orders WHERE date LIKE ""...",2.491136
6,Get the product with the highest maximum lead ...,"SELECT id, maximum_lead_time_days FROM Stock O...",[SELECT product FROM stock ORDER BY max_lead_t...,3.059936
7,List products whose unit price is above the av...,"SELECT id, unit_price FROM Stock WHERE unit_pr...",[SELECT product FROM stock WHERE unit_price > ...,3.819206
8,Get the total number of orders per product,"SELECT id, COUNT(*) AS total_orders FROM Order...","[SELECT sum(t1.quantity), t2.product_id FROM o...",6.866213
9,List products in stock that have been ordered ...,"SELECT DISTINCT s.id, s.current_stock_quantity...",[SELECT DISTINCT T2.product_id FROM orders AS ...,7.005002


In [49]:
# Save the results as CSV without the index column
df_eval.to_csv("../tests/df_benchmark_results.csv", index=False)

In [65]:
import plotly.graph_objects as go

# Bar chart: one bar per benchmark question, bar height is the measured latency
fig = go.Figure(
    data=[
        go.Bar(
            x=df_eval.question,
            y=df_eval.latency,
            marker_color="#009999",
            marker_line_color='#3DD2CF',
            marker_line_width=2,
        )
    ]
)

# Title, axis labels and canvas styling applied in one layout update
fig.update_layout(
    title=dict(
        text=f'Our Text2SQL Benchmark dataset to evaluate the performance of {model_path}',
        font_size=12,
        font_weight='bold',
        x=0.5,
    ),
    xaxis_title='Question',
    yaxis_title='Latency (s)',
    yaxis=dict(gridcolor='lightgrey'),
    plot_bgcolor='rgba(0,0,0,0)',
    height=600,
    width=1000,
)

# Render the figure
fig.show()

TODO: Evaluate whether each generated SQL query is correct (see the Excel sheet and work out a way to automate this check).