# Text to SQL Similarity

The purpose of this task is to asses your independent work and familiarity with data analysis, Machine Learning and Natural Language Processing techniques. Feel free to use any libraries you want, any models necessary and to define any additional functions you might need. 

You are given a dataset with questions and their corresponding SQL queries.

Tasks:
1. Define 2 different similarity metrics between a question and a SQL query (your choice - be creative). The metrics should use different techniques and be based on different models.
2. Analyze the similarity metrics you defined above.
    
    * Define plots to visualize the performance of the similarity metrics (your choice - be creative)
    * Define a threshold (and how to choose it) for the similarity metrics to determine if a question is similar to a SQL query.
    * Calculate the precision, recall, and F1 score per similarity metric (for the threshold you defined) or other metrics of your choice that will help you compare between the metrics.
3. Summarize your findings and conclusions.




In [1]:
import pandas as pd
import numpy as np
import os

from tabulate import tabulate
from dotenv import load_dotenv
from datasets import Dataset
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity
from ragas import evaluate

load_dotenv()
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('data.csv')
 
 

In [None]:

print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print(df["inquiry_id"].unique().size)
print(df["question"].unique().size)
print(df["sql"].unique().size)

df= df.iloc[:2]

Number of rows: 75
Number of columns: 3
75
75
74


In [4]:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else ("" if pd.isnull(x) else x))
print (df.isnull().sum())

inquiry_id    0
question      0
sql           0
dtype: int64


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else ("" if pd.isnull(x) else x))


In [5]:
id_map = {}
counter = 1

# Step 2: Generate IDs for each row based on inquiry_id
def get_or_assign_id(inquiry):
    global counter
    if inquiry not in id_map:
        id_map[inquiry] = counter
        counter += 1
    return id_map[inquiry]

# Add the new 'id' column
df["id"] = df["inquiry_id"].apply(get_or_assign_id)

column_order = ["id", "inquiry_id", "question", "sql"]
df = df[column_order]
print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False))

+----+------------------+-------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| id |    inquiry_id    |                    question                     |                                                                                                                                          sql                                                                                                                                          |
+----+------------------+-------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
df_send_LLM=df.copy

In [7]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

df["retrieved_contexts"] = df["sql"].apply(lambda x: [x])  
df["reference"] = df["question"]
df["response"] = " "
 
dataset = Dataset.from_pandas(df)

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4-turbo"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
 
metrics = [
    LLMContextRecall(llm=evaluator_llm),
    FactualCorrectness(llm=evaluator_llm),
    Faithfulness(llm=evaluator_llm),
    SemanticSimilarity(embeddings=evaluator_embeddings),
    context_precision ,
    answer_relevancy,
    context_recall  
]


results = evaluate(dataset=dataset, metrics=metrics)
df_eval = results.to_pandas()
 


Evaluating:  29%|██▊       | 2/7 [00:02<00:05,  1.14s/it]No statements were generated from the answer.
Evaluating:  71%|███████▏  | 5/7 [00:05<00:02,  1.01s/it]Exception raised in Job[1]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating: 100%|██████████| 7/7 [00:09<00:00,  1.32s/it]


In [8]:
df_eval

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_recall,factual_correctness,faithfulness,semantic_similarity,context_precision,answer_relevancy
0,In what countries are most of our jobs offered?,"[SELECT t.country_name AS Country, COUNT(*) AS...",,In what countries are most of our jobs offered?,0.0,,,0.739802,0.0,0.0


In [9]:
print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False))

+----+------------------+-------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+----------+
| id |    inquiry_id    |                    question                     |                                                                                                                                          sql                                                                                   

In [17]:
import pandas as pd
import openai
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
from openai import OpenAI

openai.api_key =  OPENAI_API_KEY
client = OpenAI()
 

# Function to predict the question based on SQL query
def get_predicted_question(sql_query):
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant who is really good at generating questions for SQL queries."},
            {"role": "user", "content": f"Generate the question that the following SQL query is answering:\n\n{sql_query}"}
        ]
    )
    return completion.choices[0].message.content

def get_similarity_score(original_question, predicted_question):
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant who is really good at sentence similarity matching.And you give the output always as a float number only. you never provide a String value or the descrition about your response"},
            {"role": "user", "content": f"Rate the similarity between these two questions on a scale from 0 to 1 (1 being identical). Make sure your only allowed response is only a float value with maximum 2 demial points. finla response only need to be that:\n\n1. {original_question}\n2. {predicted_question}"}
        ]
    )
    return  completion.choices[0].message.content.strip()


def get_direct_query_match_score(original_question, sql_query):
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant who is really good at   matching the relavancy of the sql query and the it's original question . you always give response as a interger only.every time your answer is a number . there is no explnation   "},
            {"role": "user", "content": f"Rate the realavancy between these   questions  and the sql statement  on a scale from 0 to 1 (1 being identical). Make sure your only allowed response is only a float value with maximum 2 demial points. finla response only need to be that:\n\n1. {original_question}\n2. {predicted_question}"}
        ]
    )
    return  completion.choices[0].message.content.strip()

predicted_questions = []
similarity_scores = []
sql_matching_scores = []

 
for index, row in df.iterrows():
    sql_query = row["sql"]
    original_question = row["question"]
    
    # Get predicted question
    predicted_question = get_predicted_question(sql_query)
    predicted_questions.append(predicted_question)
    sql_matching_scores.append(get_direct_query_match_score(original_question ,original_question))
    
    # Calculate similarity score
    similarity_score = get_similarity_score(original_question, predicted_question)
    similarity_scores.append(similarity_score)

# Add results to DataFrame
df["predicted_question"] = predicted_questions
df["similarity_score"] = similarity_scores
df["sql_matching_scores"] = similarity_scores

# Print the updated DataFrame
print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False))


+----+------------------+-------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+---------------------+
| id |    inquiry_id    |                    question                    

# Metric 1:

# Threshold Selection Metric 1:


# Metric 2:

# Threshold Selection Metric 2:

# Metrics Analysis:

# Conclusion and Findings: