In [112]:
%pip install urllib3
import os
import pandas as pd
import time
import asyncio

# Load in AAM
from askametric.query_processor.query_processor import LLMQueryProcessor
from askametric.utils import _ask_llm_json

# Load in Vanna
from vanna.openai import OpenAI_Chat
from vanna.vannadb import VannaDB_VectorStore

from litellm import cost_per_token, token_counter, APIConnectionError
from dotenv import load_dotenv

from sqlalchemy import text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.asyncio import (
    AsyncSession,
    create_async_engine,
)

# Load Environment Variables
load_dotenv(".env")

Note: you may need to restart the kernel to use updated packages.


True

Parameters used for comparisons

In [113]:
# Parameters
which_db = "tn_covid_cases_11_may"
llm = "gpt-4o"
guardrails_llm = "gpt-4o"
sys_message = "Government and health officials in Tamil Nadu, India will ask you questions. You need to help them manage COVID cases and the availablity of beds in health facilities."
# One bullet per table, each on its own line. The previous backslash
# continuations glued the bullets together ("...clinics.- bed_vacancies_...")
# because a backslash-newline inside a string literal inserts nothing.
db_description = (
    "- bed_vacancies_clinics_11_may: Each row identifies a district and a JSON object including the beds earmarked, occupied and available for COVID cases in the district clinics.\n"
    "- bed_vacancies_health_centers_and_district_hospitals_11_may: Each row identifies a district and the beds earmarked, occupied and available, with and without oxygen supply, and with and without ICU support, for COVID cases in the disctrict health centers and hospitals.\n"
    "- covid_cases_11_may: Each row identifies a district and the number of people who received treatment, were discharged and died due to COVID."
)
num_common_values = 10
db_path = "demo_databases/tn_covid_cases_11_may.sqlite"

Prompts used to instruct the LLM how to evaluate AAM and Vanna's performance

In [114]:
# Prompts to validate AAM and Vanna responses to expected output

def get_system_validation_prompts() -> tuple[str, str]:
    """
    Build the system prompts used by the grading LLM.

    The text prompt grades a model's natural-language answer against the
    expected answer and the expected language/script. The SQL prompt grades a
    generated SQL query against the expected query.

    Returns: a tuple of the system message for text validation and the system
        message for SQL validation
    """
    # Criterion (c) grades the language of the "Answer": the message format
    # defines "Correct Language" as the language the "Answer" should be in.
    system_text_message = """
    You are a grading bot. You will get messages in the following format -

    ----Message Begins----------------
    Question: ```<Some Question>```
    Answer: ```<Answer to be graded>```
    Correct Answer: ```<Correct Answer to Question>```
    Correct Language: ```<Correct Language the "Answer" should be in with the language script if relevant>```
    ----Message Ends----------------

    Give grades based on the following three points:
    (a) Is the "Answer" similar in meaning to "Correct Answer"?
    If yes, give a grade of 1, otherwise 0. Remember,
    the "Answer" and "Correct Answer" ONLY NEED TO BE SIMILAR in general meaning.
    (b) Does the "Answer" address the key elements of the "Question"?
    If yes, give a grade of 1, otherwise 0.
    (c) What language is the "Answer" written in? What language script is it in e.g. (Latin, Devanagari, etc.)?
    Does the language and script of the "Answer" match what is listed in "Correct Language"?
    If yes, give a grade of 1, otherwise 0.

    Overall score is 1 if all three (a), (b), and (c) are 1, otherwise 0.

    Reply in the following json format -
    {"overall": your Overall score, "reason": reason for grading}

    Overall scores are a 0 or 1. Nothing else.
    """

    system_sql_message = """
    You are a high-functioning SQL-code grading bot.
    You will get messages in the following format -

    ----Message Begins----------------
    Question: ```<Some Question>```
    Table Descriptions: ```<Description of the tables in the database>```
    Schema: ```<Schema of the database>```
    Answer: ```<Answer to be graded>```
    Correct Answer: ```<Correct Answer to Question>```
    ----Message Ends----------------

    Give grades based on the following three points:
    (a) Will the "Answer" result in the same output as "Correct Answer"?
    If yes, give a grade of 1, otherwise 0. Remember,
    the "Answer" and "Correct Answer" ONLY NEED TO BE SIMILAR in the result they generate.
    (b) Does the "Answer" address the key elements of the "Question"?
    If yes, give a grade of 1, otherwise 0.
    (c) Are both "Correct Answer" and "Answer" empty (neither contain SQL code)?
    If yes, give a grade of 1, otherwise 0. If the "Correct Answer" contains SQL code but "Answer" is empty, give a grade of 0.
    If "Correct Answer" is empty but "Answer" contains SQL code, give a grade of 0.

    Overall score is 1 if exactly one of the two conditions are met:
    1. (a) and (b) are both 1
    2. (c) is 1
    Otherwise overall score is 0

    Reply in the following json format -
    {"overall": your Overall score, "reason": reason for grading}.

    Overall scores are a 0 or 1. Nothing else.
    """

    return system_text_message, system_sql_message


def create_sql_message_prompt(
    query: str,
    db_description: str,
    db_schema: str,
    sql_response: str,
    correct_sql_response: str,
) -> str:
    """
    Assemble the user message sent to the SQL-grading LLM.

    Args:
        query: the validation question
        db_description: description of the database and its tables
        db_schema: the database schema
        sql_response: the SQL produced by the model being graded
        correct_sql_response: the correct and expected SQL response

    Returns: the formatted message, one triple-backtick-fenced field per line
        between the "Message Begins" / "Message Ends" markers
    """
    # (label, value) pairs in the order the grader's system prompt lists them
    fenced_fields = (
        ("Question", query),
        ("Table Descriptions", db_description),
        ("Schema", db_schema),
        ("Answer", sql_response),
        ("Correct Answer", correct_sql_response),
    )

    # Leading "" and trailing indent-only entry reproduce the blank first line
    # and the indented last line of the original triple-quoted template.
    message_lines = ["", "    ----Message Begins----------------"]
    message_lines.extend(
        f"    {label}: ```{value}```" for label, value in fenced_fields
    )
    message_lines.append("    ----Message Ends----------------")
    message_lines.append("    ")

    return "\n".join(message_lines)


def create_text_message_prompt(
    question: str,
    text_response: str,
    correct_answer: str,
    correct_language: str,
) -> str:
    """
    Create the user prompt for validating a model's text answer.

    The layout mirrors the message format announced in the text-grading system
    prompt: every field is wrapped in triple backticks and the expected
    language is labelled "Correct Language", the name the grading criteria
    refer to.

    Args:
        question: the validation question
        text_response: the text answer produced by the model being graded
        correct_answer: the correct text response
        correct_language: the correct and expected language and script

    Returns: the formatted message to send to the grading LLM
    """

    text_message_template = f"""
    ----Message Begins----------------
    Question: ```{question}```
    Answer: ```{text_response}```
    Correct Answer: ```{correct_answer}```
    Correct Language: ```{correct_language}```
    ----Message Ends----------------
    """

    return text_message_template


# Build both grader system prompts once at module level; the evaluation code
# below passes them as the "system_message" of every grading request.
SYSTEM_TEXT_MESSAGE, SYSTEM_SQL_MESSAGE = get_system_validation_prompts()


Set up Vanna model

In [115]:
class MyVanna(VannaDB_VectorStore, OpenAI_Chat):
    """Vanna client that initialises both its VannaDB_VectorStore and OpenAI_Chat bases."""

    def __init__(self, vanna_rag_model, config=None):
        # The Vanna API key is read from the environment (VANNA_AI_API_KEY)
        vanna_key = os.getenv("VANNA_AI_API_KEY")

        VannaDB_VectorStore.__init__(
            self,
            vanna_model=vanna_rag_model,
            vanna_api_key=vanna_key,
            config=config,
        )
        OpenAI_Chat.__init__(self, config=config)

class VannaModel:
    """ Class to initialize the MyVanna class and prepare for comparisons with AAM """
    def __init__(self, db_path, vanna_rag_model, sys_message, db_description, llm_model="gpt-4o", run_sql=False):
        self.db_path = db_path
        self.rag_model = vanna_rag_model
        self.sys_message = sys_message
        self.db_description = db_description
        self.llm_model = llm_model
        self.sql_schema = run_sql

        self.vn = self.setup_vanna_ai()
        

    def setup_vanna_ai(self):
        """ Setup Vanna.ai """
        vn = MyVanna(
            self.rag_model,
            config={'api_key': os.getenv("OPENAI_API_KEY"), 'model': 'gpt-4o'}
        )

        vn.connect_to_sqlite(self.db_path)
        vn.train(documentation = self.sys_message + "\n\n" + self.db_description)

        if self.sql_schema:
            # Train vanna on SQL schema
            df_ddl = vn.run_sql("SELECT type, sql FROM sqlite_master WHERE sql is not null")
            for ddl in df_ddl['sql'].to_list():
                vn.train(ddl=ddl)

        return vn
    
    def get_vanna_reponse(
        self, question
    ) -> dict[str, str | float]:
        """
        Send request to the vanna.ai and extract responses for a single question

        Args:
            question: a single validation question

        Returns: a dictionary of vanna's response and associated costs
        """
        tic = time.time()  # record the start time

        # Frequent connection errors when running in a loop
        # Not sure if its due to poor internet or something else. The below code seemed to fix it
        count = 0
        while count < 7:
            try:
                vn_answer = self.vn.ask(question)
                break
            except ConnectionError:
                count += 1
                continue
        else:
            raise ConnectionError

        toc = time.time()  # record the end time

        response_time = toc - tic  # calculate the response time

        prompt_token_count = token_counter(
            text = self.vn.get_sql_prompt(
                initial_prompt="",
                question=question,
                question_sql_list=[],
                ddl_list=[],
                doc_list=[db_description]
            )
        )

        response_token_count = token_counter(text=str(vn_answer))

        cost = cost_per_token(
            model=self.llm_model,
            prompt_tokens=prompt_token_count,
            completion_tokens=response_token_count,
        )

        # Vanna returns None if SQL query couldn't be processed
        if not vn_answer:
            text_response = ""
            sql_response = ""
        else:
            text_response = vn_answer[1]
            sql_response = vn_answer[0]


        # Save only the sql query and text outputs along with the cost,
        response = {
            "cost": sum(cost),
            "resposne_time": response_time,
            "text_response": text_response,
            "sql_query": sql_response,
            
        }
        return response

    

Create a class that compares the AAM and Vanna responses against the expected output

In [116]:
class ModelComparisons:
    """
    Class that houses a series of functions to evaluate AAM and other models (Vanna.ai)

    Class can be generalized to be easier to adapt to other text-2-SQL platforms
    """

    def __init__(
        self,
        db_name: str,
        db_path: str,
        sys_message: str,
        db_description: str,
        vanna_ai: VannaModel,
        llm_model: str = "gpt-4o"
    ) -> None:
        """Init"""

        self.db_name = db_name
        self.db_path = db_path
        self.sys_message = sys_message
        self.db_description = db_description
        self.vanna_ai = vanna_ai
        self.llm_model = llm_model

        self.get_asession()
        # asyncio.run(self.get_db_schema())

        # Load validation questions based on .env path
        self.eval_questions = pd.read_csv(
            os.environ.get(f"{self.db_name.upper()}_EVAL_QUESTIONS_PATH"), skip_blank_lines=True
        ).dropna(how="all")

    def get_asession(self):
        """ Get assession for db schema and aam """
        aengine = create_async_engine(
            url=f"sqlite+aiosqlite:///{self.db_path}"
        )

        async_session = sessionmaker(
            bind=aengine,
            class_=AsyncSession,
            expire_on_commit=False
        )

        self.aengine = aengine
        self.aam_assession = async_session

    async def get_db_schema(self):
        """ Get db schema from asession"""

        async with self.aengine.connect() as conn:
            a = await conn.execute(
                text("SELECT type, sql FROM sqlite_master WHERE sql is not null")
            )
        self.db_schema = str(a.fetchall())

    async def get_aam_reponse(
        self, question: str 
    ) -> dict[str, str | float]:
        """
        Send single query to the LLM

        Args:
            question: a single validation question

        Returns: a dictionary of aam's response and associated costs
        """
        tic = time.time()  # record the start time

        async with self.aam_assession() as session:
            qp = LLMQueryProcessor(
                {"query_text": question, "query_metadata": {}},
                session,
                which_db,
                self.llm_model,
                self.llm_model,
                self.sys_message,
                self.db_description,
                column_description="",
                num_common_values=num_common_values
            )
            await qp.process_query()

            toc = time.time()  # record the end time
            response_time = toc - tic  # calculate the response time

            # Save only the sql query and text outputs along with the cost
            response = {
                "cost": qp.cost,
                "resposne_time": response_time,
                "text_response": qp.final_answer,
                "sql_query": qp.sql_query,
                
            }
            return response
        

    async def _request_llm_evaluation(
        self, row: pd.DataFrame, model_output: dict, db_schema: str
    ) -> tuple:
        """
        Ask the LLM to evaluate a single question

        Args:
            row: the question and relevant information
            endpoint_response: the response from the LLM (get-metric endpoint)
            db_schema: database schema.
        """

        sql = {
            "system_message": SYSTEM_SQL_MESSAGE,
            "prompt": create_sql_message_prompt(
                query=row.question,
                db_description=db_description,
                db_schema=db_schema,
                sql_response=model_output["sql_query"],
                correct_sql_response=row.correct_sql_response,
            ),
        }

        text = {
            "system_message": SYSTEM_TEXT_MESSAGE,
            "prompt": create_text_message_prompt(
                question=row.question,
                text_response=model_output["text_response"],
                correct_answer=row.correct_answer,
                correct_language=row.language,
            ),
        }

        sql_llm_evaluation = await _ask_llm_json(**sql)
        text_llm_evaluation = await _ask_llm_json(**text)

        eval_output = model_output.copy()
        eval_output["text_overall_score"] = text_llm_evaluation["answer"]["overall"]
        eval_output["text_score_reasoning"] =  text_llm_evaluation["answer"]["reason"]
        eval_output["sql_overall_score"] = sql_llm_evaluation["answer"]["overall"]
        eval_output["sql_score_reasoning"] = sql_llm_evaluation["answer"]["reason"]

        # Need to work on prompt engineering, LLM incorrectly applies logic
        # Code below is a manual fix
        if eval_output["sql_query"] and not row.correct_sql_response:
            eval_output["sql_overall_score"] = 0
            eval_output["sql_score_reasoning"] = "SQL query should be blank, but instead contains some value"
        if not eval_output["sql_query"] and row.correct_sql_response:
            eval_output["sql_overall_score"] = 0
            eval_output["sql_score_reasoning"] = "SQL query is blank, but instead should contain valid SQL"


        return eval_output

    async def get_results(self) -> list[dict[str, str]]:
        """
        Get validation results for all validation questions
        """

        db_schema = await self.get_db_schema()

        async def run_queries(row, db_schema):
            """
            Creates coroutine to get the validation results for a single question

            Args:
                row: the values for a single validation question
                db_schema: a string of the SQL databases' schema
            """
            await asyncio.sleep(2)
            # Frequent connection errors when running in a loop
            # Not sure if its due to poor internet or something else. The below code seemed to fix it
            count = 0
            while count < 7:
                try:
                    aam_output = await self.get_aam_reponse(row.question)
                    vanna_output = self.vanna_ai.get_vanna_reponse(row.question)

                    aam_eval_results = await self._request_llm_evaluation(
                        row,
                        aam_output,
                        db_schema
                    )

                    vanna_eval_results = await self._request_llm_evaluation(
                        row,
                        vanna_output,
                        db_schema
                    )
                    break
                except ConnectionError:
                    count += 1
                    continue
                except APIConnectionError:
                    count += 1
                    continue
            else:
                raise ConnectionError
            

            aam_evaluation = {
                f"aam_{key}": val
                for key, val in
                aam_eval_results.items()
            }
            vanna_evaluation = {
                f"vanna_{key}": val
                for key, val in
                vanna_eval_results.items()
            }


            return {
                "input": row.question,
                "expected_text_output": row.correct_answer,
                "expected_language_and_script": row.language,
                "expected_sql_output": row.correct_sql_response,
                **aam_evaluation,
                **vanna_evaluation
            }

        tasks = [
            run_queries(row, db_schema)
            for row in self.eval_questions.itertuples(index=False)
        ]
        
        responses = await asyncio.gather(*tasks)

        return responses

    async def create_eval_table(self) -> None:
        """Create table of results"""
        results = await self.get_results()
        df = pd.DataFrame(results)
        return df


The cells below showcase the performance of AAM and Vanna.ai when Vanna.ai is NOT separately trained on the database schema

In [117]:
vn_wo_schema = VannaModel(db_path=db_path, vanna_rag_model="tn-covid-db", sys_message=sys_message, db_description=db_description, llm_model="gpt-4o", run_sql=False)
comparisons = ModelComparisons(db_name = "tn_covid", db_path = db_path, sys_message = sys_message, db_description = db_description, vanna_ai = vn_wo_schema, llm_model= "gpt-4o")
eval_table_wo_db_schema = await comparisons.create_eval_table()

Adding documentation....
SQL Prompt: [{'role': 'system', 'content': "You are a SQLite expert. Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. \n===Additional Context \n\nGovernment and health officials in Tamil Nadu, India will ask you questions. You need to help them manage COVID cases and the availablity of beds in health facilities.\n\n- bed_vacancies_clinics_11_may: Each row identifies a district and a JSON object including the beds earmarked, occupied and available for COVID cases in the district clinics.- bed_vacancies_health_centers_and_district_hospitals_11_may: Each row identifies a district and the beds earmarked, occupied and available, with and without oxygen supply, and with and without ICU support, for COVID cases in the disctrict health centers and hospitals.- covid_cases_11_may: Each row identifies a district and the number of people who receiv

In [118]:
# Widen text columns so long answers and SQL queries are readable in the table
pd.set_option("display.max_colwidth", 200)
eval_table_wo_db_schema

Unnamed: 0,input,expected_text_output,expected_language_and_script,expected_sql_output,aam_cost,aam_resposne_time,aam_text_response,aam_sql_query,aam_text_overall_score,aam_text_score_reasoning,aam_sql_overall_score,aam_sql_score_reasoning,vanna_cost,vanna_resposne_time,vanna_text_response,vanna_sql_query,vanna_text_overall_score,vanna_text_score_reasoning,vanna_sql_overall_score,vanna_sql_score_reasoning
0,How many active COVID cases are there in Chennai?,There are 37713 active COVID cases in Chennai as of May 11.,English,SELECT num_persons_under_treatment_on_11_may FROM covid_cases_11_may WHERE district_name = 'Chennai';,0.0013,0.001303,"The database you provided does not contain information about the number of active COVID cases. It only includes data on bed vacancies and COVID cases treated, discharged, and deceased. To find the...",,0,"The 'Answer' does not provide the specific number of active COVID cases as requested in the 'Question'. It instead explains the limitations of the provided database. Therefore, it does not meet cr...",0,"SQL query is blank, but instead should contain valid SQL",3e-05,28.199569,,,0,"The 'Answer' is missing, so it cannot be evaluated for similarity in meaning or whether it addresses the key elements of the question. The language and script of the question match the 'Correct La...",0,"SQL query is blank, but instead should contain valid SQL"
1,How many beds with oxygen supply are available in Vellore?,There are 53 beds available in Vellore with oxygen supply.,English,SELECT num_vacant_beds_with_o2_supply FROM bed_vacancies_health_centers_and_district_hospitals_11_may WHERE district_name = 'Vellore';,0.01289,28.241675,There are 53 beds with oxygen supply available in Vellore. This information was retrieved from the database that tracks bed vacancies in health centers and district hospitals as of May 11.,SELECT num_vacant_beds_with_o2_supply FROM bed_vacancies_health_centers_and_district_hospitals_11_may WHERE district_name = 'Vellore';,1,"The 'Answer' is similar in meaning to the 'Correct Answer', addresses the key elements of the 'Question', and the language and script match the 'Correct Language'.",1,The 'Answer' and 'Correct Answer' will result in the same output and the 'Answer' addresses the key elements of the 'Question'.,3e-05,12.730885,,,0,"The 'Answer' is missing, so it cannot be evaluated for similarity in meaning or addressing key elements. The language and script match, but the other criteria are not met.",0,"SQL query is blank, but instead should contain valid SQL"
2,Which district has the highest number of beds set aside for COVID patients?,Chennai has the highest number of beds set aside for COVID patients.,English,SELECT district_name FROM bed_vacancies_clinics_11_may ORDER BY num_earmarked_beds DESC LIMIT 1,0.018115,80.781952,"The district with the highest number of beds set aside for COVID patients is Chennai, with a total of 20,334 beds. This information was gathered by summing up the number of earmarked beds from bot...","SELECT district_name, SUM(num_earmarked_beds) AS total_earmarked_beds FROM (SELECT district_name, num_earmarked_beds FROM bed_vacancies_clinics_11_may UNION ALL SELECT district_name, num_earmarked...",1,"The 'Answer' is similar in meaning to the 'Correct Answer', addresses the key elements of the 'Question', and the language and script match the 'Correct Language'.",0,"The 'Answer' and 'Correct Answer' will not result in the same output. The 'Answer' aggregates data from multiple tables and sums up the beds, while the 'Correct Answer' only considers one table an...",3e-05,8.677429,,,0,"The 'Answer' is missing, so it cannot be evaluated for similarity in meaning or addressing key elements of the question. The language and script cannot be evaluated either.",0,"SQL query is blank, but instead should contain valid SQL"
3,What is the distribution of active COVID cases over districts?,District name | Active covid cases\nAriyalur|624\nChengalpattu|13038\nChennai|37713\nCoimbatore|14324\nCuddalore|2708\nDharmapuri|1662\nDindigul|1994\nErode|4421\nKallakurichi|1342\nKancheepuram|4...,English,"SELECT district_name, num_persons_under_treatment_on_11_may FROM covid_cases_11_may;",0.01898,40.978523,"The distribution of active COVID cases over districts as of 11th May is as follows: Ariyalur has 624 cases, Chengalpattu has 13,038 cases, Chennai has 37,713 cases, Coimbatore has 14,324 cases, Cu...","SELECT district_name, num_persons_under_treatment_on_11_may FROM covid_cases_11_may;",1,The 'Answer' is similar in meaning to the 'Correct Answer' as it provides the distribution of active COVID cases over districts. It addresses the key elements of the 'Question' by listing the acti...,1,Both the 'Answer' and 'Correct Answer' will result in the same output and address the key elements of the question.,3e-05,12.330791,,,0,"The 'Answer' is missing. Therefore, it cannot be evaluated for similarity in meaning, addressing key elements, or matching the correct language and script.",0,"SQL query is blank, but instead should contain valid SQL"
4,List the districts in order of highest to lowest based on the availability of beds with oxygen.,Erode\nSivaganga\nPudukkottai\nKanniyakumari\nDindigul\nChennai\nTiruchirappalli\nRamanathapuram\nTiruppur\nMadurai\nKrishnagiri\nNagapattinam\nThanjavur\nPerambalur\nAriyalur\nKarur\nTirunelveli\...,English,SELECT district_name FROM bed_vacancies_health_centers_and_district_hospitals_11_may ORDER BY num_vacant_beds_with_o2_supply DESC;,0.01837,65.754758,"The districts in order of highest to lowest based on the availability of beds with oxygen are as follows: Erode (522), Sivaganga (459), Pudukkottai (250), Kanniyakumari (209), Dindigul (181), Chen...","SELECT district_name, num_vacant_beds_with_o2_supply FROM bed_vacancies_health_centers_and_district_hospitals_11_may ORDER BY num_vacant_beds_with_o2_supply DESC;",1,The 'Answer' is similar in meaning to the 'Correct Answer' as it lists the districts in order of highest to lowest based on the availability of beds with oxygen. The 'Answer' addresses the key ele...,1,The 'Answer' will result in the same output as the 'Correct Answer' because it includes the district names ordered by the number of vacant beds with oxygen supply in descending order. The 'Answer'...,0.000105,15.016896,,,0,"The 'Answer' is not similar in meaning to the 'Correct Answer' and does not address the key elements of the 'Question'. The language and script match, but the other criteria are not met.",0,"SQL query is blank, but instead should contain valid SQL"
5,Which districts have > 200 beds available with oxygen supply?,Erode\nKanniyakumari\nPudukkottai\nSivaganga,English,SELECT district_name FROM bed_vacancies_health_centers_and_district_hospitals_11_may WHERE num_vacant_beds_with_o2_supply > 200;,0.01317,40.977429,"The districts that have more than 200 beds available with oxygen supply are Erode, Kanniyakumari, Pudukkottai, and Sivaganga. This information was obtained from the bed vacancies data of health ce...",SELECT district_name FROM bed_vacancies_health_centers_and_district_hospitals_11_may WHERE num_vacant_beds_with_o2_supply > 200;,1,The 'Answer' is similar in meaning to the 'Correct Answer' as it lists the same districts. The 'Answer' addresses the key elements of the 'Question' by specifying the districts with more than 200 ...,1,The 'Answer' and 'Correct Answer' will result in the same output and address the key elements of the 'Question'.,3e-05,12.437592,,,0,"The 'Answer' is missing. Therefore, it cannot be evaluated for similarity in meaning or addressing key elements of the 'Question'.",0,"SQL query is blank, but instead should contain valid SQL"


The cells below showcase the performance of AAM and Vanna.ai when Vanna.ai is also trained on the database schema

In [127]:
vn_w_schema = VannaModel(db_path=db_path, vanna_rag_model="tn-covid-db-with-schema", sys_message=sys_message, db_description=db_description, llm_model="gpt-4o", run_sql=True)
comparisons_w_schema = ModelComparisons(db_name = "tn_covid", db_path = db_path, sys_message = sys_message, db_description = db_description, vanna_ai = vn_wo_schema, llm_model= "gpt-4o")
eval_table_w_db_schema = await comparisons_w_schema.create_eval_table()

Adding documentation....


ConnectionError: HTTPSConnectionPool(host='ask.vanna.ai', port=443): Max retries exceeded with url: /rpc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f606b28b230>: Failed to resolve 'ask.vanna.ai' ([Errno -3] Temporary failure in name resolution)"))

In [121]:
# Widen text columns so long answers and SQL queries are readable in the table
pd.set_option("display.max_colwidth", 200)
eval_table_w_db_schema

Unnamed: 0,input,expected_text_output,expected_language_and_script,expected_sql_output,aam_cost,aam_resposne_time,aam_text_response,aam_sql_query,aam_text_overall_score,aam_text_score_reasoning,aam_sql_overall_score,aam_sql_score_reasoning,vanna_cost,vanna_resposne_time,vanna_text_response,vanna_sql_query,vanna_text_overall_score,vanna_text_score_reasoning,vanna_sql_overall_score,vanna_sql_score_reasoning
0,How many active COVID cases are there in Chennai?,There are 37713 active COVID cases in Chennai as of May 11.,English,SELECT num_persons_under_treatment_on_11_may FROM covid_cases_11_may WHERE district_name = 'Chennai';,0.0013,0.007174,"The database you provided does not contain information about the number of active COVID cases. It only includes data on bed vacancies and COVID cases treated, discharged, and deceased. To find the...",,0,"The 'Answer' does not provide the specific number of active COVID cases as requested in the 'Question'. It instead explains the limitations of the provided database. Therefore, it does not meet cr...",0,"SQL query is blank, but instead should contain valid SQL",0.002835,4.285162,"Empty DataFrame Columns: [""district"", active_cases] Index: []","SELECT \n ""district"",\n (""treated"" - ""discharged"" - ""died"") AS ""active_cases""\nFROM \n covid_cases_11_may\nWHERE \n ""district"" = 'Chennai';",0,"The 'Answer' does not provide a similar meaning to the 'Correct Answer' and does not address the key elements of the 'Question'. The language and script match, but the content is incorrect.",0,"The 'Answer' and 'Correct Answer' do not result in the same output. The 'Answer' calculates active cases by subtracting discharged and died from treated, while the 'Correct Answer' directly retrie..."
1,How many beds with oxygen supply are available in Vellore?,There are 53 beds available in Vellore with oxygen supply.,English,SELECT num_vacant_beds_with_o2_supply FROM bed_vacancies_health_centers_and_district_hospitals_11_may WHERE district_name = 'Vellore';,0.01289,4.369498,There are 53 beds with oxygen supply available in Vellore. This information was retrieved from the database that tracks bed vacancies in health centers and district hospitals as of May 11.,SELECT num_vacant_beds_with_o2_supply FROM bed_vacancies_health_centers_and_district_hospitals_11_may WHERE district_name = 'Vellore';,1,"The 'Answer' is similar in meaning to the 'Correct Answer', addresses the key elements of the 'Question', and the language and script match the 'Correct Language'.",1,The 'Answer' and 'Correct Answer' will result in the same output and the 'Answer' addresses the key elements of the 'Question'.,3e-05,0.914793,,,0,"The 'Answer' is missing, so it cannot be evaluated for similarity in meaning or addressing key elements. The language and script match, but the other criteria are not met.",0,"SQL query is blank, but instead should contain valid SQL"
2,Which district has the highest number of beds set aside for COVID patients?,Chennai has the highest number of beds set aside for COVID patients.,English,SELECT district_name FROM bed_vacancies_clinics_11_may ORDER BY num_earmarked_beds DESC LIMIT 1,0.018115,19.879277,"The district with the highest number of beds set aside for COVID patients is Chennai, with a total of 20,334 beds. This information was gathered by summing up the number of earmarked beds from bot...","SELECT district_name, SUM(num_earmarked_beds) AS total_earmarked_beds FROM (SELECT district_name, num_earmarked_beds FROM bed_vacancies_clinics_11_may UNION ALL SELECT district_name, num_earmarked...",1,"The 'Answer' is similar in meaning to the 'Correct Answer', addresses the key elements of the 'Question', and the language and script match the 'Correct Language'.",0,"The 'Answer' and 'Correct Answer' will not result in the same output. The 'Answer' aggregates data from multiple tables and sums up the beds, while the 'Correct Answer' only considers one table an...",3e-05,3.131347,,,0,"The 'Answer' is missing, so it cannot be evaluated for similarity in meaning or addressing key elements of the question. The language and script cannot be evaluated either.",0,"SQL query is blank, but instead should contain valid SQL"
3,What is the distribution of active COVID cases over districts?,District name | Active covid cases\nAriyalur|624\nChengalpattu|13038\nChennai|37713\nCoimbatore|14324\nCuddalore|2708\nDharmapuri|1662\nDindigul|1994\nErode|4421\nKallakurichi|1342\nKancheepuram|4...,English,"SELECT district_name, num_persons_under_treatment_on_11_may FROM covid_cases_11_may;",0.01898,18.557778,"The distribution of active COVID cases over districts as of 11th May is as follows: Ariyalur has 624 cases, Chengalpattu has 13,038 cases, Chennai has 37,713 cases, Coimbatore has 14,324 cases, Cu...","SELECT district_name, num_persons_under_treatment_on_11_may FROM covid_cases_11_may;",1,The 'Answer' is similar in meaning to the 'Correct Answer' as it provides the distribution of active COVID cases over districts. It addresses the key elements of the 'Question' by listing the acti...,1,Both the 'Answer' and 'Correct Answer' will result in the same output and address the key elements of the question.,3e-05,1.304011,,,0,"The 'Answer' is missing. Therefore, it cannot be evaluated for similarity in meaning, addressing key elements, or matching the correct language and script.",0,"SQL query is blank, but instead should contain valid SQL"
4,List the districts in order of highest to lowest based on the availability of beds with oxygen.,Erode\nSivaganga\nPudukkottai\nKanniyakumari\nDindigul\nChennai\nTiruchirappalli\nRamanathapuram\nTiruppur\nMadurai\nKrishnagiri\nNagapattinam\nThanjavur\nPerambalur\nAriyalur\nKarur\nTirunelveli\...,English,SELECT district_name FROM bed_vacancies_health_centers_and_district_hospitals_11_may ORDER BY num_vacant_beds_with_o2_supply DESC;,0.01837,5.272545,"The districts in order of highest to lowest based on the availability of beds with oxygen are as follows: Erode (522), Sivaganga (459), Pudukkottai (250), Kanniyakumari (209), Dindigul (181), Chen...","SELECT district_name, num_vacant_beds_with_o2_supply FROM bed_vacancies_health_centers_and_district_hospitals_11_may ORDER BY num_vacant_beds_with_o2_supply DESC;",1,The 'Answer' is similar in meaning to the 'Correct Answer' as it lists the districts in order of highest to lowest based on the availability of beds with oxygen. The 'Answer' addresses the key ele...,1,The 'Answer' will result in the same output as the 'Correct Answer' because it includes the district names ordered by the number of vacant beds with oxygen supply in descending order. The 'Answer'...,3e-05,12.209671,,,0,"The 'Answer' is missing. Therefore, it cannot be compared for similarity in meaning or addressing key elements. The language and script cannot be evaluated as there is no 'Answer' provided.",0,"SQL query is blank, but instead should contain valid SQL"
5,Which districts have > 200 beds available with oxygen supply?,Erode\nKanniyakumari\nPudukkottai\nSivaganga,English,SELECT district_name FROM bed_vacancies_health_centers_and_district_hospitals_11_may WHERE num_vacant_beds_with_o2_supply > 200;,0.01317,4.352164,"The districts that have more than 200 beds available with oxygen supply are Erode, Kanniyakumari, Pudukkottai, and Sivaganga. This information was obtained from the bed vacancies data of health ce...",SELECT district_name FROM bed_vacancies_health_centers_and_district_hospitals_11_may WHERE num_vacant_beds_with_o2_supply > 200;,1,The 'Answer' is similar in meaning to the 'Correct Answer' as it lists the same districts. The 'Answer' addresses the key elements of the 'Question' by specifying the districts with more than 200 ...,1,The 'Answer' and 'Correct Answer' will result in the same output and address the key elements of the 'Question'.,3e-05,1.059585,,,0,"The 'Answer' is missing. Therefore, it cannot be evaluated for similarity in meaning or addressing key elements of the 'Question'.",0,"SQL query is blank, but instead should contain valid SQL"
