In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# SQL Code Generation on Vertex AI using LangChain 🦜🔗

> **NOTE:** This notebook uses the PaLM generative model, which will reach its [discontinuation date in October 2024](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text#model_versions). Please refer to [this updated notebook](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/NLP2SQL_using_dynamic_RAG.ipynb) for a version which uses the latest Gemini model.

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/language/use-cases/sql-code-generation/sql_code_generation_langchain.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Flanguage%2Fuse-cases%2Fsql-code-generation%2Fsql_code_generation_langchain.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/language/use-cases/sql-code-generation/sql_code_generation_langchain.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/use-cases/sql-code-generation/sql_code_generation_langchain.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/bigquery/v1/32px.svg" alt="BigQuery Studio logo"><br> Open in BigQuery Studio
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/use-cases/sql-code-generation/sql_code_generation_langchain.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/language/use-cases/sql-code-generation/sql_code_generation_langchain.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/language/use-cases/sql-code-generation/sql_code_generation_langchain.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/language/use-cases/sql-code-generation/sql_code_generation_langchain.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/language/use-cases/sql-code-generation/sql_code_generation_langchain.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/language/use-cases/sql-code-generation/sql_code_generation_langchain.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

| | |
|-|-|
| Authors: | [Shubham Chawla](https://www.github.com/shubhamgoogle), [Roy Arsan](https://www.linkedin.com/in/arsan) |

## Overview
Large language models can be used for generating code, including SQL. In particular, models can convert natural language text into SQL queries. One common purpose is to enable users to query data without requiring knowledge of table names, the data schema, or the specific SQL dialect or query engine of the underlying data warehouse, such as BigQuery.

This notebook covers prompt engineering best practices for SQL code generation with [LangChain Google Cloud Vertex AI](https://python.langchain.com/docs/integrations/llms/google_vertex_ai_palm) implementation, and puts in practice learnings from [SQL-PaLM: Improve Large Language Model Adaptation for text-to-SQL](https://arxiv.org/pdf/2306.00739.pdf). For example, the BigQuery dataset schema is retrieved and provided dynamically as context to the prompt, for grounding the LLM and personalizing its output. The notebook also demonstrates Retrieval Augmented Generation (RAG) by using [SemanticSimilarityExampleSelector](https://python.langchain.com/docs/modules/model_io/prompts/example_selector_types/similarity) from LangChain Example Selector to dynamically retrieve and pass the most relevant few shot examples to enrich the LLM prompt. This helps ensure most accurate and relevant LLM output, that is the generated SQL query, while limiting number of required LLM input tokens and thereby cost. The notebook also demonstrates simple model evaluation whereby the generated SQL queries are evaluated by executing them against the BigQuery dataset, and by comparing them with ground truth queries and corresponding results.

For this notebook, you generate SQL queries to analyze Cloud Audit Logs and answer critical security questions around activity in your own Google Cloud project. While this notebook uses BigQuery logs dataset, the concepts and approach presented here can be applied to other databases and datasets.

![NL2SQL flow](https://services.google.com/fh/files/misc/nl2sql_for_log_analytics2.png)

### Objective

By the end of the notebook, you should be able to:

* Use model to generate SQL queries based on Natural Language questions:
  * Use VertexAIEmbeddings to create embeddings
  * Use LangChain Example Selector to automatically select relevant examples for few-shot prompting
  * Providing custom dataset schemas as context
  * Formatting model output

* Evaluate model-generated queries by:
  * Executing sanitized queries against live dataset
  * Comparing queries (and their results) to ground truth queries using pandas dataframe equals check
  * Calculating model accuracy score

In addition, you can use this notebook to answer your own security questions from your own audit logs, such as:

- Any unusually high cloud API usage by any user identity over the last month?
- Any destructive actions by an unapproved identity over the last 7 days?
- Any unusual day-to-day spike in data volume accessed by any user this week?


## Getting Started

### Prerequisite
 If you haven't already done so, the only requirement is to [upgrade your existing log bucket](https://cloud.google.com/logging/docs/buckets#upgrade-bucket) to use Log Analytics which provides you with a linked BigQuery dataset with your own queryable logs data. This is a **one-click step without incurring additional costs**. By default, Cloud Audit Admin Activity logs are enabled, ingested and stored in every project's `_Required` bucket without any charges.

![one click prerequisite](https://services.google.com/fh/files/misc/upgrade_log_bucket.png)

### Install SDKs

In [None]:
# Install Vertex AI SDK to use for model predictions, together with the
# BigQuery client library and pandas
%pip install google-cloud-aiplatform google-cloud-bigquery pandas --upgrade --user
# Install LangChain and its Vertex AI integration
%pip install --upgrade --quiet langchain langchain-core langchain-google-vertexai
# Install the LangChain community integrations and FAISS (used below as the
# vector store for few-shot example selection)
%pip install langchain-community faiss-cpu

**Colab only:** Uncomment the following cell to restart the kernel or use the button to restart the kernel. For Vertex AI Workbench you can restart the terminal using the button on top.

In [None]:
# # Automatically restart kernel after installs so that your environment can access the new packages
# import IPython
# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

### Import Libraries

In [None]:
import re
import sys

from IPython.display import display
from google.cloud import aiplatform, bigquery
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings
import numpy as np
import pandas as pd

### Set project and datasets for BigQuery

This is the project containing:
 - The linked BigQuery dataset `BQ_LINKED_DATASET` with your raw logs, and,
 - A new BigQuery dataset `BQ_PROCESSED_DATASET` you'll create to store the processed logs.

This project can be the same as, or separate from, the one you're using for Vertex AI.

Make sure you have **BigQuery Data Viewer** role over `BQ_LINKED_DATASET` dataset.

In [None]:
# Project containing the linked (raw logs) and processed BigQuery datasets.
PROJECT_ID = ""  # @param {type:"string"}
# Location used for the BigQuery client.
LOCATION_US = "US"  # @param {type:"string"}
# Location used to initialize the Vertex AI SDK.
LOCATION = "us-central1"  # @param {type:"string"}
# Linked BigQuery dataset with the raw logs.
BQ_LINKED_DATASET = ""  # @param {type:"string"}
# New BigQuery dataset that will store the processed logs.
BQ_PROCESSED_DATASET = ""  # @param {type:"string"}

# Initialize the Vertex AI SDK with the project and location set above.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

### Authenticating your notebook environment
* If you are using **Colab** to run this notebook, run the cell below and continue.
* If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env).

In [None]:
# Authenticate the user only when running inside Colab, detected by the
# presence of the `google.colab` module among the loaded modules.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Embedding and Vertex AI LLM Model

In this example, we use the `text-bison@002` large language model, but you can use other Google-provided models such as Gemini Pro or Gemini Ultra. For embeddings, we use the latest version of `textembedding-gecko`.

In [None]:
# Text generation model used by generate_sql().
MODEL_ID = "text-bison@002"  # @param {type:"string"}
# Embedding model used to embed the few-shot examples and the question.
EMBEDDING_MODEL_ID = "textembedding-gecko@latest"  # @param {type:"string"}

### Import sample queries

You will now retrieve a list of 15 sample security questions and corresponding SQL queries from a CSV file. These security questions are variations from the open-source [Community Security Analytics](https://github.com/GoogleCloudPlatform/security-analytics). CSA provides a set of security questions and corresponding queries for BigQuery, Log Analytics and Chronicle.

We will use a subset of these queries as few-shot examples as part of the model prompt, and the remaining set for model evaluation.

Run the following to read the CSV file from a GCS bucket and load all records into an in-memory pandas DataFrame:

In [None]:
# GCS bucket and object name of the CSV with sample questions and SQL queries.
BUCKET_ID = "csa-datasets-public"  # @param {type:"string"}
FILENAME = "SQL_Generator_Example_Queries.csv"  # @param {type:"string"}
# Read the CSV directly from GCS; the first row provides the column names.
df = pd.read_csv(f"gs://{BUCKET_ID}/{FILENAME}", header=0)

### Extract train & eval datasets

Extract train & eval datasets and store in respective dataframes:

In [None]:
# Split the sample queries into few-shot ("Train") and evaluation ("Eval") sets.
# .copy() makes each split an independent DataFrame, so adding result columns
# to eval_df later does not trigger pandas' SettingWithCopyWarning.
train_df = df.loc[df["Dataset"] == "Train", ["Question", "SQL Query"]].copy()
eval_df = df.loc[df["Dataset"] == "Eval", ["Question", "SQL Query"]].copy()

# Few-shot examples as a list of {"question": ..., "answer": ...} records.
train_dict = train_df.rename(
    columns={"Question": "question", "SQL Query": "answer"}
).to_dict(orient="records")
train_df.head(2)

## Prepare the data

> You can skip this section if your raw logs are already processed and normalized in curated tables using [Dataform as part of Community Security Analytics](https://github.com/GoogleCloudPlatform/security-analytics/tree/main/dataform) (CSA). For more information on CSA and how to automatically and continuously build post-processed tables out of your raw logs, see this [Google Cloud blog post](https://cloud.google.com/blog/products/data-analytics/deploy-community-security-analytics-with-dataform).

Like any other AI/ML project, first thing is to prepare your data including datasets for few-shot prompting and subsequent evaluation. You'll preprocess the raw logs that reside in your BigQuery linked dataset into a summary table into your new BigQuery dataset. This table will contain the logs in aggregated form and also normalized into a simple schema. This allows you to unlock and scale ML analysis:
- From a computation point of view because the dataset is smaller and simple.
- From a talent point of view because researchers and analysts are not required to be familiar with the complex schema of raw logs ([LogEntry definition](https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry)).


### Create new dataset

In [None]:
!bq --location=LOCATION_US mk --dataset {BQ_PROJECT_ID}:{BQ_PROCESSED_DATASET}

Create the new `csa_4_01_summary_daily` summary table from the Log Analytics data in your linked BigQuery dataset:

In [None]:
# Destination table for the aggregated logs and how many days of raw logs to
# include in the summary.
TABLE_NAME = "csa_4_01_summary_daily"
TABLE_ID = f"{PROJECT_ID}.{BQ_PROCESSED_DATASET}.{TABLE_NAME}"
SUMMARY_LOOKBACK_DAYS = 90

# BigQuery client reused by later cells; create the processed dataset unless it
# already exists.
client = bigquery.Client(project=PROJECT_ID, location=LOCATION_US)
client.create_dataset(dataset=BQ_PROCESSED_DATASET, exists_ok=True)

# Write the query results into TABLE_ID, replacing any existing contents.
job_config = bigquery.QueryJobConfig(
    destination=TABLE_ID, write_disposition="WRITE_TRUNCATE"
)

# One row per (day, principal_email): the distinct method names seen and the
# number of log entries, excluding entries without a principal and the method
# name patterns filtered out below.
sql = f"""
SELECT
  EXTRACT(DATE FROM timestamp) AS day,
  proto_payload.audit_log.authentication_info.principal_email,
  ARRAY_AGG(DISTINCT proto_payload.audit_log.method_name IGNORE NULLS) AS actions,
  COUNT(*) AS counter
FROM `{PROJECT_ID}.{BQ_LINKED_DATASET}._AllLogs`
WHERE
  timestamp >=  TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {SUMMARY_LOOKBACK_DAYS} DAY)
  AND proto_payload.audit_log.authentication_info.principal_email IS NOT NULL
  AND proto_payload.audit_log.method_name NOT LIKE "storage.%.get"
  AND proto_payload.audit_log.method_name NOT LIKE "v1.compute.%.list"
  AND proto_payload.audit_log.method_name NOT LIKE "beta.compute.%.list"
GROUP BY
  day,
  proto_payload.audit_log.authentication_info.principal_email
"""

# Start the query and save results in new table
query_job = client.query(sql, job_config=job_config)
result = query_job.result()  # Wait for the job to complete.

print(f"{result.total_rows} user action records loaded to table {TABLE_ID}")

### Build schema definition (compact version)

First, we need to build a concise schema definition of your dataset. As mentioned earlier, we'll use that as part of our prompt's context for grounding the results.

Retrieve table and column definitions from the `INFORMATION_SCHEMA` of your BigQuery dataset.

In [None]:
# Following SQL query will generate schema definition of your dataset

# Distinct, non-empty table names referenced by the sample queries CSV.
BQ_TABLES = df["Qualified table name"].replace("", np.nan).dropna().unique()
print(BQ_TABLES)
# The inner query builds, per table, a "table : col , col" string and a
# "table : col (type)" string; the outer query concatenates them into the two
# definition lines. The IN (...) list is rendered from BQ_TABLES as quoted names.
QUERY = f"""\
SELECT
    '[Schema (values)]: ' || '| log_summary | ' || STRING_AGG(table_values, ' | ') || ';' AS tables_definition,
    '[Column names (type)]: ' || STRING_AGG(column_names_types) || ';' AS columns_definition
FROM (
    SELECT
      table_name,
      table_name || ' : ' || STRING_AGG(column_name, ' , ') as table_values,
      STRING_AGG(table_name || ' : ' || column_name || ' (' || data_type || ')', ' | ') as column_names_types
    FROM {PROJECT_ID}.{BQ_PROCESSED_DATASET}.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS
    WHERE table_name IN {'(' + ",".join(map(lambda x: f"'{x}'", BQ_TABLES)) + ')'}
    GROUP BY table_name
    ORDER BY table_name
)
"""

# Create query job
query_job = client.query(QUERY)
# Get first row
schema = next(query_job.result())

# Build schema definition
# (this text is embedded in the prompt prefix by build_prompt())
schema_definition = f"""\
{schema.tables_definition}

{schema.columns_definition}
"""

print(schema_definition)

### Build Prompt

Let's create the prompt from the user input and few-shot examples selected dynamically with LangChain's `FewShotPromptTemplate`.

#### Create Vertex AI embeddings to create a text embedding  

In [None]:
# Embedding model wrapper used by the example selector below.
embedding = VertexAIEmbeddings(model_name=EMBEDDING_MODEL_ID, project=PROJECT_ID)

#### Create Few Shot Prompt

The `SemanticSimilarityExampleSelector` selects the examples that are most semantically similar to the input, by comparing their embeddings.

In [None]:
# Build the selector over the few-shot ("Train") examples.
example_selector = SemanticSimilarityExampleSelector.from_examples(
    # This is the list of examples available to select from.
    train_dict,
    # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
    embedding,
    # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
    FAISS,
    # This is the number of examples to produce.
    k=2,
)

# Select the examples most similar to the input and print each one's fields.
question = "select user actions that contains the word 'delete' or 'remove'"
selected_examples = example_selector.select_examples({"question": question})
print(f"Examples most similar to the input: {question}")
for example in selected_examples:
    print("\n")
    for k, v in example.items():
        print(f"{k}: {v}")

#### Helper Function to Build Prompt

The function below converts the user input into the final prompt, consisting of a prefix, the selected few-shot examples, and the question.

In [None]:
def build_prompt(user_prompt, example_selector):
    """Render the complete few-shot prompt for one natural-language question.

    The prompt is made of an instruction prefix that embeds the module-level
    ``schema_definition``, the examples chosen by ``example_selector`` for
    this question, and a suffix that repeats the question and leaves the
    answer open.

    Args:
        user_prompt: The natural-language question to convert into SQL.
        example_selector: Example selector passed to FewShotPromptTemplate.

    Returns:
        The formatted prompt string.
    """
    # How each selected example is rendered inside the prompt.
    example_template = PromptTemplate(
        template="question: {question}\nanswer: {answer}",
        input_variables=["question", "answer"],
    )

    # Task instructions followed by the schema used for grounding.
    instructions = f"""\
    This is a task converting text into GoogleSQL statement.
    We will first give you the dataset schema and then ask a question in text.
    You are asked to generate SQL statement which is valid for BigQuery.
    Remove any delimiters around answer such as "```"

    BigQuery tables schema definition:
    {schema_definition}
    Here are a few shot examples:
    """

    few_shot_prompt = FewShotPromptTemplate(
        prefix=instructions,
        example_selector=example_selector,
        example_prompt=example_template,
        suffix="question: {question}\nanswer: ",
        input_variables=["question"],
    )
    return few_shot_prompt.format(question=user_prompt)

## Generate SQL queries

### Define helper function to generate SQL

`generate_sql()`: This function is used to retrieve a SQL query from the Vertex AI LLM model using the prompt template we have built thus far.

`execute_sql()`: This function is used to execute a SQL query against the live BigQuery dataset, and returning results as a dataframe.

`build_prompt()`: This function is used to create the final prompt which includes common prefix and suffix for all the prompts

Notice how `generate_sql()` uses `sanitize_output()` function to strip the response down to the SQL query itself before returning the results. Even though the model prompt includes instructions to tune the model output, there may still be enclosing quotes or code block backticks which need to be stripped out to avoid a subsequent SQL syntax error.

In [None]:
# Limit number of bytes processed as a guardrail for cost control
BQ_MAX_BYTES_BILLED = pow(2, 30)  # 1GB


def execute_sql(query: str):
    """Validate and run a SQL query against BigQuery.

    Table names listed in ``BQ_TABLES`` are first qualified with the project
    and processed dataset. The query is then validated with a dry run and,
    if valid, executed with a cap of ``BQ_MAX_BYTES_BILLED`` billed bytes.

    Args:
        query: SQL text that refers to tables by their unqualified names.

    Returns:
        A pandas DataFrame with the query results, or the caught exception
        object when validation or execution fails (callers distinguish the
        two with ``isinstance(result, pd.DataFrame)``).
    """
    print("Executing SQL...")

    # Qualify table names with your project and dataset ID
    for table_name in BQ_TABLES:
        query = query.replace(
            table_name, f"{PROJECT_ID}.{BQ_PROCESSED_DATASET}.{table_name}"
        )

    # Validate the query by performing a dry run without incurring a charge
    dry_run_config = bigquery.QueryJobConfig(use_query_cache=False, dry_run=True)
    try:
        response = client.query(query, job_config=dry_run_config)
    except Exception as e:
        print("Error validating query:")
        print(e)
        return e

    print(f"Query will process {response.total_bytes_processed / 1024:.2f} KB.")

    # Execute the query. The job config must be passed to client.query();
    # otherwise the maximum_bytes_billed guardrail is silently not applied.
    run_config = bigquery.QueryJobConfig(
        use_query_cache=False, maximum_bytes_billed=BQ_MAX_BYTES_BILLED
    )
    try:
        response = client.query(query, job_config=run_config)
        result_df = response.to_dataframe()
    except Exception as e:
        print("Error executing query:")
        print(e)
        return e

    return result_df


# Reduce the model response to the bare SQL text
def sanitize_output(text: str) -> str:
    """Strip code-fence markers and a dangling quote from a model response.

    A leading fence (optionally followed by a language tag) and a trailing
    fence are removed. Then a single unmatched double quote at the very end,
    or at the very start, of the remaining text is dropped. Text containing a
    matched pair of double quotes is left as is.
    """
    # Drop surrounding whitespace and any code-fence markers
    cleaned = re.sub(r"^\s*```(\w+)?|```\s*$", "", text.strip()).strip()

    # A lone closing quote: the only double quote in the text is its last character
    has_dangling_trailing_quote = re.search(r'^[^"]*"$', cleaned) is not None
    if has_dangling_trailing_quote:
        cleaned = cleaned[:-1]

    # A lone opening quote: the only double quote in the text is its first character
    has_dangling_leading_quote = re.search(r'^"[^"]*$', cleaned) is not None
    if has_dangling_leading_quote:
        cleaned = cleaned[1:]

    return cleaned


# Call model using prompt and pre-defined parameters
def generate_sql(
    example_selector,
    prompt: str,
    temperature: float = 0.2,
    max_output_tokens: int = 1024,
    top_k: int = 40,
    top_p: float = 0.8,
) -> str:
    """Generate a SQL query for a natural-language question.

    Builds the few-shot prompt with ``build_prompt()``, invokes the Vertex AI
    model ``MODEL_ID`` with the given sampling parameters, and strips the
    response down to the SQL text with ``sanitize_output()``. The prompt, the
    raw response and the stripped response are printed along the way.

    Args:
        example_selector: Example selector forwarded to ``build_prompt()``.
        prompt: The natural-language question.
        temperature: Sampling temperature passed to the model.
        max_output_tokens: Maximum number of tokens the model may generate.
        top_k: Top-k sampling parameter passed to the model.
        top_p: Top-p sampling parameter passed to the model.

    Returns:
        The sanitized model response.
    """
    print("Generating SQL...")

    model = VertexAI(
        model_name=MODEL_ID,
        temperature=temperature,
        max_output_tokens=max_output_tokens,
        top_k=top_k,
        top_p=top_p,
    )
    final_prompt = build_prompt(prompt, example_selector)
    # len() counts characters, not model tokens, so label it as such and
    # measure the prompt that is actually sent rather than the bare question.
    print("Number of input characters: " + str(len(final_prompt)))
    print(final_prompt)
    text = model.invoke(final_prompt)
    print("Number of output characters: " + str(len(text)))
    print("Response:")
    print(text)
    # Strip text to include only the SQL code block
    text = sanitize_output(text)
    print("Response stripped:")
    print(text)

    return text

### Example 1

Let's generate the SQL to answer this sample question:

*List all user actions that contains the word 'delete' or 'remove' over the last month. Include the user and the day in the results.*


In [None]:
user_prompt = "List all user actions that contains the word 'delete' or 'remove' over the last month. Include the user and the day in the results."

# Preview the complete few-shot prompt built for this question.
final_generated_prompt = build_prompt(user_prompt, example_selector)
print(final_generated_prompt)

Let's call the model to generate the SQL query for this question:

In [None]:
# Ask the model for a SQL query answering user_prompt; generate_sql() returns
# the sanitized query text.
output = generate_sql(example_selector, user_prompt)
print(output)

Let's test the generated query against your BigQuery dataset:

In [None]:
# Execute the query
query_result = execute_sql(output)
display(query_result.head(2))

### Example 2

Let's generate the SQL to answer this sample question:

*List any action containing IAM case-insensitive by any unapproved user over the last 7 days, where approved user include 'admin@example.com'*


In [None]:
user_prompt = "List any action containing IAM case-insensitive by any unapproved user over the last 7 days, where approved user include 'admin@example.com'"

# Preview the complete few-shot prompt built for this question.
final_generated_prompt = build_prompt(user_prompt, example_selector)
print(final_generated_prompt)

Let's call the model to generate the SQL query for this question:

In [None]:
# Ask the model for a SQL query answering user_prompt; generate_sql() returns
# the sanitized query text.
output = generate_sql(example_selector, user_prompt)
print(output)

Let's test the generated query against your BigQuery dataset:

In [None]:
# Execute the query
query_result = execute_sql(output)
display(query_result.head(2))

## Evaluate model

### Run model on evaluation dataset

Let's generate SQL queries for all questions in our evaluation dataset. That dataset includes both the `Question` and the ground truth `SQL Query`. Run the following code to automatically call the model for each question in the dataset and record the response in a new column, `Generated SQL Query`. This may take a few minutes, as model calls are made serially.


In [None]:
# Call the model once per evaluation question (serially) and store the
# sanitized response next to the ground truth query.
eval_df["Generated SQL Query"] = eval_df["Question"].apply(
    lambda x: generate_sql(example_selector, x)
)

### Compare output result

In the next code cell, we'll execute the original SQL query and then compare its output directly to the output of the generated SQL query.

In [None]:
def compare_dataframes(sql_query, generated_sql_query):
    """Run two SQL queries and report whether their results match.

    Both queries are executed with ``execute_sql()``. The generated query's
    result is restricted to the ground truth result's columns (in that order)
    and the two frames are compared cell by cell, row by row.

    Args:
        sql_query: The ground truth SQL query.
        generated_sql_query: The model-generated SQL query.

    Returns:
        True if both queries ran successfully, returned the same number of
        rows, and every row matches on the ground truth columns; otherwise
        False.
    """
    expected_df = execute_sql(sql_query)
    generated_df = execute_sql(generated_sql_query)

    # execute_sql() returns the exception instead of a DataFrame on failure;
    # a failure of either query cannot count as a match.
    if not isinstance(expected_df, pd.DataFrame):
        return False
    if not isinstance(generated_df, pd.DataFrame):
        return False

    try:
        generated_df = generated_df[expected_df.columns]
    except KeyError:
        # Columns in results of ground truth query are missing
        # from results returned by generated query
        return False

    # Without this check, extra rows in the generated result (or an empty
    # ground truth result) would still be reported as a match.
    if len(generated_df) != len(expected_df):
        return False

    comparison_result = generated_df.eq(expected_df)
    matching_rows = comparison_result.all(axis=1)
    return bool(matching_rows.sum() == len(expected_df))


# Record, for every evaluation question, whether the generated query returns
# the same data as the ground truth query.
eval_df["Data Match"] = eval_df.apply(
    lambda x: compare_dataframes(x["SQL Query"], x["Generated SQL Query"]), axis=1
)

## Final Result

In the next cell, we'll calculate the model's final score. This score represents the percentage of successful matches between the original and generated queries, as indicated in the 'Data Match' column.

### Score

In [None]:
def get_prcntg_match(eval_df):
    """Return the percentage of rows whose 'Data Match' value is true.

    Args:
        eval_df: DataFrame with a boolean 'Data Match' column.

    Returns:
        The share of matching rows as a percentage rounded to the nearest
        integer, or 0 when the frame has no rows.
    """
    total = len(eval_df)
    # Avoid dividing by zero when there is nothing to evaluate
    if total == 0:
        return 0
    return round(eval_df["Data Match"].sum() / total * 100)


# Compute the score over the evaluation set and report it.
prcntg_match = get_prcntg_match(eval_df)
print(f"Final Score based on the percentage of data match: {prcntg_match}%")

### Output

In [None]:
# Show the full evaluation table without truncating rows, columns or cell text.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
# None means "no limit"; the former -1 value was deprecated in pandas 1.0 and
# is rejected by newer pandas releases.
pd.set_option("display.max_colwidth", None)
display(eval_df)