In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# BQML - LLM Examples

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/language/examples/bqml/llm_nlp_examples.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/examples/bqml/llm_nlp_examples.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/language/examples/bqml/llm_nlp_examples.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

## Overview
BigQuery ML (BQML) now integrates with Vertex LLMs (PaLM 2 for Text). In this tutorial, you are shown examples of how to use this feature to run NLP tasks against data stored in BigQuery.


### Objectives
The objective is to demonstrate some of the many ways LLMs can be applied to your BigQuery data using BigQuery ML.


You will execute simple SQL statements that call the Vertex AI API through the `ML.GENERATE_TEXT` function to:

- Summarize and classify text
- Perform entity recognition
- Enrich data
- Run Sentiment Analysis


### Services and Costs
This tutorial uses the following Google Cloud data analytics and ML services, which are billable components of Google Cloud:

* BigQuery & BigQuery ML <a href="https://cloud.google.com/bigquery/pricing" target="_blank">(pricing)</a>
* Vertex AI API <a href="https://cloud.google.com/vertex-ai/pricing" target="_blank">(pricing)</a>

Check out the [BQML Pricing page](https://cloud.google.com/bigquery/pricing#bqml) for a breakdown of how costs are applied across these services.

Use the [Pricing
Calculator](https://cloud.google.com/products/calculator)
to generate a cost estimate based on your projected usage.

### Installation

Install the following packages required to execute this notebook.

In [None]:
!pip install google-cloud-bigquery-connection

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

3. [Enable BigQuery Connection API](https://console.cloud.google.com/apis/library/bigqueryconnection.googleapis.com?_ga=2.83970457.1667545569.1683624898-1324157630.1682064685),
[Enable Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com&_ga=2.121353995.2053869978.1687859460-1056062237.1685695596)

4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

#### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# ID of the Google Cloud project used throughout this notebook (connection,
# dataset and model names are all built from it). Empty by default: fill it in
# before running the cells below.
PROJECT_ID = ""  # @param {type:"string"}

# Make this project the active one for the gcloud CLI; IPython substitutes
# the Python value of PROJECT_ID into the shell command.
! gcloud config set project {PROJECT_ID}

#### Region

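Set the `REGION` variable (for example, `US`). This notebook uses it as the location of the BigQuery Cloud resource connection that calls Vertex AI, so it must not be left empty. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).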

In [None]:
# Location string used when building the BigQuery connection path and when
# referencing that connection in CREATE MODEL.
# NOTE(review): empty by default — presumably has to be filled in before the
# connection cell runs; confirm.
REGION = ""  # @param {type: "string"}

#### Setup Project Variables

In [None]:
# BigQuery dataset that will hold the remote model.
DATASET_ID = "bqml_llm"
# ID (and friendly name) of the BigQuery Cloud resource connection.
CONN_NAME = "bqml_llm_conn"
# Placeholder; overwritten with "serviceAccount:<id>" once the connection
# has been fetched or created.
CONN_SERVICE_ACCOUNT = ""
# Name of the remote model created inside DATASET_ID.
LLM_MODEL_NAME = "bqml-vertex-llm"

### Authenticate to your Google Cloud account
Run the cell below and follow the instructions when prompted to authenticate your account via OAuth.

In [None]:
import sys

# The google.colab package only exists inside Colab. Importing it
# unconditionally raises ImportError in other environments (for example
# Vertex AI Workbench, which this notebook links to), so only run the
# OAuth flow when the notebook is actually hosted by Colab.
# NOTE(review): outside Colab, credentials are presumably picked up from the
# environment (e.g. gcloud application-default credentials) — confirm.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Import libraries


In [None]:
from google.cloud import bigquery
from google.cloud import bigquery_connection_v1 as bq_connection

### Create BigQuery Cloud resource connection
You will need to create a [Cloud resource connection](https://cloud.google.com/bigquery/docs/create-cloud-resource-connection) to enable BigQuery to interact with Vertex AI services.

In [None]:
# Use a dedicated name for the connection-service client. The dataset cell
# further down binds the global name `client` to a bigquery.Client, and
# run_bq_query reads that global; sharing the name here would break every
# query if this cell were re-run after the dataset cell.
conn_client = bq_connection.ConnectionServiceClient()
new_conn_parent = f"projects/{PROJECT_ID}/locations/{REGION}"
exists_conn_parent = f"projects/{PROJECT_ID}/locations/{REGION}/connections/{CONN_NAME}"
cloud_resource_properties = bq_connection.CloudResourceProperties({})

try:
    # Reuse the connection when it already exists so the cell can be re-run.
    connection = conn_client.get_connection(
        request=bq_connection.GetConnectionRequest(name=exists_conn_parent)
    )
except Exception:
    # NOTE(review): any lookup failure (not only "not found") falls through to
    # creation; a real permission or network problem will then surface as an
    # error from create_connection instead.
    connection = conn_client.create_connection(
        request=bq_connection.CreateConnectionRequest(
            {
                "parent": new_conn_parent,
                "connection_id": CONN_NAME,
                "connection": bq_connection.types.Connection(
                    {
                        "friendly_name": CONN_NAME,
                        "cloud_resource": cloud_resource_properties,
                    }
                ),
            }
        )
    )

# IAM member string for the connection's service account, used by the
# permission-granting cell that follows.
CONN_SERVICE_ACCOUNT = f"serviceAccount:{connection.cloud_resource.service_account_id}"
print(CONN_SERVICE_ACCOUNT)

### Set permissions for Service Account
The resource connection service account requires certain project-level permissions which are outlined in the <a href="https://cloud.google.com/bigquery/docs/bigquery-ml-remote-model-tutorial#set_up_access" target="_blank">Vertex AI function documentation</a>.

In [None]:
!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/serviceusage.serviceUsageConsumer'
!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/bigquery.connectionUser'
!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/aiplatform.user'

## Prepare BigQuery Dataset

### Create a BigQuery Dataset
You will need a BigQuery dataset to store your ML model and tables. Run the following to create a dataset within your project called `bqml_llm`:

In [None]:
# BigQuery client bound to the tutorial project; the query wrapper defined
# below reads this global.
client = bigquery.Client(project=PROJECT_ID)

# exists_ok=True lets the cell be re-run when the dataset is already present.
dataset = client.create_dataset(DATASET_ID, exists_ok=True)
print("Dataset {} created.".format(dataset.dataset_id))

Create a wrapper to use the BigQuery client to run queries and return the result:

In [None]:
# Wrapper to use BigQuery client to run query and return result


def run_bq_query(sql: str):
    """Run a SQL statement in BigQuery and return its result.

    Uses the notebook-level ``client`` object to submit the query.

    Args:
        sql: SQL text to execute.

    Returns:
        The object returned by ``query_job.result()`` for the submitted job.

    Raises:
        Any exception raised by ``client.query`` or ``query_job.result`` is
        propagated unchanged, so callers keep the original exception type
        and traceback instead of a generic ``Exception``.
    """
    query_job = client.query(sql)
    # Fetch the result before printing so the reported state reflects the
    # job after result() has returned.
    result = query_job.result()
    print(f"JOB ID: {query_job.job_id} STATUS: {query_job.state}")
    return result

## Executing LLM using BigQuery ML

To execute LLMs in BQML you will first need to create the LLM model and then execute the `ML.GENERATE_TEXT` function with a prompt. This can all be done in SQL.

### Create Vertex AI Model

You can create a Vertex AI remote model in BigQuery using the CREATE MODEL statement:

In [None]:
# Build the CREATE MODEL statement: the remote model lives in
# PROJECT_ID.DATASET_ID and reaches the LLM service through the connection
# PROJECT_ID.REGION.CONN_NAME. CREATE OR REPLACE makes the cell re-runnable.
sql = f"""
      CREATE OR REPLACE MODEL
        `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_NAME}`
        REMOTE WITH CONNECTION
          `{PROJECT_ID}.{REGION}.{CONN_NAME}`
          OPTIONS ( remote_service_type = 'CLOUD_AI_LARGE_LANGUAGE_MODEL_V1');
      """
# Submit the statement through the query wrapper defined earlier.
result = run_bq_query(sql)

### Using the LLM model
You can now use the `ML.GENERATE_TEXT` function to run advanced NLP tasks against free text or data stored in BigQuery.

[The BQML documentation](https://cloud.google.com/bigquery/docs/generate-text#generate_text) gives further details on the parameters used: `temperature, max_output_tokens, top_p and top_k.`

*Note: The table column with the input text must have the alias 'prompt'*


In [None]:
# Free-text prompt sent to the model. It is spliced into the SQL below inside
# single quotes, so it must not itself contain a single quote.
PROMPT = "Describe a cat in one paragraph"

# SELECT * returns every column produced by ML.GENERATE_TEXT; the inner
# SELECT supplies the single input row under the required alias `prompt`,
# and the STRUCT carries the generation parameters.
sql = f"""
          SELECT
            *
          FROM
            ML.GENERATE_TEXT(
              MODEL `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_NAME}`,
              (
              SELECT
                '{PROMPT}' AS prompt
              ),
              STRUCT
              (
                1 AS temperature,
                1024 AS max_output_tokens,
                0.8 AS top_p,
                40 AS top_k,
                TRUE AS flatten_json_output
              ));
        """
result = run_bq_query(sql)

The table of results will include JSON that can be parsed to extract the content result.

Setting the `flatten_json_output` parameter to TRUE will return a flattened JSON as a string: `ml_generate_text_llm_result`.

For the rest of the examples, we will just display the prompt and `ml_generate_text_llm_result` for simplicity.

In [None]:
# Convert the result of the previous query to a DataFrame; as the last
# expression in the cell it is what the notebook displays.
result.to_dataframe()

## On to the fun stuff - Example Use Cases!

The following examples explore using the BQML LLM model for content creation, text summarization, classification, entity recognition, data enrichment and sentiment analysis.

When writing your own prompts, we recommend you first review these [Prompt Design best practices](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/intro_prompt_design.ipynb).

#### Content Creation


In [None]:
# Use LLM to create marketing campaign copy based on recipient demographic and spending data.
# The CTE keeps each US user's most recent order item (rn = 1) when that item
# is in the "Active" product category; the prompt is then assembled per user
# from the product name, postal code and first name.
# Fix: the instruction previously read "Don not mention the brand", a typo
# inside the prompt text sent to the model.
sql = f"""
          WITH
            latest_sale AS (
            SELECT u.id,u.first_name,u.email,u.postal_code,u.country,o.order_id,o.created_at,p.category,p.name
            FROM
              `bigquery-public-data.thelook_ecommerce.users` u
            JOIN (
              SELECT user_id,order_id,created_at,product_id,
                ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) AS rn
              FROM
                `bigquery-public-data.thelook_ecommerce.order_items` ) o
            ON u.id = o.user_id
            JOIN `bigquery-public-data.thelook_ecommerce.products` p
            ON o.product_id = p.id
            WHERE o.rn = 1 AND p.category = "Active" and u.country = "United States"
          )

          SELECT
            ml_generate_text_llm_result
          FROM
            ML.GENERATE_TEXT(
              MODEL `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_NAME}`,
              (
              SELECT
                CONCAT('A user bought a product with this description: ', name,' Write a follow up marketing email mentioning the high-level product category of their purchase in one word, for example "We hope you are enjoying your new t-shirt", and encouraging the individual to shop with the store again using the coupon code RETURN10 for 10% off their next purchase. Provide two local outdoor activities they could pursue with their new purchase. They live in the zip code ', postal_code,'. Do not mention the brand of the product, just sign off the email with "-TheLook." Address the email to: ', first_name) AS prompt
              FROM
                latest_sale
              LIMIT
                5),
              STRUCT(1 AS temperature,
                1024 AS max_output_tokens,
                0.8 AS top_p,
                40 AS top_k,
                TRUE AS flatten_json_output));
        """
result = run_bq_query(sql)
result.to_dataframe()

#### Text Summarization

In [None]:
# Instruction prefix; the article body is appended to it in SQL via CONCAT.
# It is embedded in a single-quoted SQL literal, so it must not contain a
# single quote.
PROMPT = "Please summarize this BBC news article into 25 words or less: "

# Build one prompt per article (limited to 5 rows) and return each prompt
# next to the flattened model output.
sql = f"""
          SELECT
            prompt,
            ml_generate_text_llm_result
          FROM
            ML.GENERATE_TEXT(
              MODEL `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_NAME}`,
              (
              SELECT
                CONCAT('{PROMPT}', body) AS prompt
              FROM
                `bigquery-public-data.bbc_news.fulltext`
              LIMIT
                5),
              STRUCT(1 AS temperature, 1024 AS max_output_tokens, 0.8 AS top_p, 40 AS top_k, TRUE AS flatten_json_output));
        """
result = run_bq_query(sql)
result.to_dataframe()

In [None]:
# Instruction prefix asking for a simplified rewrite; the article body is
# appended in SQL. It is embedded in a single-quoted SQL literal, so it must
# not contain a single quote.
PROMPT = "Please rewrite this article to enable easier understanding for a person with a B1 level of English. Article: "

# Same query shape as the summarisation cell: 5 articles, prompt plus
# flattened model output.
sql = f"""
          SELECT
            prompt,
            ml_generate_text_llm_result
          FROM
            ML.GENERATE_TEXT(
              MODEL `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_NAME}`,
              (
              SELECT
                CONCAT('{PROMPT}', body) AS prompt
              FROM  `bigquery-public-data.bbc_news.fulltext`
              LIMIT 5
              ),
              STRUCT(1 AS temperature, 1024 AS max_output_tokens, 0.8 AS top_p, 40 AS top_k, TRUE AS flatten_json_output));
        """
result = run_bq_query(sql)
result.to_dataframe()

#### Text Classification

In [None]:
# One-shot classification: the prompt contains a worked example
# ("News article: ..., Category: Tech") followed by the article to classify,
# ending in "Category:" so the model completes the label.
sql = f"""
          SELECT
            prompt,
            ml_generate_text_llm_result
          FROM
            ML.GENERATE_TEXT(
              MODEL `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_NAME}`,
              (
              SELECT
                CONCAT("Please categorize BBC news article into either tech, sport, business, politics, or entertainment and return the category. Here's an example. News article: Intel has unveiled research that could mean data is soon being moved around chips at the speed of light., Category: Tech ; News article: ", body, ", Category:") AS prompt
              FROM
                `bigquery-public-data.bbc_news.fulltext`
              LIMIT
                5),
              STRUCT(1 AS temperature, 1024 AS max_output_tokens, 0.8 AS top_p, 40 AS top_k, TRUE AS flatten_json_output));
        """
result = run_bq_query(sql)
result.to_dataframe()

#### Entity Recognition

In [None]:
# Instruction prefix; the article body is appended in SQL via CONCAT. It is
# embedded in a single-quoted SQL literal, so it must not contain a single
# quote.
PROMPT = "Please return a bullet-point list of all sentences in this article that cite a statistic: "

# 5 articles, each returned as the prompt plus the flattened model output.
sql = f"""
          SELECT
            prompt,
            ml_generate_text_llm_result
          FROM
            ML.GENERATE_TEXT(
              MODEL `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_NAME}`,
              (
              SELECT
                CONCAT('{PROMPT}', body) AS prompt
              FROM
                `bigquery-public-data.bbc_news.fulltext`
              LIMIT
                5),
              STRUCT(1 AS temperature, 1024 AS max_output_tokens, 0.8 AS top_p, 40 AS top_k, TRUE AS flatten_json_output));
        """
result = run_bq_query(sql)
result.to_dataframe()

In [None]:
# Entity Extraction on a product description using LLM.
# The prompt is one-shot: a worked example in the form
# "Product: <name>, Brand: <brand>" followed by the product to label.
# Fix: the live product previously ended with " Brand: " (no comma), which
# did not match the ", Brand: " separator shown in the example.
sql = f"""
          SELECT
            prompt,
            ml_generate_text_llm_result
          FROM
            ML.GENERATE_TEXT(
              MODEL `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_NAME}`,
              (
              SELECT
                CONCAT("Please return the brand name listed in this product description. Here is an example. Product: TYR Sport Men's Solid Durafast Jammer Swim Suit, Brand: TYR ; Product: ", name, ", Brand: ") AS prompt
              FROM
                `bigquery-public-data.thelook_ecommerce.products`
              LIMIT
                5),
              STRUCT(1 AS temperature, 1024 AS max_output_tokens, 0.8 AS top_p, 40 AS top_k, TRUE AS flatten_json_output));
        """
result = run_bq_query(sql)
result.to_dataframe()

#### Data Enrichment

In [None]:
# Enrich census rows with a climate classification inferred by the model.
# The prompt is one-shot: "Zip Code: '<zip>', Koppen Classification: '<code>'".
# Fix: the live zip code used to be concatenated straight onto
# "Koppen Classification: " (e.g. "...Zip Code: 12345Koppen Classification: "),
# with neither the quotes nor the ", " separator used in the example. The
# live row now follows the example's format exactly.
# A low temperature and a small token budget are used for this cell.
sql = f"""
          SELECT
            prompt,
            ml_generate_text_llm_result
          FROM
            ML.GENERATE_TEXT(
              MODEL `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_NAME}`,
              (
              SELECT
                CONCAT("Please provide the Koppen climate classification for a given US Zip Code. Here's an example. Zip Code: '36773', Koppen Classification: 'Cfa'; Zip Code: '", zipcode, "', Koppen Classification: ") AS prompt
              FROM
                `bigquery-public-data.census_bureau_usa.population_by_zip_2010`
              LIMIT
                5 ),
              STRUCT(0.2 AS temperature,
                50 AS max_output_tokens,
                0.8 AS top_p,
                40 AS top_k, TRUE AS flatten_json_output))
          """
result = run_bq_query(sql)
result.to_dataframe()

#### Sentiment Analysis

In [None]:
# Classify the sentiment of up to 10 IMDB reviews whose title is "Troy".
# The review text is passed through as an extra column so it can be shown
# next to the prompt and the model output.
# Fix: the one-shot example labels its answer "Sentiment:", but the live
# review previously ended with ", Category:"; the label now matches the
# example. The dangling comma after the last selected column is also dropped.
sql = f"""
          SELECT
            prompt,
            review,
            ml_generate_text_llm_result
          FROM
            ML.GENERATE_TEXT(
              MODEL `{PROJECT_ID}.{DATASET_ID}.{LLM_MODEL_NAME}`,
              (
              SELECT
                CONCAT("Please categorize this movie review as either 'Positive', 'Negative' or 'Neutral'. Here's an example. Review: 'I dislike this movie', Sentiment: Negative ; Review: ", review, ", Sentiment:") AS prompt,
                review
              FROM
                `bigquery-public-data.imdb.reviews`
              WHERE
                UPPER(title) = 'TROY'
              LIMIT
                10 ),
              STRUCT(0.2 AS temperature,
                50 AS max_output_tokens,
                0.8 AS top_p,
                40 AS top_k, TRUE AS flatten_json_output))
          """
result = run_bq_query(sql)
result.to_dataframe()

## Cleaning Up
To clean up all Google Cloud resources used in this project, you can <a href="https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects" target="_blank">delete the Google Cloud
project</a> you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

In [None]:
# Delete BigQuery dataset, including the BigQuery ML models you just created, and the BigQuery Connection
! bq rm -r -f $PROJECT_ID:$DATASET_ID
! bq rm --connection --project_id=$PROJECT_ID --location=$REGION $CONN_NAME