Install necessary dependencies

In [1]:
# Use the %pip magic rather than a `!pip` shell escape so the packages are
# installed into the environment of the running kernel.
%pip install google-cloud-bigquery google-cloud-aiplatform pandas --quiet

Authenticate GCloud SDK

In [2]:
# Interactive login that sets up Application Default Credentials with a
# personal account. The recorded output of this cell points out that on a
# Google Compute Engine VM the machine's service credentials are picked up
# automatically, so this step is not needed there, and that proceeding may
# expose user credentials to others with access to the VM.
!gcloud auth application-default login


You are running on a Google Compute Engine virtual machine.
The service credentials associated with this virtual machine
will automatically be used by Application Default
Credentials, so it is not necessary to use this command.

If you decide to proceed anyway, your user credentials may be visible
to others with access to this virtual machine. Are you sure you want
to authenticate with your personal account?

Do you want to continue (Y/n)?  Y

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefaultauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=pghaePWtVWpciBWlwipoflKV4WCvmE&prompt=consent&token_

Import necessary packages

In [3]:
from google.cloud import bigquery
from vertexai.preview.language_models import ChatModel
from vertexai.preview.generative_models import GenerativeModel
import vertexai
import pandas as pd

Initialize constants

In [4]:
# Project passed to the BigQuery client and used to qualify the load target.
project_id = "qwiklabs-gcp-03-f951e059a60d"

# Dataset and the two tables kept in it: the raw CSV load and the
# embedding-enriched copy built from it.
dataset_id = "ads_dataset"
dataset_table_name = "ads_raw_dataset"
embedded_dataset_table_name = "ads_vector_dataset"

# Connection and model names referenced by the CREATE MODEL statement.
embedding_conn_name = "embedding_conn"
embedding_model_name = "ads_embedding_model"

# Locations. `bigquery_location` is used for the client and the connection
# path; `vertexai_location` is not used in the cells visible here
# (presumably for Vertex AI initialization later - confirm).
bigquery_location = "us"
vertexai_location = "us-central1"

# Cloud Storage URI of the source CSV.
dataset_src = "gs://labs.roitraining.com/alaska-dept-of-snow/alaska-dept-of-snow-faqs.csv"

Create a BigQuery client and load the CSV dataset into a BigQuery table

In [9]:
# BigQuery client bound to the configured project and location.
bq = bigquery.Client(project=project_id, location=bigquery_location)

# Fully qualified load target in the form project.dataset.table.
destination_table = f"{project_id}.{dataset_id}.{dataset_table_name}"

# Load settings: CSV input, first row skipped, schema auto-detected, and
# WRITE_TRUNCATE as the write disposition.
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.autodetect = True
job_config.write_disposition = "WRITE_TRUNCATE"

# Start the load from the source URI and block until the job finishes;
# result() raises if the job failed, so the message below is only printed
# on success.
load_job = bq.load_table_from_uri(
    dataset_src,
    destination_table,
    job_config=job_config,
)
load_job.result()
print("CSV Data loaded to Big Query table")

CSV Data loaded to Big Query table


Create a remote embedding model that can be used in later SQL queries to run vector search over the data

In [10]:
# DDL that creates (or replaces) a remote model in the dataset, reached
# through the configured connection and pointing at the
# 'text-embedding-005' endpoint.
embed_model = f"""
  CREATE OR REPLACE MODEL `{dataset_id}.{embedding_model_name}`
  REMOTE WITH CONNECTION `{bigquery_location}.{embedding_conn_name}`
  OPTIONS (ENDPOINT = 'text-embedding-005')
"""

# Submit the statement, then wait for it to complete before reporting.
create_model_job = bq.query(embed_model)
create_model_job.result()
print("Embedding model created")

Embedding model created


Create a new table of embedded data in BigQuery, using the table loaded from the CSV as its source. The remote embedding model is used to embed the data.

In [11]:
# Build the embedded table from the raw table:
#   - the inner SELECT joins string_field_0 and string_field_1 with a space
#     into `content`, and also exposes them as `question` and `answer`;
#   - ML.GENERATE_EMBEDDING runs the remote model over those rows;
#   - the outer SELECT keeps every returned column and additionally exposes
#     ml_generate_embedding_result under the name `vector_embedding`.
# NOTE(review): because of `SELECT *`, the embedding appears to be stored
# under both names - confirm whether the original column is still needed.
embed_table_query = f"""
  CREATE OR REPLACE TABLE `{dataset_id}.{embedded_dataset_table_name}` AS
  SELECT *, ml_generate_embedding_result AS vector_embedding
  FROM ML.GENERATE_EMBEDDING(
    MODEL `{dataset_id}.{embedding_model_name}`,
    (
      SELECT CONCAT(string_field_0, ' ', string_field_1) AS content,
            string_field_0 AS question,
            string_field_1 AS answer
      FROM `{dataset_id}.{dataset_table_name}`
    )
  )
"""

# Run the statement and wait for completion before printing the table name.
embed_table_job = bq.query(embed_table_query)
embed_table_job.result()
print(f"Embedded table populated: {dataset_id}.{embedded_dataset_table_name}")

Embedded table populated: ads_dataset.ads_vector_dataset
