In [1]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/GoogleCloudPlatform/python-docs-samples/blob/main/alloydb/notebooks/generate_batch_embeddings.ipynb)

---
# Introduction

This notebook shows you how to generate batch vector embeddings and store them in an AlloyDB database.

With the steps listed here, you can dynamically build a batch of text chunks to embed based on the character length of the source data in order to get more results per inference, leading to much more efficient embeddings generation. The process then loads the generated embeddings into AlloyDB through a SQLAlchemy connection pool (using the pg8000 driver), staging them in a temporary table before merging them into the target table. These techniques can significantly speed up the process of generating large batches of embeddings and storing them in AlloyDB vs using the native embedding() function (about 6.5x faster based on limited testing).

## What you'll need

* A Google Cloud Account and Google Cloud Project

# Setup and Requirements

In the following instructions you will learn to:

1. Install required dependencies for our application
2. Set up authentication for our project
3. Set up an AlloyDB for PostgreSQL Instance
4. Import the data used by our application

## Install dependencies

In [None]:
# Install pinned versions of the notebook's dependencies into the running kernel's environment.
%pip install langchain-google-alloydb-pg==0.7.0 langchain==0.3.3 langchain-google-vertexai==2.0.4 google-cloud-alloydb-connector[pg8000]==1.4.0

## Authenticate to Google Cloud within Colab
In order to access your Google Cloud Project from this notebook, you will need to Authenticate as an IAM user.

In [3]:
from google.colab import auth

# Authenticate this Colab runtime as the signed-in user; the gcloud commands and
# client libraries used in later cells presumably pick up these credentials.
auth.authenticate_user()

## Connect Your Google Cloud Project

In [None]:
# @markdown Please fill in the value below with your GCP project ID and then run the cell.

# Please fill in these values.
project_id = ""  # @param {type:"string"}

# Quick input validations.
assert project_id, "⚠️ Please provide a Google Cloud project ID"

# Configure gcloud: make this project the default for the later `!gcloud` cells,
# none of which pass --project. IPython expands {project_id} before running the command.
!gcloud config set project {project_id}

## Enable APIs for AlloyDB and Vertex AI within your project

You will need to enable these APIs in order to create an AlloyDB database and utilize Vertex AI as an embeddings service!

In [None]:
# Enable the AlloyDB and Vertex AI (aiplatform) APIs for the project configured above.
!gcloud services enable alloydb.googleapis.com aiplatform.googleapis.com

## Set up AlloyDB
You will need a Postgres AlloyDB instance for the following stages of this notebook. Please set the following variables.

In [6]:
# @markdown Please fill in both the Google Cloud region and the name of your AlloyDB instance. Once filled in, run the cell.
from getpass import getpass

# Please fill in these values.
region = ""  # @param {type:"string"}
cluster_name = ""  # @param {type:"string"}
instance_name = ""  # @param {type:"string"}
database_name = ""  # @param {type:"string"}

# getpass() does not echo what is typed, so the password is not left in the
# saved cell output the way it is with input().
password = getpass("Please provide a password to be used for 'postgres' database user: ")

### Create an AlloyDB Instance
If you have already created an AlloyDB Cluster and Instance, you can skip these steps and skip to the connectivity section.

> ⏳ - Creating an AlloyDB cluster may take a few minutes.

In [7]:
# Quick input validations.
assert region, "⚠️ Please provide a Google Cloud region"
assert instance_name, "⚠️ Please provide the name of your instance"
assert database_name, "⚠️ Please provide the name of your database_name"

# create the AlloyDB Cluster
!gcloud beta alloydb clusters create {cluster_name} --password={password} --region={region}

Create an instance attached to our cluster with the following command.
> ⏳ - Creating an AlloyDB instance may take a few minutes.

In [8]:
# Create a PRIMARY instance with 2 CPUs inside the cluster named by cluster_name.
!gcloud beta alloydb instances create {instance_name} --instance-type=PRIMARY --cpu-count=2 --region={region} --cluster={cluster_name}

To connect to your AlloyDB instance from this notebook, you will need to enable public IP on your instance. Alternatively, you can follow [these instructions](https://cloud.google.com/alloydb/docs/connect-external) to connect to an AlloyDB for PostgreSQL instance with Private IP from outside your VPC.

In [9]:
# Assign an inbound public IPv4 address; the connection functions below connect with IPTypes.PUBLIC.
!gcloud beta alloydb instances update {instance_name} --region={region} --cluster={cluster_name} --assign-inbound-public-ip=ASSIGN_IPV4

Now create a connection pool to connect to our instance.

In [None]:
from google.cloud.alloydb.connector import Connector, IPTypes
import sqlalchemy


# Full resource path of the AlloyDB instance that the connector is asked to reach.
connection_string = f"projects/{project_id}/locations/{region}/clusters/{cluster_name}/instances/{instance_name}"

# A single Connector is shared by every connection the pool opens through getconn().
connector = Connector()


def getconn():
    """Open a pg8000 connection to the built-in `postgres` database.

    Connects over the instance's public IP as the `postgres` user with
    password authentication (IAM authentication is disabled).
    """
    return connector.connect(
        connection_string,
        "pg8000",
        user="postgres",
        password=password,
        db="postgres",
        enable_iam_auth=False,
        ip_type=IPTypes.PUBLIC,
    )


# AUTOCOMMIT is required here: the next cell issues CREATE DATABASE, which
# PostgreSQL refuses to run inside a transaction block.
pool = sqlalchemy.create_engine(
    "postgresql+pg8000://",
    creator=getconn,
    isolation_level="AUTOCOMMIT",
)

### Create a Database

Next you will create a database to store the data for this application using the connection pool. Enabling public IP takes a few minutes, so you may get an error that there is no public IP address. Please wait and retry this step if you hit an error!

In [12]:
# CREATE DATABASE fails if the database already exists, so look it up in the
# catalog first. That keeps this cell safe to re-run, which the instructions
# above ask for when the public IP is not ready yet.
with pool.connect() as db_conn:
    existing_db = db_conn.execute(
        sqlalchemy.text("SELECT 1 FROM pg_database WHERE datname = :name"),
        {"name": database_name},
    ).first()

    if existing_db is None:
        # Identifiers cannot be sent as bind parameters. Double-quoting keeps
        # the name exactly as typed (no case folding), so it matches the
        # literal `db=database_name` used by the next connection function.
        quoted_database_name = '"' + database_name.replace('"', '""') + '"'
        db_conn.execute(sqlalchemy.text(f"CREATE DATABASE {quoted_database_name}"))
connector.close()

#### Connect to our New Database

Now you will add in a connection function that connects to your new database!

In [13]:
from google.cloud.alloydb.connector import Connector, IPTypes
import sqlalchemy


# Same instance path as before; only the target database changes in this cell.
connection_string = f"projects/{project_id}/locations/{region}/clusters/{cluster_name}/instances/{instance_name}"

# Fresh Connector, replacing the one closed after the CREATE DATABASE step.
connector = Connector()


def getconn():
    """Open a pg8000 connection to the application database `database_name`.

    Connects over the instance's public IP as the `postgres` user with
    password authentication (IAM authentication is disabled).
    """
    return connector.connect(
        connection_string,
        "pg8000",
        user="postgres",
        password=password,
        db=database_name,
        enable_iam_auth=False,
        ip_type=IPTypes.PUBLIC,
    )


# Pool with SQLAlchemy's default transactional behaviour; the cells below
# therefore call commit() explicitly after writing.
pool = sqlalchemy.create_engine(
    "postgresql+pg8000://",
    creator=getconn,
)

### Import data to your database

The following code has been prepared to help insert the CSV data into your AlloyDB for PostgreSQL database.

Download the CSV file:

In [14]:
# Copy the sample investments CSV from Cloud Storage to /content/investments.csv, the path the load cell reads.
!gsutil cp gs://twisha-dev_cloudbuild/investments_csv /content/investments.csv

The download can be verified by the following command or using the "Files" tab.

In [15]:
# List the working directory; investments.csv appears here when the notebook runs
# from /content (assumed to be the Colab default — confirm on other runtimes).
!ls

In this next step you will:

1. Create the table to store the data
2. And insert the data from the CSV into the database table

In [16]:
# Name of the table that holds the source text columns and their embedding columns.
table_name = "investments"

In [17]:
import pandas as pd

# The VECTOR column type comes from the pgvector extension, which has to be
# enabled in this (newly created) database before the table can be declared.
create_extension_cmd = sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS vector")

create_table_cmd = sqlalchemy.text(
    f"""
    CREATE TABLE {table_name} (
        id SERIAL PRIMARY KEY,
        ticker VARCHAR(255) NOT NULL UNIQUE,
        etf BOOLEAN,
        market VARCHAR(255),
        rating TEXT,
        overview TEXT,
        overview_embedding VECTOR (768),
        analysis TEXT,
        analysis_embedding VECTOR (768)
    )
    """
)

data = "/content/investments.csv"

df = pd.read_csv(data)

# Columns copied from the CSV. The *_embedding columns stay NULL until the
# embedding job at the end of the notebook fills them in.
csv_columns = ["id", "ticker", "etf", "market", "rating", "overview", "analysis"]
column_list = ", ".join(csv_columns)
placeholder_list = ", ".join(f":{column}" for column in csv_columns)

# Target the same table that was just created rather than a hard-coded name.
# NOTE: ids are inserted explicitly, so the SERIAL sequence is not advanced by this load.
insert_data_cmd = sqlalchemy.text(
    f"""
    INSERT INTO {table_name} ({column_list})
    VALUES ({placeholder_list})
    """
)

# One parameter dict per CSV row; passing the whole list to execute() runs the
# INSERT once per row in a single call.
parameter_map = [
    {column: row[column] for column in csv_columns} for _, row in df.iterrows()
]

with pool.connect() as db_conn:
    db_conn.execute(create_extension_cmd)
    db_conn.execute(create_table_cmd)
    db_conn.execute(
        insert_data_cmd,
        parameter_map,
    )
    db_conn.commit()
connector.close()

### Fetch Source Data to Embed

In [None]:
from sqlalchemy.engine.row import Row
from typing import List

In [19]:
def get_source_data() -> List[dict[str, object]]:
  """Read the rows to embed from `table_name`.

  Returns:
    One dict per table row with the keys "id", "overview" and "analysis",
    holding the column values returned by the query.
  """
  # Single source of truth for both the SELECT list and the dict keys, so the
  # two cannot drift apart and mislabel a column.
  col_names = ["id", "overview", "analysis"]
  select_list = ", ".join(col_names)
  sql = f"""SELECT {select_list} FROM {table_name};"""

  print(f"Running SQL query: {sql}")

  # Rows are materialised inside the `with` block, before the connection is
  # returned to the pool. A read-only SELECT needs no commit.
  with pool.connect() as db_conn:
    rows = db_conn.execute(sqlalchemy.text(sql))
    source_array = [dict(zip(col_names, row)) for row in rows]

  # NOTE(review): this closes the module-level connector that `pool` still
  # uses through getconn() — confirm the pool can open new connections later.
  connector.close()

  return source_array

### Define Batching Function

This helper function dynamically builds batches of text chunks to efficiently generate multiple embeddings with each call to the API.



In [20]:
# Token budget for one embedding request; build_batch_array converts it to a
# character budget of max_tokens * 3.
max_tokens = 20000

In [21]:
# Function to build batches for embedding based on max tokens/characters
def build_batch_array(
    source_array: List[dict[str, object]],
    column_to_embed: str,
    max_batch_size: int = 250,
) -> List[dict[str, object]]:
  """Take the next batch of rows from `source_array`, starting at `index_pointer`.

  Rows are added in order until adding the next one would exceed the character
  budget (`max_tokens * 3`) or the batch holds `max_batch_size` rows.

  A row whose text alone exceeds the budget is returned as a batch of its own.
  Previously such a row produced an empty batch without advancing
  `index_pointer`, which made the calling loop spin forever.

  Args:
    source_array: Rows to batch; each row must contain `column_to_embed`.
    column_to_embed: Key of the text whose length is counted.
    max_batch_size: Upper bound on rows per batch. The default of 250 is meant
      to match the documented per-request input limit of the Vertex AI text
      embedding API — confirm for your model and region.

  Returns:
    The rows of the next batch; empty only when no rows are left.

  Raises:
    ValueError: If `max_batch_size` is smaller than 1.

  Side effects:
    Advances the global `index_pointer`, sets the global `batch_char_count` to
    this batch's character count and adds it to the global `total_char_count`.
  """
  global index_pointer  # Assumes index_pointer is defined globally
  global batch_char_count  # Assigned here before anyone reads it
  global total_char_count  # Assumes total_char_count is defined globally

  if max_batch_size < 1:
      raise ValueError("max_batch_size must be at least 1")

  batch_array = []
  current_char_count = 0
  max_char_count = max_tokens * 3  # Approximate characters per token

  while index_pointer < len(source_array) and len(batch_array) < max_batch_size:
      obj = source_array[index_pointer]
      text_char_count = len(obj[column_to_embed])

      budget_spent = current_char_count >= max_char_count
      would_overflow = current_char_count + text_char_count > max_char_count

      # Only a non-empty batch may be cut short; the first row is always
      # accepted so that every call makes progress.
      if batch_array and (budget_spent or would_overflow):
          break

      batch_array.append(obj)
      current_char_count += text_char_count
      index_pointer += 1

  batch_char_count = current_char_count
  total_char_count += batch_char_count

  return batch_array

### Define Embedding Functions


In [22]:
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

def embed_text(
    texts: List[str],
    task: str = "SEMANTIC_SIMILARITY",
    model_name: str = "textembedding-gecko@003",
) -> List[List[float]]:
    """Embed a batch of texts with a Vertex AI text embedding model.

    Args:
        texts: Texts to embed; all of them are sent in one get_embeddings call.
        task: Task type passed to each TextEmbeddingInput.
        model_name: Name of the pretrained embedding model to load.

    Returns:
        One embedding (list of floats) per input text, in input order.
        An empty input returns an empty list without loading the model or
        calling the API.
    """
    if not texts:
        return []

    model = TextEmbeddingModel.from_pretrained(model_name)
    inputs = [TextEmbeddingInput(text, task) for text in texts]
    embeddings = model.get_embeddings(inputs)
    return [embedding.values for embedding in embeddings]


def embed_objects(source_array: List[dict[str, object]], column_to_embed: str) -> List[dict[str, object]]:
    """Embed `column_to_embed` for every row of `source_array`, batch by batch.

    Batches come from build_batch_array(), which advances the global
    `index_pointer`; the caller must set `index_pointer` (normally to 0) and
    initialise `batch_count` and `total_char_count` before calling.

    Args:
        source_array: Rows to embed; each row must contain `column_to_embed`.
        column_to_embed: Key of the text to embed.

    Returns:
        A new list with a copy of each processed row plus an "embedding" key.
        The input rows are not modified, so results for different columns
        never share or overwrite each other's embeddings.

    Raises:
        ValueError: If batching makes no progress (empty batch while rows
            remain) or the number of embeddings differs from the batch size.
    """
    global batch_count  # Assumes batch_count is defined globally

    total_objects = len(source_array)
    print(f"Beginning source_array size: {total_objects}")

    result_array = []

    while index_pointer < total_objects:
        batch_array = build_batch_array(source_array, column_to_embed)
        print("Batch array size", len(batch_array))

        if not batch_array:
            # An empty batch means index_pointer did not move; looping again
            # would repeat the same call forever.
            raise ValueError(
                f"Could not build a batch at index {index_pointer}: the "
                f"'{column_to_embed}' text of that row does not fit into a single batch."
            )

        batch_count += 1
        print(f"Processing batch {batch_count} with size: {len(batch_array)}. "
              f"Progress: {index_pointer} / {total_objects}. "
              f"Character count (batch): {batch_char_count}. "
              f"Character count (cumulative): {total_char_count}")

        texts_to_embed = [obj[column_to_embed] for obj in batch_array]
        embeddings = embed_text(texts_to_embed)

        if len(embeddings) != len(batch_array):
            raise ValueError(
                f"Expected {len(batch_array)} embeddings but received {len(embeddings)}."
            )

        for obj, embedding in zip(batch_array, embeddings):
            result_array.append({**obj, "embedding": embedding})

    return result_array

### Define Bulk Load and Update Functions

In [23]:
import tempfile
import csv
import os

In [40]:
def insert_to_temp_table(temp_table_name: str, column_to_embed: str, object_array: List[dict[str, object]]) -> None:
    """Insert the embeddings of `object_array` into the temporary table.

    All rows are sent in one execute() call with a list of parameter sets,
    the same pattern the CSV load cell uses, instead of rebuilding and
    executing the statement once per row.

    Args:
        temp_table_name: Table with the columns (id, col_name, embedding).
        column_to_embed: Stored in `col_name` for every inserted row.
        object_array: Rows providing the "id" and "embedding" keys. When it
            is empty nothing is executed and no connection is opened.
    """
    if not object_array:
        return

    insert_cmd = sqlalchemy.text(
        f"""INSERT INTO {temp_table_name} (id, col_name, embedding)
                VALUES (:id, :col_name, :embedding)"""
    )
    parameter_sets = [
        {'id': obj['id'], 'col_name': column_to_embed, 'embedding': obj['embedding']}
        for obj in object_array
    ]

    with pool.connect() as db_conn:
        db_conn.execute(insert_cmd, parameter_sets)
        db_conn.commit()
    # NOTE(review): this closes the module-level connector that `pool` still
    # uses through getconn() — confirm the pool can open new connections later.
    connector.close()

In [25]:
# Functions to manage temporary table and update the target table
def create_temp_table(column_to_embed: str) -> str:
    """(Re)create the staging table for one column's embeddings.

    The table is named "<column_to_embed>_embeddings_temp". Any existing table
    of that name is dropped first, so a leftover from an interrupted run does
    not block the next one.

    Args:
        column_to_embed: Column whose embeddings will be staged.

    Returns:
        The name of the staging table.
    """
    temp_table_name = f"{column_to_embed}_embeddings_temp"
    # Update the SQL queries below to match your environment.
    # They run as two separate statements so the function does not rely on the
    # driver accepting several statements in one execute() call.
    drop_sql = f"DROP TABLE IF EXISTS {temp_table_name}"
    create_sql = f"""
    CREATE TABLE {temp_table_name} (
        id INTEGER PRIMARY KEY,
        col_name TEXT,
        embedding REAL[]
    )
    """
    with pool.connect() as db_conn:
      db_conn.execute(sqlalchemy.text(drop_sql))
      db_conn.execute(sqlalchemy.text(create_sql))
      db_conn.commit()
    # NOTE(review): this closes the module-level connector that `pool` still
    # uses through getconn() — confirm the pool can open new connections later.
    connector.close()

    return temp_table_name


def update_target_table(temp_table_name: str, target_table_name: str, column_to_embed: str) -> None:
    """Copy staged embeddings into the target table's embedding column.

    Rows are matched on `id`; the value lands in the column named
    "<column_to_embed>_embedding" of `target_table_name`.
    """
    # Update the SQL query below to match your environment
    merge_statement = sqlalchemy.text(f"""
    UPDATE {target_table_name}
    SET {column_to_embed}_embedding = {temp_table_name}.embedding
    FROM {temp_table_name}
    WHERE {target_table_name}.id = {temp_table_name}.id;
    """)

    with pool.connect() as conn:
        conn.execute(merge_statement)
        conn.commit()

    connector.close()


def drop_temp_table(temp_table_name: str) -> None:
    """Drop the staging table once its embeddings have been merged."""
    drop_statement = sqlalchemy.text(f"""DROP TABLE {temp_table_name};""")

    with pool.connect() as conn:
        conn.execute(drop_statement)
        conn.commit()

    connector.close()

### Run the embedding process

In [27]:
# Define table where embeddings will be written and columns to be embedded
import time
import vertexai

target_table_name = table_name
columns_to_embed = ['analysis','overview']

# Module-level counters used to track progress and estimate cost. The helper
# functions update them through their own `global` declarations; a `global`
# statement at module level has no effect, so none is needed here.
batch_array = None
batch_size = None
batch_count = 0
batch_char_count = 0
total_char_count = 0
index_pointer = 0

vertexai.init(project=project_id)

source_array = get_source_data()

# Keep track of job timing
start_time = time.time()

for column_to_embed in columns_to_embed:
  # Restart batching from the first row for every column
  index_pointer = 0

  print(f"Creating embeddings for column {column_to_embed}...")
  results = embed_objects(source_array, column_to_embed)

  print("Creating temp table to store intermediate results...")
  temp_table_name = create_temp_table(column_to_embed)

  print(f"Inserting embeddings into temp table: {temp_table_name}...")
  insert_to_temp_table(temp_table_name, column_to_embed, results)

  print(f"Merging temp table {temp_table_name} with target table {target_table_name}...")
  update_target_table(temp_table_name, target_table_name, column_to_embed)

  # The braces were missing here, so the literal text "temp_table_name" was printed.
  print(f"Dropping temp table {temp_table_name}...")
  drop_temp_table(temp_table_name)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Job started at: {time.ctime(start_time)}")
print(f"Job ended at: {time.ctime(end_time)}")
print(f"Total run time: {elapsed_time:.2f} seconds")