In [1]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Objective

Given a list of embeddings, create and deploy a Vertex AI Vector Search (fka Matching Engine) index.

**Input**
- A BigQuery table containing image and text embeddings. This was covered in 1_generate_embeddings.ipynb

**Output**
- A vector DB that enables low latency nearest neighbor searching in embeddings space

# Setup

### Install Dependencies (If Needed)

The list `packages` contains tuples of package import names and install names. If the import name is not found, the install name is used to install the package quietly for the current user.

In [2]:
# tuples of (import name, install name)
packages = [
    ('google.cloud.aiplatform', 'google-cloud-aiplatform'),
]

import importlib
install = False
for package in packages:
    if not importlib.util.find_spec(package[0]):
        print(f'installing package {package[1]}')
        install = True
        !pip install {package[1]} -U -q --user

### Restart Kernel (If Installs Occurred)

After a kernel restart the code submission can start with the next cell after this one.

In [3]:
# Only restart when the install cell actually installed something
# (it sets `install` to True in that case).
if install:
    import IPython
    # Shut the running kernel down with restart=True so the newly installed
    # packages become importable.
    IPython.Application.instance().kernel.do_shutdown(True)

### Authenticate

If you are using Colab, you will need to authenticate yourself first. The next cell will check if you are currently using Colab, and will start the authentication process.

In [4]:
import sys

# The `google.colab` module is only present in sys.modules when running on Colab;
# everywhere else this cell is a no-op.
running_in_colab = 'google.colab' in sys.modules
if running_in_colab:
    from google.colab import auth as google_auth
    google_auth.authenticate_user()

### Config

In [9]:
# Project-wide settings. Edit these (or use the Colab form fields) before running
# the rest of the notebook.
PROJECT_ID = 'solutions-2023-mar-107' # @param {type:"string"}
REGION = 'us-central1' # @param {type:"string"}
# GCS folder containing the exported embeddings. It is defined here because the
# aiplatform.init(...) cell reads BUCKET_URI before the "Store Embeddings in GCS"
# section assigns it, which raises NameError on a fresh top-to-bottom run.
BUCKET_URI = "gs://vector_search_regional/flipkart_multimodal_embeddings" # @param {type:"string"}
# Sample product description / image used to query the deployed index at the end.
TEST_DESCRIPTION = "Key Features of Vishudh Printed Women's Straight Kurta BLACK, GREY Straight,Specifications of Vishudh Printed Women's Straight Kurta Kurta Details Sleeve Sleeveless Number of Contents in Sales Package Pack of 1 Fabric 100% POLYESTER Type Straight Neck ROUND NECK General Details Pattern Printed Occasion Festive Ideal For Women's In the Box Kurta Additional Details Style Code VNKU004374 BLACK::GREY Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach" # @param {type:"string"}
TEST_IMAGE = 'gs://genai-product-catalog/flipkart_20k_oct26/3ecb859759e5311cbab6850e98879522_0.jpg' # @param {type:"string"}

In [None]:
from google.cloud import aiplatform

# Set the SDK-wide defaults once so later aiplatform calls do not need to repeat them.
# PROJECT_ID, REGION and BUCKET_URI must already be defined when this cell runs.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

# Store Embeddings in GCS

Vertex AI Vector Search does not currently support direct import from BigQuery. We need to store in GCS in a supported format. 

To do so
1. Run the query below directly in the Google Cloud console
2. After running the query, export the results as JSONL; this will prompt you to choose a Google Drive location to save the file
3. Copy the file from Google Drive to a GCS bucket
4. Update the `BUCKET_URI` variable below to the location of the folder containing the embeddings

```

SELECT
 CONCAT(id,'_T') as id,
 text_embedding as embedding,
 c0_name as L0,
 c1_name as L1,
 c2_name as L2,
 c3_name as L3

FROM
 `<PROJECT_ID>.<DATASET_ID>.<TABLE_ID>`

UNION ALL

SELECT
 CONCAT(id,'_I') as id,
 image_embedding as embedding,
 c0_name as L0,
 c1_name as L1,
 c2_name as L2,
 c3_name as L3

FROM
 `<PROJECT_ID>.<DATASET_ID>.<TABLE_ID>`;
 ```

In [6]:
# GCS folder holding the exported embeddings. It is passed as `contents_delta_uri`
# when the index is created and as `staging_bucket` to aiplatform.init().
BUCKET_URI = "gs://vector_search_regional/flipkart_multimodal_embeddings" # @param {type:"string"} # WHERE EMBEDDINGS ARE STORED

# Create Index

In [None]:
# Settings for the Tree-AH index, collected in one place so they are easy to scan and tune.
index_settings = dict(
    display_name='flipkart_multimodal_batch_tree_cosine',
    description='Based on ~18K Flipkart product listings for which we have both a description and image',
    # Embedding files are read from this GCS folder.
    contents_delta_uri=BUCKET_URI,
    # Presumably the length of each exported embedding vector — confirm against the data.
    dimensions=1408,
    distance_measure_type="COSINE_DISTANCE",
    approximate_neighbors_count=150,
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=7,
)

tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(**index_settings)

In [None]:
# Capture the resource name of the index created above and print it.
# NOTE(review): presumably printed so it can be copied and reused in a later session — confirm.
INDEX_RESOURCE_NAME = tree_ah_index.resource_name
print(INDEX_RESOURCE_NAME)

In [None]:
# Re-bind `tree_ah_index` to an index object built from INDEX_RESOURCE_NAME.
# NOTE(review): presumably lets a later session skip index creation by setting
# INDEX_RESOURCE_NAME by hand — confirm.
tree_ah_index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

# Deploy Index

In [None]:
# Create the endpoint that the index will be deployed to.
# It is created with public_endpoint_enabled=True.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    public_endpoint_enabled=True,
    display_name='flipkart_batch',
    description='Endpoint on flipkart',
)

In [None]:
# Identifier for this deployment; the query cell passes the same ID to find_neighbors.
DEPLOYED_INDEX_ID = 'flipkart_muiltimodal_18K'

# Deploy the index to the endpoint and re-bind `my_index_endpoint` to the returned object.
my_index_endpoint = my_index_endpoint.deploy_index(
    deployed_index_id=DEPLOYED_INDEX_ID,
    index=tree_ah_index,
)

# Display what is now deployed on the endpoint.
my_index_endpoint.deployed_indexes

# Query Index

Instead of re-implementing the embedding API code from the previous notebook, we will load the local embeddings module

In [7]:
import sys
# Prepend the backend folder to the module search path so that `embeddings`
# resolves to the project's own module. The path is relative, so it depends
# on the notebook's working directory.
sys.path.insert(0, '../backend/')
import embeddings # Local Project Code

In [13]:
# Embed the sample description and image with the project's embedding helper.
res = embeddings.embed(TEST_DESCRIPTION, TEST_IMAGE, project=PROJECT_ID)

# Show only the first five components of each vector as a quick sanity check.
for vector in (res.text_embedding, res.image_embedding):
    print(vector[:5])

[-0.0165299699, -0.0692435354, 0.0147973141, 0.0349166244, 0.00536287716]
[-0.00627786433, 0.0557938665, -0.0300552044, 0.0268275458, 0.0392337069]


In [None]:
NUM_NEIGHBORS = 5

# Two queries are sent in one call: the text embedding first, then the image embedding.
query_vectors = [res.text_embedding, res.image_embedding]

response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=query_vectors,
    num_neighbors=NUM_NEIGHBORS,
)

response