In [1]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Baseline: https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/intro-textemb-vectorsearch.ipynb

# Getting Started with Text Embeddings + Vertex AI Vector Search

#### Install Python SDK

In [None]:
%pip install --upgrade --user google-cloud-aiplatform google-cloud-storage 'google-cloud-bigquery[pandas]'

## Restart current runtime

In [5]:
# Restart kernel after installs so that your environment can access the new packages
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

## Environment variables

In [1]:
###### get project ID
PROJECT_ID = ! gcloud config get project
PROJECT_ID = PROJECT_ID[0]
LOCATION = "us-central1"
if PROJECT_ID == "(unset)":
    print(f"Please set the project ID manually below")

In [2]:
###### define project information
if PROJECT_ID == "(unset)":
    PROJECT_ID = "jc-gcp-project"  # @param {type:"string"}

# generate an unique id for this session
from datetime import datetime

UID = datetime.now().strftime("%m%d%H%M")

## Enable APIs

In [None]:
! gcloud services enable compute.googleapis.com aiplatform.googleapis.com storage.googleapis.com bigquery.googleapis.com --project {PROJECT_ID}

## Data Preparation

In [3]:
# load the BQ Table into a Pandas DataFrame
from google.cloud import bigquery

QUESTIONS_SIZE = 1000

bq_client = bigquery.Client(project=PROJECT_ID)
QUERY_TEMPLATE = """
        select * from `jc-gcp-project.movielens.sample_movies` ;
        """
query = QUERY_TEMPLATE.format(limit=QUESTIONS_SIZE)
query_job = bq_client.query(query)
rows = query_job.result()
df = rows.to_dataframe()

# examine the data
df.head()

Unnamed: 0,id,title,popularity,vote_average,overview,genre
0,371085,Sharkskin,0.027801,0.0,The Post War II story of Manhattan born Mike E...,
1,198370,Mutual Friends,0.136721,0.0,Surprise parties rarely go well. This one is n...,
2,38786,The Blood of My Brother: A Story of Death in Iraq,0.005256,0.0,THE BLOOD OF MY BROTHER goes behind the scenes...,
3,219716,Sparkler,0.547654,0.0,Melba is a Californian trailer-park girl who i...,
4,331493,Light from the Darkroom,0.012942,0.0,Light in the Darkroom is the story of two best...,


In [4]:
len(df)

4803

In [None]:
# Remove NA
df['id'] = df['id'].fillna('')
df['title'] = df['title'].fillna('')
df['popularity'] = df['popularity'].fillna('')
df['vote_average'] = df['vote_average'].fillna('')
df['overview'] = df['overview'].fillna('')
df['genre'] = df['genre'].fillna('')

In [None]:
df[df['title'] == 'Avatar']

Unnamed: 0,id,title,popularity,vote_average,overview,genre
2228,19995,Avatar,150.437577,7.2,"In the 22nd century, a paraplegic Marine is di...",Action


## Call the API to generate embeddings

In [5]:
###### init the vertexai package
import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [9]:
###### Load the text embeddings model
from vertexai.language_models import TextEmbeddingModel

model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual@001")

## API Usage Quota

In [7]:
###### Functions

import time

import tqdm  # to show a progress bar

# get embeddings for a list of texts
BATCH_SIZE = 5


def get_embeddings_wrapper(texts):
    embs = []
    for i in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        result = model.get_embeddings(texts[i : i + BATCH_SIZE])
        embs = embs + [e.values for e in result]
    return embs

In [None]:
df['text'] = '"title" : ' + df['title'].astype("string") + ', "popularity" : ' + df['popularity'].astype("string") + ', "vote_average"' + df['vote_average'].astype("string") + ', "genre" : ' + df['genre'].astype("string") + ', "overview" : ' + df['overview'].astype("string")

In [None]:
len(df.text)

4803

In [None]:
# get embeddings for the question titles and add them as "embedding" column
df = df.assign(embedding=get_embeddings_wrapper(list(df.text)))
df.head()

100%|██████████| 961/961 [17:29<00:00,  1.09s/it]


Unnamed: 0,id,title,popularity,vote_average,overview,genre,text,embedding
0,371085,Sharkskin,0.027801,0.0,The Post War II story of Manhattan born Mike E...,,"""title"" : Sharkskin, ""popularity"" : 0.027801, ...","[0.015336157754063606, -0.038394711911678314, ..."
1,198370,Mutual Friends,0.136721,0.0,Surprise parties rarely go well. This one is n...,,"""title"" : Mutual Friends, ""popularity"" : 0.136...","[0.009683111682534218, -0.04617595672607422, 0..."
2,38786,The Blood of My Brother: A Story of Death in Iraq,0.005256,0.0,THE BLOOD OF MY BROTHER goes behind the scenes...,,"""title"" : The Blood of My Brother: A Story of ...","[-0.029223162680864334, -0.033517830073833466,..."
3,219716,Sparkler,0.547654,0.0,Melba is a Californian trailer-park girl who i...,,"""title"" : Sparkler, ""popularity"" : 0.547654, ""...","[0.04448612406849861, -0.0565938837826252, 0.0..."
4,331493,Light from the Darkroom,0.012942,0.0,Light in the Darkroom is the story of two best...,,"""title"" : Light from the Darkroom, ""popularity...","[-0.02965383604168892, -0.05053010955452919, 0..."


In [None]:
# df[df['title'] == 'Avatar'].embedding.iloc[0] # avartar 임베딩
df[df['title'] == 'Avatar']

Unnamed: 0,id,title,popularity,vote_average,overview,genre,text,embedding
2228,19995,Avatar,150.437577,7.2,"In the 22nd century, a paraplegic Marine is di...",Action,"""title"" : Avatar, ""popularity"" : 150.437577, ""...","[0.018325019627809525, -0.054211799055337906, ..."


In [None]:
## 아바타 샘플 임베딩 값 확인
# df[df['title'] == 'Avatar'].embedding.iloc[0]

## Get Randome one & Check similarity w/ Dot

In [None]:
import random

import numpy as np

# pick one of them as a key question
# key = random.randint(0, len(df))
key = 2228
# calc dot product between the key and other questions
embs = np.array(df.embedding.to_list())
similarities = np.dot(embs[key], embs.T)

# print similarities for the first 5 questions
similarities[:5]

array([0.60934513, 0.58459621, 0.6016171 , 0.64416181, 0.58156595])

## Print the list

In [None]:
# print the question
print(f"Key question: {df.title[key]}\n")

# sort and print the questions by similarities
sorted_questions = sorted(
    zip(df.title, similarities), key=lambda x: x[1], reverse=True
)[:20]
for i, (question, similarity) in enumerate(sorted_questions):
    print(f"{similarity:.4f} {question}")

Key question: Avatar

1.0000 Avatar
0.7709 The Last Airbender
0.7398 Titan A.E.
0.7379 The Inhabited Island
0.7375 John Carter
0.7319 Pandorum
0.7306 Planet of the Apes
0.7279 Conquest of the Planet of the Apes
0.7179 Beneath the Planet of the Apes
0.7177 Avengers: Age of Ultron
0.7175 Escape from Planet Earth
0.7166 Oblivion
0.7160 Apocalypto
0.7153 Lost in Space
0.7150 Captain America: Civil War
0.7149 The Martian
0.7148 The Matrix
0.7134 Galaxina
0.7120 Cargo
0.7110 Mission to Mars


## Save the embeddings in a JSON file

In [None]:
# save id and embedding as a json file
jsonl_string = df[["id", "embedding"]].to_json(orient="records", lines=True)
with open("questions.json", "w") as f:
    f.write(jsonl_string)

# show the first few lines of the json file
! head -n 3 questions.json

{"id":371085,"embedding":[0.0153361578,-0.0383947119,0.0352433771,0.0147550041,-0.0319922231,-0.0459211282,-0.0205535181,-0.041891966,0.0089921989,0.0284238458,0.014186698,0.0300955791,0.0258459207,-0.0378358439,0.0237927306,0.0335237756,0.0195060074,0.0082418136,-0.0965101644,0.019350538,-0.0078775575,-0.0119496165,-0.0274844002,0.0447204672,0.0143402014,-0.0388614349,-0.0185875259,-0.0016751911,-0.0199454874,-0.0016352511,0.0079187471,-0.0133166835,0.0349910371,0.0353308655,0.040520031,0.0601883009,0.0099755321,-0.0040698647,0.0197575428,0.0556138679,0.056258142,-0.0870138109,-0.0106943715,0.0019231349,0.0433021486,-0.0126744583,-0.0298540033,0.0327363946,0.0133010373,0.0056282762,0.0609988943,-0.0264207441,0.0384953767,0.0040295478,0.0478877015,0.0103862984,-0.0204253197,0.0240118429,-0.0155596314,0.0039435392,0.0222576689,-0.055673115,0.0395441763,0.0292392205,-0.0050922697,0.032362625,-0.0595636182,0.0457811058,0.005710192,0.0020742365,0.0170880556,0.0225609373,-0.0893099606,0.028

## Save to Bucket

In [None]:
BUCKET_URI = f"gs://{PROJECT_ID}-movie_embedding-{UID}"
! gsutil mb -l $LOCATION -p {PROJECT_ID} {BUCKET_URI}
! gsutil cp questions.json {BUCKET_URI}

Creating gs://jc-gcp-project-movie_embedding-12010905/...
Copying file://questions.json [Content-Type=application/json]...
- [1 files][ 47.2 MiB/ 47.2 MiB]                                                
Operation completed over 1 objects/47.2 MiB.                                     


## Create an Index

In [13]:
###### init the aiplatform package
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# create index
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"embvs-movie-index-{UID}",
    contents_delta_uri=BUCKET_URI,
    dimensions=768,
    approximate_neighbors_count=20,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
)

INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Creating MatchingEngineIndex
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Create MatchingEngineIndex backing LRO: projects/78565683329/locations/us-central1/indexes/7527375351010295808/operations/2693361931053432832
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:MatchingEngineIndex created. Resource name: projects/78565683329/locations/us-central1/indexes/7527375351010295808
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:To use this MatchingEngineIndex in another session:
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:index = aiplatform.MatchingEngineIndex('projects/78565683329/locations/us-central1/indexes/7527375351010295808')


### **The parameters for creating index**
#### **contents_delta_uri**: The URI of Cloud Storage directory where you stored the embedding JSON files
#### **dimensions**: Dimension size of each embedding. In this case, it is 768 as we are using the embeddings from the Text Embeddings API.
#### **approximate_neighbors_count**: how many similar items we want to retrieve in typical cases
#### **distance_measure_type**: what metrics to measure distance/similarity between embeddings. In this case it's DOT_PRODUCT_DISTANCE

## Create Index Endpoint and deploy the Index

In [None]:
# create IndexEndpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"movie_indexing_endpoint-{UID}",
    public_endpoint_enabled=True,
)

INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Creating MatchingEngineIndexEndpoint
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Create MatchingEngineIndexEndpoint backing LRO: projects/78565683329/locations/us-central1/indexEndpoints/664311731362725888/operations/6935893517524795392
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:MatchingEngineIndexEndpoint created. Resource name: projects/78565683329/locations/us-central1/indexEndpoints/664311731362725888
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:To use this MatchingEngineIndexEndpoint in another session:
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/78565683329/locations/us-central1/indexEndpoints/664311731362725888')


In [None]:
DEPLOYED_INDEX_ID = f"movie_indexing_test_deployed_2"

In [None]:
# deploy the Index to the Index Endpoint
my_index_endpoint.deploy_index(index=my_index, deployed_index_id=DEPLOYED_INDEX_ID)

INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/78565683329/locations/us-central1/indexEndpoints/664311731362725888
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/78565683329/locations/us-central1/indexEndpoints/664311731362725888/operations/3585778344714567680
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/78565683329/locations/us-central1/indexEndpoints/664311731362725888


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7e2f526b4550> 
resource name: projects/78565683329/locations/us-central1/indexEndpoints/664311731362725888

## Run Query

In [17]:
###### Call Endpoint
from google.cloud import aiplatform_v1

# Set variables for the current deployed index.
API_ENDPOINT="1051074875.us-central1-78565683329.vdb.vertexai.goog"
INDEX_ENDPOINT="projects/78565683329/locations/us-central1/indexEndpoints/664311731362725888"
DEPLOYED_INDEX_ID="movie_indexing_test_deployed_2"

In [41]:
###### Call Endpoint
PROJECT_ID = "jc-gcp-project"
REGION = "us-central1"
ENDPOINT_ID = "664311731362725888"

In [42]:
###### Call Endpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(
    index_endpoint_name=f"projects/{PROJECT_ID}/locations/{REGION}/indexEndpoints/{ENDPOINT_ID}"
)

In [51]:
###### Test
query = '아바타와 비슷한 SF느낌의 영화'
test_embeddings = get_embeddings_wrapper([query])

100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


In [52]:
###### Test query
response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=20,
)

# show the result
import numpy as np

for idx, neighbor in enumerate(response[0]):
    id = np.int64(neighbor.id)
    similar = df.query("id == @id", engine="python")
    print(f"{neighbor.distance:.4f} {similar.title.values[0]}")

0.6455 Gattaca
0.6343 An American in Hollywood
0.6319 Renaissance
0.6318 Avatar
0.6314 The Astronaut's Wife
0.6310 The Lost Skeleton of Cadavra
0.6308 Bad Company
0.6303 Metropolis
0.6300 Surrogates
0.6295 Cypher
0.6275 Akira
0.6240 Babylon A.D.
0.6232 Copycat
0.6228 The Aviator
0.6228 Beyond Borders
0.6228 Another Earth
0.6220 Lost in Translation
0.6218 Kabhi Alvida Naa Kehna
0.6217 Amidst the Devil's Wings
0.6212 Abandon


In [48]:
# @title Agent initialization.
######

from IPython.display import display, Markdown
from vertexai.preview.generative_models import grounding

from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    HarmBlockThreshold,
    HarmCategory,
    Part,
    Tool
)

MODEL_NAME="gemini-1.5-flash"
PROJECT_ID="jc-gcp-project"
REGION="us-central1"

vertexai.init(project=PROJECT_ID, location=REGION)
model_1 = GenerativeModel(MODEL_NAME)

In [49]:
# @title Call Gemini
######
def give_to_gemini(query):
  prompt =f"""
          당신은 영화정보를 제공하는AI 어시스턴트 입니다. 답변은 아래 내용을 따라서 답해주세요.

          1. 영화 2~3개와 이유를 2줄 이내로 먼저 답변 해주세요.
          2. 영화를 선정 하는 이유를 설명 해주세요.
          3. 전체적인 답변은 간략하게 최대 10줄을 넘지 않게 해주세요.

          <질문> : {query}
  """
  tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval())

  responses = model_1.generate_content(
      [prompt],
      tools=[tool],

  )
  return responses.text

In [53]:
for idx, neighbor in enumerate(response[0]):
    id = np.int64(neighbor.id)
    similar = df.query("id == @id", engine="python")
    print(f"{neighbor.distance:.4f} {similar.title.values[0]}")
    if neighbor.distance < 0.7:
        print("stop and give to Gemini")
        print("")
        ans = give_to_gemini(query)
        print(ans)
        break

0.6455 Gattaca
stop and give to Gemini

아바타와 비슷한 SF 느낌의 영화로는 **'월E'**와 **'인터스텔라'**를 추천합니다. 

'월E'는 환경 오염으로 황폐해진 지구를 떠난 인류가 새로운 행성을 찾아 떠나는 이야기로, 아바타와 마찬가지로 **인간과 자연의 관계, 환경 문제를 다루는 동시에 따뜻한 감동을 선사**합니다. '인터스텔라'는 멸망 위기에 처한 지구를 구하기 위해 우주로 떠나는 사람들의 이야기로, **웅장한 우주를 배경으로 인간의 희생과 숭고함, 가족애를 그려낸 작품**입니다. 

두 영화 모두 아바타처럼 뛰어난 영상미를 자랑하며, 인간의 미래와 존재에 대한 깊은 메시지를 담고 있어 아바타와 비슷한 SF적 경험을 제공할 것입니다. 



## Clean Up

In [None]:
# # wait for a confirmation
# input("Press Enter to delete Index Endpoint, Index and Cloud Storage bucket:")

# # delete Index Endpoint
# my_index_endpoint.undeploy_all()
# my_index_endpoint.delete(force=True)

# # delete Index
# my_index.delete()

# # delete Cloud Storage bucket
# ! gsutil rm -r {BUCKET_URI}

Structured search, Vector search

Vertex AI Search (콜수) - 빅쿼리에 데이터 넣고, 인덱싱해서 따로 만듦
임베딩할 필요없음
BQ, CloudSQL