Simplified version of the tutorial notebook [here](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/intro-textemb-vectorsearch.ipynb).

# Install

In [None]:
# Quietly install/upgrade the Vertex AI and Cloud Storage client libraries into the user site-packages.
!pip install --upgrade --user -q google-cloud-aiplatform google-cloud-storage

In [None]:
# Restart kernel after installs so that your environment can access the new packages
import IPython
import time  # NOTE(review): not used in this cell

# Grab the running IPython application and ask its kernel to shut down;
# passing True requests a restart (the cell output reports 'restart': True).
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [None]:
# Colab-only: authenticate the notebook user (presumably so the Google Cloud
# client calls in later cells can run with these credentials -- confirm).
from google.colab import auth

auth.authenticate_user()

In [None]:
# Google Cloud project and region used by the vertexai / aiplatform
# initialisation and (project only) the BigQuery client in later cells.
PROJECT_ID = "rag-demo-feb-24"
LOCATION = "us-central1"

In [None]:
import vertexai
from google.cloud import aiplatform

# Initialise both SDK entry points with the same project and region:
# vertexai is used for the embedding model, aiplatform for the index endpoint.
vertexai.init(project=PROJECT_ID, location=LOCATION)
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Prepare Embeddings

Load dataset from BigQuery

In [None]:
import pandas as pd
from google.cloud import bigquery

# Number of Stack Overflow questions to load; this value drives the LIMIT of
# the query below (it used to be defined but ignored in favour of a literal).
QUESTIONS_SIZE = 1000

# int() guards the interpolation: only a whole number can reach the SQL text.
# NOTE(review): the ORDER BY sits inside the subquery, so the outer
# SELECT DISTINCT ... LIMIT may not honour it -- confirm whether the
# "most viewed" questions are really what is wanted here.
query = f"""
    SELECT distinct q.id, q.title
    FROM (SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions`
    where Score > 0 ORDER BY View_Count desc) AS q
    LIMIT {int(QUESTIONS_SIZE)} ;
    """

# Run the query in PROJECT_ID and materialise the result as a DataFrame
# with one row per question (columns: id, title).
rows = bigquery.Client(project=PROJECT_ID).query(query).result()
df = rows.to_dataframe()
df.head()

Unnamed: 0,id,title
0,73360276,I cannot access nor edit an Azure CDN endpoint...
1,73523815,MoviePy RuntimeError: imageio.ffmpeg.download(...
2,73415054,Three.js: moving object instances from one pla...
3,73381577,How do I store time using the Ebuka Rufus Onuc...
4,73383656,How to make a navbar button which redirects to...


Generate embeddings

In [None]:
from vertexai.preview.language_models import TextEmbeddingModel
import time
import tqdm

# Load the pretrained text-embedding model that get_embeddings_wrapper calls.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

def get_embeddings_wrapper(texts, batch_size=5):
    """Embed a list of texts with the notebook-level ``model``, batch by batch.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts sent per ``model.get_embeddings`` call.
            Defaults to 5, the batch size this notebook has always used
            (presumably the model's per-request limit -- confirm before
            raising it).

    Returns:
        A flat list holding the ``values`` of every embedding returned by the
        model, collected batch after batch. Empty when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embs = []
    for i in tqdm.tqdm(range(0, len(texts), batch_size)):
        # Pause before every request (presumably to stay under the API quota).
        time.sleep(1)
        result = model.get_embeddings(texts[i : i + batch_size])
        # extend() appends in place; the previous `embs = embs + [...]`
        # re-copied the whole accumulated list on every batch.
        embs.extend(e.values for e in result)
    return embs

# Embed every question title and attach the vectors to df as an "embedding"
# column; this is the slow step of the notebook (see the progress bar output).
df["embedding"] = get_embeddings_wrapper(df.title.tolist())
df.head()

100%|██████████| 200/200 [03:47<00:00,  1.14s/it]


Unnamed: 0,id,title,embedding
0,73360276,I cannot access nor edit an Azure CDN endpoint...,"[-0.036696285009384155, -0.021567516028881073,..."
1,73523815,MoviePy RuntimeError: imageio.ffmpeg.download(...,"[-0.02327927201986313, 0.007265365682542324, 0..."
2,73415054,Three.js: moving object instances from one pla...,"[-0.04465967416763306, -0.020884497091174126, ..."
3,73381577,How do I store time using the Ebuka Rufus Onuc...,"[-0.012737913057208061, 0.01997678354382515, -..."
4,73383656,How to make a navbar button which redirects to...,"[-0.0270612183958292, -0.0004190326726529747, ..."


Export the JSONL file with the embeddings to GCS

In [None]:
# Serialise only the id and embedding columns as JSON Lines (one record per line).
jsonl_string = df[["id", "embedding"]].to_json(orient="records", lines=True)
# Note the file holds JSON Lines even though it carries a .json extension.
with open("questions.json", "w") as f:
    f.write(jsonl_string)

# Peek at the first records to sanity-check the file format.
! head -n 3 questions.json

{"id":73360276,"embedding":[-0.036696285,-0.021567516,-0.0048369383,-0.0111121172,0.0486352704,0.0207663011,0.0702214092,0.0332371667,0.0409793034,0.0182338357,-0.0322109386,0.0696232915,0.0179792158,0.0127161033,-0.0221366733,-0.03438434,-0.0078734346,0.0027871784,0.0228261538,0.0006418196,-0.0503015593,-0.0095348489,-0.0412313975,-0.0150600672,0.0189070031,-0.0714152232,0.035199333,0.0185486563,-0.0018092254,-0.0332042202,-0.0528355762,0.0363924839,0.0078548677,-0.0206273291,-0.0141126802,-0.010203016,0.0016737519,0.0267850515,0.0288533028,0.0301698241,0.0069376589,-0.015088792,0.0246937238,0.0320671275,0.0240819771,-0.029676944,-0.0278358944,0.0357254818,-0.0202751737,-0.0492245555,-0.028363362,-0.001164688,-0.0031672786,0.0099621732,0.0062768422,0.0451618396,-0.0172826983,0.0013769255,-0.0468924604,0.0225442462,-0.0181346759,0.0065850066,0.0346495435,-0.0349658839,0.0096246256,0.0328723527,0.0254836939,0.0214202125,-0.0216856003,-0.0475550219,0.0425472967,0.0129868183,-0.016450014,

In [None]:
# Copy the embeddings file into the Cloud Storage folder (bucket path is hard-coded; change it for your own project).
! gsutil cp "questions.json" "gs://rag-demo-us/vector_search/"

Copying file://questions.json [Content-Type=application/json]...
- [1 files][  9.8 MiB/  9.8 MiB]                                                
Operation completed over 1 objects/9.8 MiB.                                      


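# Create an index and deploy it through the Google Cloud Vertex AI Vector Search console

This step is done manually in the console, not in this notebook. The Query section below needs the resulting index endpoint resource name and deployed index ID.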

# Query

In [None]:
# Embed a sample question with the same wrapper (and model) used for the dataset,
# so the query vector is comparable with the indexed vectors.
test_embeddings = get_embeddings_wrapper(["How to read JSON with Python?"])
test_embeddings

100%|██████████| 1/1 [00:01<00:00,  1.26s/it]


[[-0.009405541233718395,
  -0.012011843733489513,
  0.03272425755858421,
  -0.017712930217385292,
  0.024488620460033417,
  -0.0064089735969901085,
  0.027680082246661186,
  0.01308376993983984,
  -0.028244134038686752,
  0.01817595772445202,
  0.025589874014258385,
  0.038305673748254776,
  0.03444906696677208,
  -0.024641847237944603,
  -0.020275678485631943,
  0.0040742806158959866,
  -0.04790573567152023,
  -0.04560501500964165,
  0.0008088672766461968,
  0.026284685358405113,
  -0.04548338055610657,
  -0.019667573273181915,
  0.005759553983807564,
  0.009407688863575459,
  -0.031886711716651917,
  -0.10478324443101883,
  0.030070681124925613,
  0.013050236739218235,
  -0.007181359920650721,
  -0.03439131751656532,
  -0.04584177955985069,
  0.03567810729146004,
  -0.025517383590340614,
  -0.04891308397054672,
  0.021599987521767616,
  0.04497474059462547,
  0.04183082655072212,
  0.008171860128641129,
  0.002165100071579218,
  -0.0056815375573933125,
  0.013008587062358856,
  -0.00

In [None]:
# Get a handle on an existing index endpoint via its full resource name
# (hard-coded; replace with the endpoint created for your own project).
endpoint = aiplatform.MatchingEngineIndexEndpoint(index_endpoint_name="projects/709935237243/locations/us-central1/indexEndpoints/9160431593234366464")
endpoint

<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7fbcd75bba60> 
resource name: projects/709935237243/locations/us-central1/indexEndpoints/9160431593234366464

In [None]:
import numpy as np

# Ask the deployed index for the 20 nearest neighbours of each query vector.
response = endpoint.find_neighbors(
    deployed_index_id="rag_demo_index_1707620378200",
    queries=test_embeddings,
    num_neighbors=20,
)

# response[0] holds the neighbours of the first (and only) query embedding.
for neighbor in response[0]:
    # The id is converted with np.int64 so it can be compared with df["id"].
    # A dedicated name is used so the builtin `id` is not shadowed for the
    # remaining cells of the notebook.
    question_id = np.int64(neighbor.id)
    matching_titles = df.loc[df["id"] == question_id, "title"]
    if matching_titles.empty:
        # df may have been regenerated since the index was built; report the
        # missing id instead of failing with an IndexError.
        print(f"{neighbor.distance:.4f} <id {question_id} not found in df>")
        continue
    print(f"{neighbor.distance:.4f} {matching_titles.iloc[0]}")

0.7999 How to handle missing JSON nested keys from an API response in python?
0.7342 Unexpected end of JSON input when JSON is valid
0.7142 How to create a list from Pandas Series?
0.7100 Extract data from array - Python
0.6947 Get data for class from JSON - Flutter
0.6944 How can I deserialize row to DTO?
0.6914 pandas df of api query
0.6891 How to split a python list based on specific characters like spaces and forward slashes?
0.6875 How to scrape list of titles from a webpage?
0.6749 How to call a view from within itself in a Django REST API?
0.6736 Why I cant decode AES-CTR in python
0.6734 python 3 - how to split a key in a dictionary in 2
0.6716 Wikitable scrapping using python
0.6636 Connecting frontend and backend codes in PYTHON only
0.6635 How to display live logs from file in flask?
0.6583 Reading 409 response json in production cannot read custom error message from server
0.6571 How do you fetch content where the object contains multiple levels in vue?
0.6546 How to add ca