In [1]:
!pip install --upgrade --user google-cloud-aiplatform google-cloud-storage 'google-cloud-bigquery[pandas]'



In [2]:
# The kernel has to be restarted before the packages installed above can be imported.
import IPython

IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [1]:
# get project ID
PROJECT_ID = ! gcloud config get project
PROJECT_ID = PROJECT_ID[0]
LOCATION = "us-central1"
if PROJECT_ID == "(unset)":
    print(f"Please set the project ID manually below")

In [2]:
# define project information
# Only replaces PROJECT_ID when it still holds the "(unset)" sentinel; in that
# case edit the placeholder string below to your own project ID.
if PROJECT_ID == "(unset)":
    PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# build a short ID for this session from the current local time
# (month, day, hour, minute)
from datetime import datetime

UID = f"{datetime.now():%m%d%H%M}"

In [3]:
# Enable the compute, aiplatform, storage and bigquery services on PROJECT_ID.
! gcloud services enable compute.googleapis.com aiplatform.googleapis.com storage.googleapis.com bigquery.googleapis.com --project {PROJECT_ID}


Operation "operations/acat.p2-900493032577-5431ece7-4303-4a92-938c-4b53e41b7f25" finished successfully.


In [6]:
import pandas as pd

# Prepend each question to its title so both end up in one text column,
# then discard the separate question column.
df = (
    pd.read_csv("./orva_data.csv")
    .assign(title=lambda frame: frame["question"] + " " + frame["title"])
    .drop(columns=["question"])
)
df

Unnamed: 0.1,Unnamed: 0,id,title
0,0,1,When can I start driving again? Driving after ...
1,1,2,When can I drive? Driving after a total knee r...
2,2,3,When will I be able to drive again? Driving af...
3,3,4,My knee still hurts but am I able to drive? Dr...
4,4,5,How long after surgery can I drive? Driving af...
...,...,...,...
711,711,712,I need to enter my Range of Motion Range of M...
712,712,713,Enter Range of Motion Range of Motion is show...
713,713,714,How should I ask you questions to get good ans...
714,714,715,How should I format my questions Thank you for...


In [7]:
# init the vertexai package
import vertexai

# Point the SDK at the project and location set in the cells above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [8]:
# Load the text embeddings model
from vertexai.preview.language_models import TextEmbeddingModel

# NOTE(review): this imports from the `preview` namespace and pins model version
# "@001" — confirm both are still supported by the installed SDK.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [9]:
import time
import tqdm  # to show a progress bar

# get embeddings for a list of texts
# Number of texts sent to the embedding model per request.
BATCH_SIZE = 5


def get_embeddings_wrapper(texts):
    """Embed ``texts`` in batches and return the collected embedding values.

    The texts are passed to ``model.get_embeddings`` in slices of
    ``BATCH_SIZE``, with a one-second pause before every request.

    Args:
        texts: Sequence of strings; must support ``len()`` and slicing.

    Returns:
        A list with the ``values`` attribute of every embedding the model
        returned, in the order the batches were requested. Empty when
        ``texts`` is empty.
    """
    embs = []
    for start in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        result = model.get_embeddings(texts[start : start + BATCH_SIZE])
        # Extend in place; `embs = embs + [...]` re-copied the whole list on every batch.
        embs.extend(e.values for e in result)
    return embs

In [10]:
# embed every question title and store the vectors in a new "embedding" column
title_texts = df["title"].tolist()
df = df.assign(embedding=get_embeddings_wrapper(title_texts))
df.head()

100%|██████████| 144/144 [02:39<00:00,  1.10s/it]


Unnamed: 0,id,title,embedding
0,1,When can I start driving again? Driving after ...,"[-0.0066839782521128654, -0.03569670766592026,..."
1,2,When can I drive? Driving after a total knee r...,"[-0.008440850302577019, -0.04061223939061165, ..."
2,3,When will I be able to drive again? Driving af...,"[-0.009732662700116634, -0.03572031483054161, ..."
3,4,My knee still hurts but am I able to drive? Dr...,"[-0.020311227068305016, -0.02924209088087082, ..."
4,5,How long after surgery can I drive? Driving af...,"[-0.010605815798044205, -0.036168962717056274,..."


In [11]:
import random
import numpy as np

# pick one of them as a key question
# randrange excludes its upper bound; randint(0, len(df)) could return len(df),
# which is one past the last row and makes the lookups by `key` fail.
key = random.randrange(len(df))

# calc dot product between the key and other questions
embs = np.array(df.embedding.to_list())
similarities = np.dot(embs[key], embs.T)

# print similarities for the first 5 questions
similarities[:5]

array([0.72825709, 0.72765494, 0.72793133, 0.72346979, 0.71986001])

In [12]:
# show which question was picked as the key
print(f"Key question: {df.title[key]}\n")

# rank every question by its similarity to the key and list the 20 best matches
scored_questions = zip(df.title, similarities)
sorted_questions = sorted(scored_questions, key=lambda pair: pair[1], reverse=True)[:20]
for question, similarity in sorted_questions:
    print(f"{similarity:.4f} {question}")

Key question: Is ROM important  Maintaining and gradually improving your range of motion (ROM) is crucial for a successful recovery after total knee arthroplasty. Physical therapy exercises, stretching, and controlled movements help prevent stiffness and increase flexibility in your knee joint. Adequate range of motion enables you to perform daily activities more comfortably and reduces the risk of complications such as contractures or muscle weakness.

1.0000 Is ROM important  Maintaining and gradually improving your range of motion (ROM) is crucial for a successful recovery after total knee arthroplasty. Physical therapy exercises, stretching, and controlled movements help prevent stiffness and increase flexibility in your knee joint. Adequate range of motion enables you to perform daily activities more comfortably and reduces the risk of complications such as contractures or muscle weakness.
1.0000 Is ROM important  Maintaining and gradually improving your range of motion (ROM) is c

In [13]:
# save id and embedding as a json file
jsonl_string = df[["id", "embedding"]].to_json(orient="records", lines=True)
with open("questions.json", "w") as f:
    f.write(jsonl_string)

# show the first few lines of the json file
! head -n 3 questions.json

{"id":1,"embedding":[-0.0066839783,-0.0356967077,0.0189061519,0.0149366483,-0.0205059797,-0.0159948431,0.0200969595,0.0423078164,0.0014735546,0.0186544321,0.0136704836,0.0168111566,0.033799503,-0.0034503969,-0.0104913311,-0.0199503247,-0.0636559352,-0.0468531772,0.0523984879,-0.0064493623,-0.0887796432,-0.0275822319,-0.0117647722,-0.0009986137,0.0027872189,-0.0704310164,0.000181809,-0.0529087223,-0.0483736135,0.0387966558,-0.0078149214,-0.0045516184,-0.0194039103,0.0181365721,-0.022202231,-0.0149719398,-0.0471177697,0.0197852515,-0.0233722944,0.0383752249,-0.0098919161,0.0043594711,-0.0095052021,-0.0009564502,0.026363384,0.0249010473,-0.0325681604,0.0295027699,0.0251958333,-0.004872968,-0.0335314572,-0.0117109809,0.0259424336,0.0264375582,0.0624857396,0.0126298266,-0.0506161079,-0.0085757431,-0.0283466671,0.0124333901,0.020052202,-0.0073413313,-0.0109457122,-0.0463712327,-0.0051849969,0.0108801192,0.0108611826,-0.0406404957,0.0041380697,0.0005310983,0.0593835674,0.0263276212,-0.0334484

In [14]:
# Bucket name combines the project ID with the session UID.
BUCKET_URI = f"gs://{PROJECT_ID}-orva-{UID}"
# Create the bucket in LOCATION, then copy the embeddings file into it.
! gsutil mb -l $LOCATION -p {PROJECT_ID} {BUCKET_URI}
! gsutil cp questions.json {BUCKET_URI}

Creating gs://qdmeds-orva-05030849/...
Copying file://questions.json [Content-Type=application/json]...
/ [1 files][  7.0 MiB/  7.0 MiB]                                                
Operation completed over 1 objects/7.0 MiB.                                      


In [15]:
# init the aiplatform package
from google.cloud import aiplatform

# Use the same project and location as the vertexai.init call above.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [16]:
# create index
# The index is built from the contents of BUCKET_URI (questions.json was copied there).
# NOTE(review): dimensions=768 is assumed to match the length of the vectors
# returned by the embedding model — confirm if the model is changed.
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"orva-poc-index-{UID}",
    contents_delta_uri=BUCKET_URI,
    dimensions=768,
    approximate_neighbors_count=20,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/900493032577/locations/us-central1/indexes/2613833808339795968/operations/4457852831189172224
MatchingEngineIndex created. Resource name: projects/900493032577/locations/us-central1/indexes/2613833808339795968
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/900493032577/locations/us-central1/indexes/2613833808339795968')


In [17]:
# create IndexEndpoint
# public_endpoint_enabled=True requests a public endpoint for the index.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"orva-poc-index-endpoint-{UID}",
    public_endpoint_enabled=True,
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/900493032577/locations/us-central1/indexEndpoints/3989120544548061184/operations/5421904626423169024
MatchingEngineIndexEndpoint created. Resource name: projects/900493032577/locations/us-central1/indexEndpoints/3989120544548061184
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/900493032577/locations/us-central1/indexEndpoints/3989120544548061184')


In [20]:
# ID for the deployed index; the session UID suffix keeps it distinct per run.
DEPLOYED_INDEX_ID = f"orva_poc_index_deployed_{UID}"

In [21]:
# deploy the Index to the Index Endpoint
# DEPLOYED_INDEX_ID is reused later to address this deployment in find_neighbors.
my_index_endpoint.deploy_index(index=my_index, deployed_index_id=DEPLOYED_INDEX_ID)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/900493032577/locations/us-central1/indexEndpoints/3989120544548061184
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/900493032577/locations/us-central1/indexEndpoints/3989120544548061184/operations/9069820324593270784
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/900493032577/locations/us-central1/indexEndpoints/3989120544548061184


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7f0d34246ad0> 
resource name: projects/900493032577/locations/us-central1/indexEndpoints/3989120544548061184

In [22]:
# Embed a single free-text query; the result is a list with the embedding(s) for that one text.
test_embeddings = get_embeddings_wrapper(["i want to bike?"])

100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


In [23]:
# Test query
response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=20,
)

# show the result
import numpy as np

# Only one query was sent, so its neighbors are read from response[0].
for neighbor in response[0]:
    # Named neighbor_id so the built-in id() is not shadowed for the rest of the notebook.
    neighbor_id = np.int64(neighbor.id)
    similar = df.query("id == @neighbor_id", engine="python")
    if similar.empty:
        # The index returned an ID that is not in df; report it instead of
        # raising IndexError on the empty selection.
        print(f"{neighbor.distance:.4f} <no row with id {neighbor_id}>")
        continue
    print(f"{neighbor.distance:.4f} {similar.title.values[0]}")

0.6846 can I bike Stationary cycling could be introduced as early as 2 to 3 months post-surgery, offering a safe way to build strength without putting too much strain on your knee. Transitioning to outdoor biking on flat surfaces might be possible after about 6 months, but steep trails and long distances should be avoided initially. 

Ensuring you've received approval from your surgeon or care team before starting to bike is crucial, as they can provide personalized advice based on your recovery progress.
0.6825 when can I go biking Stationary cycling could be introduced as early as 2 to 3 months post-surgery, offering a safe way to build strength without putting too much strain on your knee. Transitioning to outdoor biking on flat surfaces might be possible after about 6 months, but steep trails and long distances should be avoided initially. 

Ensuring you've received approval from your surgeon or care team before starting to bike is crucial, as they can provide personalized advice b

In [24]:
# Display the frame, which now includes the embedding column.
df

Unnamed: 0,id,title,embedding
0,1,When can I start driving again? Driving after ...,"[-0.0066839782521128654, -0.03569670766592026,..."
1,2,When can I drive? Driving after a total knee r...,"[-0.008440850302577019, -0.04061223939061165, ..."
2,3,When will I be able to drive again? Driving af...,"[-0.009732662700116634, -0.03572031483054161, ..."
3,4,My knee still hurts but am I able to drive? Dr...,"[-0.020311227068305016, -0.02924209088087082, ..."
4,5,How long after surgery can I drive? Driving af...,"[-0.010605815798044205, -0.036168962717056274,..."
...,...,...,...
711,712,I need to enter my Range of Motion Range of M...,"[-0.016848498955368996, 0.0123526556417346, 0...."
712,713,Enter Range of Motion Range of Motion is show...,"[-0.017624136060476303, 0.014032304286956787, ..."
713,714,How should I ask you questions to get good ans...,"[0.008159597404301167, 0.017265047878026962, 0..."
714,715,How should I format my questions Thank you for...,"[0.017281178385019302, 0.024744654074311256, 0..."


In [25]:
# index=False keeps the row index out of the file; writing it is what produces
# extra "Unnamed: 0" columns when the CSV is later read back with pd.read_csv.
df.to_csv("Orva_Embed_File.csv", index=False)