# Semantic Matching with Matching Engine and PaLM

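This notebook uses a sample of the [COVID-19 Open Research Dataset Challenge (CORD-19)](https://www.kaggle.com/datasets/allen-institute-for-ai/CORD-19-research-challenge): each abstract is embedded with a PaLM text embedding model, the embeddings are indexed with Vertex AI Matching Engine, and the index is then queried for the papers closest to a free-text query.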

## Setup 

In [8]:
import html
import json
import os

import pandas as pd
from google.cloud import aiplatform
from IPython import display
from vertexai.language_models import TextEmbeddingModel

In [4]:
# Region passed to `gsutil mb -l` when the bucket is created below.
REGION = "us-central1"
# `!(...)` captures the shell command's output as a list of lines; the first
# line is taken as the project ID.
PROJECT = !(gcloud config get-value core/project)
PROJECT = PROJECT[0]
# Bucket name is derived from the project ID; the embeddings file is uploaded
# to this bucket later in the notebook.
BUCKET = f"{PROJECT}-cord19-matching"

# Do not change these
# (the values are exported as environment variables)
os.environ["PROJECT"] = PROJECT
os.environ["BUCKET"] = BUCKET
os.environ["REGION"] = REGION

In [5]:
# Create the bucket in REGION only when listing it fails (`||` runs the second
# command only if the first one returns a non-zero status).
!gsutil ls gs://{BUCKET} || gsutil mb -l {REGION} gs://{BUCKET}

BucketNotFoundException: 404 gs://dherin-dev-cord19-matching bucket does not exist.
Creating gs://dherin-dev-cord19-matching/...


## Loading the data

In [6]:
# Read the gzipped CSV sample of CORD-19 metadata; pandas infers the
# compression from the file extension. The preview below shows the columns
# used later in the notebook: title, abstract and url.
metadata = pd.read_csv('../data/cord19_metadata_sample.csv.gz')
metadata.head()

Unnamed: 0,title,abstract,url
0,Ethnobotanical and ethnomedicinal analysis of ...,Algerian people largely rely on traditional me...,https://www.ncbi.nlm.nih.gov/pubmed/34131369/;...
1,1.9 Adolescents in Crisis: Psychological Impac...,,https://doi.org/10.1016/j.jaac.2021.09.022; ht...
2,Myopericarditis in a previously healthy adoles...,We report the case of a previously healthy 16‐...,https://www.ncbi.nlm.nih.gov/pubmed/34133825/;...
3,Religious Support as a Contribution to Face th...,Coping with the COVID-19 pandemic has required...,https://www.ncbi.nlm.nih.gov/pubmed/33405093/;...
4,The urgency of resuming disrupted dog rabies v...,OBJECTIVE: Dog vaccination is a cost-effective...,http://medrxiv.org/cgi/content/short/2021.04.2...


## Creating the embeddings

In [9]:
# Load the pretrained text embedding model; the same `model` object is used
# for both the abstracts and the search query further down.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [10]:
# Number of texts sent per embedding request (presumably the API's per-request
# limit for this model version -- confirm before raising it).
MAX_BATCH_SIZE = 5

# Some papers have no abstract (NaN in the frame). Fall back to the title so
# the model receives real text, and coerce everything to `str` because
# `get_embeddings` expects a list of strings rather than a pandas Series.
# Row order is preserved, so the position of each vector still matches the
# row position in `metadata`.
texts = metadata.abstract.fillna(metadata.title).astype(str).tolist()

vectors = []
for start in range(0, len(texts), MAX_BATCH_SIZE):
    batch = texts[start : start + MAX_BATCH_SIZE]
    embeddings = model.get_embeddings(batch)
    vectors.extend(embedding.values for embedding in embeddings)

# The row position is used as the datapoint ID downstream, so there must be
# exactly one vector per row.
assert len(vectors) == len(metadata), "expected one embedding per metadata row"

## Creating the matching engine input file

In [12]:
embeddings_file_path = "cord19_embeddings.json"

# Opening with "w" truncates any existing file, so re-running this cell never
# appends duplicate records and no separate shell `rm` is needed.
with open(embeddings_file_path, "w") as embeddings_file:
    for i, embedding in enumerate(vectors):
        # One JSON object per line. The ID is the row position in `metadata`,
        # written as a string (the documented input format uses string IDs);
        # the query cell converts it back with int().
        record = {"id": str(i), "embedding": embedding}
        embeddings_file.write(json.dumps(record) + "\n")

In [13]:
# Destination for the embeddings file (the bucket root); the same URI is later
# passed to the index as `contents_delta_uri`.
EMBEDDINGS_URI = f"gs://{BUCKET}"

# Upload the local embeddings file to the bucket.
!gsutil cp {embeddings_file_path} {EMBEDDINGS_URI}

Copying file://cord19_embeddings.json [Content-Type=application/json]...
- [1 files][ 64.9 MiB/ 64.9 MiB]                                                
Operation completed over 1 objects/64.9 MiB.                                     


## Creating the matching engine index

In [None]:
# Name reused as display name and description for both the index and the
# endpoint created later.
DISPLAY_NAME = "cord19-palm-embeddings"

# Build a tree-AH index from the files under EMBEDDINGS_URI. The vector size is
# read from the first embedding rather than hard-coded. The call starts a
# long-running operation (see the logged LRO name in the output).
matching_engine_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    contents_delta_uri=EMBEDDINGS_URI,
    dimensions=len(vectors[0]),
    approximate_neighbors_count=150,
    distance_measure_type="COSINE_DISTANCE",
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=7,
    description=DISPLAY_NAME,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/115851500182/locations/us-central1/indexes/6107013036109725696/operations/263368975138684928


In [None]:
# Resource name of the index created above. The next cell uses it to load the
# index by name instead of creating it again.
INDEX_RESOURCE_NAME = matching_engine_index.resource_name

print(INDEX_RESOURCE_NAME)

In [None]:
# Load the existing index by its resource name, so it does not have to be
# created again once INDEX_RESOURCE_NAME is known.
matching_engine_index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

In [None]:
# Create the index endpoint that the index is deployed to below. A public
# endpoint is requested via `public_endpoint_enabled=True`.
matching_engine_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=DISPLAY_NAME,
    description=DISPLAY_NAME,
    public_endpoint_enabled=True,
)

In [None]:
# Deployed index IDs must start with a letter and contain only letters,
# digits and underscores, so hyphens are not allowed here.
DEPLOYED_INDEX_ID = "cord19_deployed_index"

In [None]:
# Deploy the index to the endpoint under DEPLOYED_INDEX_ID. The object
# returned by `deploy_index` is kept as `matching_engine` and is the one
# queried in the next section.
matching_engine = matching_engine_endpoint.deploy_index(
    index=matching_engine_index, deployed_index_id=DEPLOYED_INDEX_ID
)

# Show what is currently deployed on it.
matching_engine.deployed_indexes

## Querying Matching Engine

In [None]:
QUERY = "SARS-CoV-2"

# Embed the query with the same model that embedded the abstracts. The result
# is a list holding one vector, which is passed as `queries` in the next cell.
query_embeddings = model.get_embeddings([QUERY])
text_embeddings = [embedding.values for embedding in query_embeddings]

In [None]:
# Define number of neighbors to return
NUM_NEIGHBORS = 20

# Query the deployed index with the embedded query. `response` is indexed by
# query position in the next cell (`response[0]` for the single query sent).
response = matching_engine.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=text_embeddings,
    num_neighbors=NUM_NEIGHBORS,
)

response

In [None]:
# Only one query was sent, so its neighbors are the first entry of `response`.
neighbors = response[0]
matched_ids = [int(neighbor.id) for neighbor in neighbors]
matched_distances = [neighbor.distance for neighbor in neighbors]

# The IDs are row labels of `metadata`, so one label-based lookup fetches
# every matched row, in the order the neighbors were returned.
matched_rows = metadata.loc[matched_ids]
matched_titles = matched_rows["title"].tolist()
matched_abstracts = matched_rows["abstract"].tolist()
matched_urls = matched_rows["url"].tolist()

matches = pd.DataFrame(
    dict(
        distance=matched_distances,
        title=matched_titles,
        abstract=matched_abstracts,
        url=matched_urls,
    )
)
matches

In [None]:
# Render the matches as an ordered list of linked titles with their abstracts.
# All dataset text is HTML-escaped before being placed into the markup.
items = []
for match in matches.itertuples(index=False):
    # The `url` field can hold several ";"-separated links; keep the first.
    # Split before escaping, since escaping introduces ";" (e.g. "&amp;").
    raw_url = "" if pd.isna(match.url) else str(match.url).split(";")[0].strip()
    href = html.escape(raw_url)
    title = "" if pd.isna(match.title) else html.escape(str(match.title))
    # Papers without an abstract get an empty paragraph instead of "nan".
    abstract = "" if pd.isna(match.abstract) else html.escape(str(match.abstract))
    items.append(
        f"""
    <li>
        <article>
            <header>
                <a href="{href}"> <h2>{title}</h2></a>
            </header>
            <p>{abstract}</p>
        </article>
    </li>
    """
    )

# Close the <ol> that the page opens so the markup is well formed.
page = "<html><body><ol>" + "".join(items) + "</ol></body></html>"
display.HTML(page)

## Cleaning Up

In [None]:
# Delete the resources created in this notebook, using the names they were
# bound to above. `force=True` undeploys any deployed index before the
# endpoint itself is deleted; the index can only be deleted afterwards.
matching_engine_endpoint.delete(force=True)
matching_engine_index.delete()

Copyright 2023 Google Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.