In [None]:
! pip install spacy google-cloud-aiplatform

In [None]:
! python -m spacy download en_core_web_md

In [13]:
# Get data
import spacy
nlp = spacy.load('en_core_web_md')

In [14]:
! wget https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt

--2023-01-25 04:05:56--  https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75153 (73K) [text/plain]
Saving to: ‘google-10000-english-no-swears.txt’


2023-01-25 04:05:57 (8.79 MB/s) - ‘google-10000-english-no-swears.txt’ saved [75153/75153]



In [15]:
# import string
# string.ascii_letters

# { word for word in list(nlp.vocab.strings) if all([character in string.ascii_letters for character in word])}

In [16]:
with open("google-10000-english-no-swears.txt", "r") as f:
    words = f.readlines()
    
words = list({word.strip() for word in words})

In [17]:
word_vectors = [nlp.vocab[word].vector for word in words]

In [18]:
# import numpy as np

# np.array(word_vectors)

In [20]:
# np.count_nonzero(nlp.vocab["jam"].vector)

In [48]:
import numpy as np
# Convert to embeddings
word_embeddings = []
for word in words:
    embedding = nlp.vocab[word].vector
    
    if not np.count_nonzero(embedding) > 0:
        print(f"Not found: {word}")        
    else:
        word_embeddings.append((word, embedding))

Not found: dealtime
Not found: macromedia
Not found: oclc
Not found: beastality
Not found: vsnet
Not found: knowledgestorm
Not found: nextel
Not found: uniprotkb
Not found: invision
Not found: techrepublic
Not found: freebsd
Not found: kelkoo
Not found: worldcat
Not found: zdnet
Not found: shopzilla
Not found: minolta
Not found: voyuer
Not found: ddr
Not found: sagem
Not found: urw
Not found: xhtml
Not found: gamespot
Not found: cdna
Not found: gpl
Not found: prostores
Not found: trembl
Not found: livecam
Not found: msgstr
Not found: zope
Not found: msie
Not found: thehun
Not found: findarticles
Not found: tft
Not found: enb
Not found: pgp
Not found: ntsc
Not found: sparc
Not found: findlaw
Not found: liechtenstein
Not found: symantec
Not found: fioricet
Not found: looksmart
Not found: hewlett
Not found: verzeichnis
Not found: mpegs
Not found: expansys
Not found: jvc
Not found: pichunter
Not found: nutten
Not found: lycos
Not found: titten
Not found: issn
Not found: listprice
Not found

https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/matching_engine/sdk_matching_engine_for_indexing.ipynb

In [50]:
import json

with open("word_embeddings_formatted.json", "w") as f:
    embeddings_formatted = [
        json.dumps(
            {
                "id": word,
                "embedding": [str(value) for value in embedding],
            }
        )
        + "\n"
        for word, embedding in word_embeddings
    ]
    f.writelines(embeddings_formatted)

In [51]:
PROJECT_ID = "python-docs-samples-tests"
REGION = "us-central1"
BUCKET_URI = "gs://ivanmkc-test2"
DIMENSIONS = len(word_embeddings[0][1])
# DIMENSIONS = 300
DISPLAY_NAME = "10000_common_words_spacy"

In [52]:
EMBEDDINGS_INITIAL_URI = f"{BUCKET_URI}/matching_engine/initial/"
! gsutil cp word_embeddings_formatted.json {EMBEDDINGS_INITIAL_URI}

Copying file://word_embeddings_formatted.json [Content-Type=application/json]...
\ [1 files][ 30.4 MiB/ 30.4 MiB]                                                
Operation completed over 1 objects/30.4 MiB.                                     


In [53]:
# ! gsutil rm -r {EMBEDDINGS_INITIAL_URI}

In [54]:
# Add to matching engine
import os

from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

In [55]:
tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    contents_delta_uri=EMBEDDINGS_INITIAL_URI,
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=80,
    description="Spacy ANN index",
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/1012616486416/locations/us-central1/indexes/3975878026503127040/operations/3584114130786713600
MatchingEngineIndex created. Resource name: projects/1012616486416/locations/us-central1/indexes/3975878026503127040
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/1012616486416/locations/us-central1/indexes/3975878026503127040')


In [None]:
tree_ah_index

## Create an IndexEndpoint with VPC Network

In [57]:
# Retrieve the project number
PROJECT_NUMBER = !gcloud projects list --filter="PROJECT_ID:'{PROJECT_ID}'" --format='value(PROJECT_NUMBER)'
PROJECT_NUMBER = PROJECT_NUMBER[0]

VPC_NETWORK = "ucaip-haystack-vpc-network"
VPC_NETWORK_FULL = "projects/{}/global/networks/{}".format(PROJECT_NUMBER, VPC_NETWORK)
VPC_NETWORK_FULL

'projects/1012616486416/global/networks/ucaip-haystack-vpc-network'

In [58]:
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"{DISPLAY_NAME}_endpoint",
    description="10000 common words",
    network=VPC_NETWORK_FULL,
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/1012616486416/locations/us-central1/indexEndpoints/3118505247442468864/operations/874636004969938944
MatchingEngineIndexEndpoint created. Resource name: projects/1012616486416/locations/us-central1/indexEndpoints/3118505247442468864
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/1012616486416/locations/us-central1/indexEndpoints/3118505247442468864')


In [63]:
DEPLOYED_INDEX_ID = "tree_ah_glove_deployed_unique_3"

## Deploy Indexes

In [None]:
my_index_endpoint = my_index_endpoint.deploy_index(
    index=tree_ah_index, deployed_index_id=DEPLOYED_INDEX_ID
)

my_index_endpoint.deployed_indexes

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/1012616486416/locations/us-central1/indexEndpoints/3118505247442468864
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/1012616486416/locations/us-central1/indexEndpoints/3118505247442468864/operations/7529267404363268096


## Make prediction

In [None]:
# my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint("projects/1012616486416/locations/us-central1/indexEndpoints/2844911570079711232")

In [107]:
test = nlp.vocab["steak"].vector.tolist()

In [108]:
# Test query
NUM_NEIGHBOURS = 50

# Test query
response = my_index_endpoint.match(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=[test],
    num_neighbors=NUM_NEIGHBOURS,
)

matches_all = [match for matches in response for match in matches]

In [109]:
sorted(matches_all, key=lambda x: x.distance, reverse=True)

[MatchNeighbor(id='meat', distance=1728.51806640625),
 MatchNeighbor(id='beef', distance=1705.654541015625),
 MatchNeighbor(id='pork', distance=1614.385009765625),
 MatchNeighbor(id='dish', distance=1570.297119140625),
 MatchNeighbor(id='meal', distance=1497.3118896484375),
 MatchNeighbor(id='ham', distance=1426.80859375),
 MatchNeighbor(id='sauce', distance=1418.895751953125),
 MatchNeighbor(id='pie', distance=1406.8028564453125),
 MatchNeighbor(id='eat', distance=1347.900634765625),
 MatchNeighbor(id='cook', distance=1332.3133544921875),
 MatchNeighbor(id='pan', distance=1287.2825927734375),
 MatchNeighbor(id='fish', distance=1245.0908203125),
 MatchNeighbor(id='salad', distance=1227.3583984375),
 MatchNeighbor(id='lamb', distance=1216.9248046875),
 MatchNeighbor(id='ct', distance=1216.5611572265625),
 MatchNeighbor(id='oz', distance=1216.5611572265625),
 MatchNeighbor(id='pcs', distance=1216.5611572265625),
 MatchNeighbor(id='pk', distance=1216.5611572265625),
 MatchNeighbor(id='qt'