# Install libraries

In [None]:
!pip install google-cloud-aiplatform



# Authenticate Colab

In [None]:
import sys

# Authenticate interactively only when the Colab module is loaded in this
# interpreter; in any other runtime this cell does nothing.
if 'google.colab' in sys.modules:
    from google.colab import auth as google_auth
    google_auth.authenticate_user()

In [None]:
# Notebook-wide settings. The `# @param` annotations are kept verbatim.
PROJECT_ID = 'solutions-2023-mar-107' # @param {type:"string"}
REGION = 'us-central1' # @param {type:"string"}
#BUCKET_URI = "gs://vector_search_regional/flipkart_multimodal_embeddings" # @param {type:"string"} # WHERE EMBEDDINGS ARE STORED
BUCKET_URI = 'gs://vector_search_regional/test_filterings'# @param {type:"string"}
# Regional API host derived from REGION.
ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

In [None]:
from google.cloud import aiplatform
from google.cloud import bigquery

# Configure the Vertex AI SDK with the project, region and staging bucket
# chosen in the configuration cell.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

# BigQuery client bound to the same project.
client = bigquery.Client(project=PROJECT_ID)

# Prepare Data to create Vector Search Index with filters

Sample data in JSON format, with one restrict per category level (pretty-printed here; the generated file holds one record per line):

```
{"id": "43_T", "embedding": [0.6, 1.0],
"restricts": [
{"namespace": "L0", "allow": ["<c0_name>"]},
{"namespace": "L1", "allow": ["<c1_name>"]},
{"namespace": "L2", "allow": ["<c2_name>"]},
{"namespace": "L3", "allow": ["<c3_name>"]}
]
}


{"id": "43_I", "embedding": [0.6, 1.0],
"restricts": [
{"namespace": "L0", "allow": ["<c0_name>"]},
{"namespace": "L1", "allow": ["<c1_name>"]},
{"namespace": "L2", "allow": ["<c2_name>"]},
{"namespace": "L3", "allow": ["<c3_name>"]}
]
}
```

In [None]:
from google.cloud import bigquery
# Client bound to PROJECT_ID; the table reference below carries no project prefix.
client = bigquery.Client(PROJECT_ID)
# Fetch every column of the training table (id, embedding and the L0-L3 category levels).
query_job = client.query("""
   SELECT *
   FROM `flipkart.training_embeddings_with_cat_for_filtering`;""")

results = query_job.result() # Wait for the job to complete.

In [None]:
# Materialise the query result as a pandas DataFrame.
data = results.to_dataframe()

In [None]:
# Display the frame; the rendered output elides the middle rows.
data

Unnamed: 0,id,embedding,L0,L1,L2,L3
0,9f56d6e1481d35f9677a69a983aa81ee_T,"[0.0305967834, -0.0117346784, -0.0113393087, -...",Furniture,Pet Furniture,,
1,efb5a934cefac9456baef772f5d97f52_T,"[0.017123403, -0.0647764653, 0.0103280917, 0.0...",Footwear,Women's Footwear,REMSON INDIA Women Flats,
2,abd4482126e006bd6f3ce9825e9449bc_T,"[0.0181311239, -0.0240292437, 0.00982811209, -...",Mobiles & Accessories,Tablet Accessories,Cases & Covers,MannMohh Cases & Covers
3,635df3139a893577a21554cd4ace1f35_T,"[0.0425712056, -0.0244968068, 0.00395903178, 0...",Mobiles & Accessories,Tablet Accessories,Cases & Covers,kasemantra Cases & Covers
4,135524f18e0ac6e3d008fa81b1576dfa_T,"[-0.0492946468, -0.0495145917, 0.0119116539, -...",Clothing,Women's Clothing,Western Wear,"Shirts, Tops & Tunics"
...,...,...,...,...,...,...
36369,dad7943c3791dfb2d669942e61dfc25d_I,"[-0.00362257822, 0.0448172726, 0.0305924937, 0...",Home Decor & Festive Needs,Showpieces,Ona'S Showpieces,
36370,906b3a5912453ffced2dbd0fc4bd495c_I,"[-0.013985265, 0.0400636345, 0.0220041461, -0....",Footwear,Women's Footwear,Casual Shoes,Boots
36371,2fa79e6a06305fa2ea23a343841b78c3_I,"[-0.0203494355, 0.0481191799, 0.00561492844, -...",Clothing,Women's Clothing,Western Wear,"Shirts, Tops & Tunics"
36372,084ae0b12e0672abfc7f9d125bd1e15b_I,"[-0.0286370851, -0.0289862268, -0.00473337155,...",Computers,Network Components,Routers,Onnet Routers


In [None]:
def adding_filters(data, levels=('L0', 'L1', 'L2', 'L3')):
  """Attach a ``restricts`` list built from a row's category levels.

  Written for ``DataFrame.apply(adding_filters, axis=1)``.

  Args:
    data: One row (a ``pandas.Series`` or any mapping that supports item
      assignment) holding the category columns named in ``levels``.
    levels: Category columns ordered from broadest to most specific. Each
      column name is also used as the restrict namespace. Defaults to the
      four levels used in this notebook.

  Returns:
    The same ``data`` object with a ``'restricts'`` entry added: a list of
    ``{'namespace': <level>, 'allow': [<value>]}`` dicts, one per populated
    level. The walk stops at the first missing level, so a deeper level is
    never emitted without all of its parents.
  """
  import pandas as pd  # local import keeps the function self-contained

  restricts = []
  for level in levels:
    value = data[level]
    # NaN and pd.NA need an explicit test: NaN is truthy, so a bare
    # truthiness check would emit it as an allowed token, and pd.NA raises
    # when coerced to bool. None and '' are caught by `not value`.
    if pd.isna(value) or not value:
      break
    restricts.append({'namespace': level, 'allow': [value]})

  data['restricts'] = restricts
  return data


# Run adding_filters on every row (axis=1); the returned frame, which carries
# the new 'restricts' column, replaces `data`.
data = data.apply(adding_filters, axis=1)

In [None]:
import json

# Write one JSON record per line, keeping only id, embedding and restricts.
data.loc[:, ['id', 'embedding', 'restricts']].to_json(
    'sample.json',
    orient='records',
    lines=True,
)

In [None]:
#Upload json into GCS
!gsutil cp -r sample.json gs://vector_search_regional/test_filterings

Copying file://sample.json [Content-Type=application/json]...
/ [0 files][    0.0 B/658.0 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

- [1 files][658.0 MiB/658.0 MiB]   46.1 MiB/s                                   
Operation completed over 1 objects/658.0 MiB.                                    


In [None]:
# Echo the configured bucket location.
BUCKET_URI

'gs://vector_search_regional/test_filterings'

# Prepare Data to update Vector Search Index with filters

Sample data in JSON format, with one restrict per category level (pretty-printed here; the generated file holds one record per line):

```
{"id": "43_T", "embedding": [0.6, 1.0],
"restricts": [
{"namespace": "L0", "allow": ["<c0_name>"]},
{"namespace": "L1", "allow": ["<c1_name>"]},
{"namespace": "L2", "allow": ["<c2_name>"]},
{"namespace": "L3", "allow": ["<c3_name>"]}
]
}


{"id": "43_I", "embedding": [0.6, 1.0],
"restricts": [
{"namespace": "L0", "allow": ["<c0_name>"]},
{"namespace": "L1", "allow": ["<c1_name>"]},
{"namespace": "L2", "allow": ["<c2_name>"]},
{"namespace": "L3", "allow": ["<c3_name>"]}
]
}
```

**Steps**
* Read the test dataset from BQ
* Match the category column names to index
* Filter & keep only required columns
* Upload this df to bq
* Run the query to get the format accepted by Vector Search
* Save the results in json format into gcs bucket
* Use that bucket as contentsDeltaUri and update the index

Read the test dataset from BQ

In [None]:
# Read the golden test dataset; a few of its rows are converted into the index format further down.
from google.cloud import bigquery
client = bigquery.Client(PROJECT_ID)
query_job = client.query("""
   SELECT *
   FROM `solutions-2023-mar-107.flipkart.flipkart_golden_test`;""")

res = query_job.result() # Wait for the job to complete.

In [None]:
# Materialise the test-set query result as a pandas DataFrame.
test_data = res.to_dataframe()

In [None]:
# Print column names, non-null counts and dtypes.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 173 non-null    object
 1   description        173 non-null    object
 2   L0                 173 non-null    object
 3   L1                 173 non-null    object
 4   L2                 161 non-null    object
 5   L3                 120 non-null    object
 6   image_uri          173 non-null    object
 7   text_embedding     173 non-null    object
 8   image_embedding    173 non-null    object
 9   manual_validation  173 non-null    Int64 
 10  train_val_count    173 non-null    Int64 
dtypes: Int64(2), object(9)
memory usage: 15.3+ KB


Match the category column names to index

In [None]:
# Map the c*_name category columns onto the L0-L3 names used for the index.
test_data = test_data.rename(
    columns={
        'c0_name': 'L0',
        'c1_name': 'L1',
        'c2_name': 'L2',
        'c3_name': 'L3',
    }
)

In [None]:
# Preview the first rows after the rename.
test_data.head()

Unnamed: 0,id,description,L0,L1,L2,L3,image_uri,text_embedding,image_embedding,manual_validation,train_val_count
0,dbdac18a8ee5a8a48238b9685c96e90a,Timewel 1100-N1949_S Analog Watch - For Women ...,Watches,Wrist Watches,Timewel Wrist Watches,,gs://genai-product-catalog/flipkart_20k_oct26/...,"[0.0173654296, -0.0533204265, -0.0123991454, 0...","[0.00376954675, 0.0619891584, -0.0349791907, 0...",1,1
1,8a771d8dfa97d06278038945dfe6b936,Chappin & Nellson CNL-50-White Analog Watch - ...,Watches,Wrist Watches,Chappin & Nellson Wrist Watches,,gs://genai-product-catalog/flipkart_20k_oct26/...,"[0.0395723879, -0.046936553, -0.0225308761, 0....","[-0.0022665204, 0.0369679928, -0.00598961441, ...",1,1
2,894904e26516d491bf1c7711fe800e78,"Only Kidz 20600 Digital Watch - For Boys, Girl...",Watches,Wrist Watches,Only Kidz Wrist Watches,,gs://genai-product-catalog/flipkart_20k_oct26/...,"[0.0234976951, -0.0287721325, -0.00390096451, ...","[0.0150629, 0.039315924, -0.0305093117, -0.008...",1,1
3,138f8455457c6cf87a0b94e132c485a8,Gift Island SW13392C Fashionin Digital Watch -...,Watches,Wrist Watches,Gift Island Wrist Watches,,gs://genai-product-catalog/flipkart_20k_oct26/...,"[0.0199023429, -0.0493029393, -0.0240550581, 0...","[0.0141698951, 0.038551461, -0.00148873532, 0....",1,1
4,7c973b8fb2069b2142aea3473b70c213,"Key Features of Sakhi Styles Men, Boys Materia...",Watches,Watch Accessories,Wrist Bands,Sakhi Styles Wrist Bands,gs://genai-product-catalog/flipkart_20k_oct26/...,"[0.00237481017, -0.0487776175, 0.0100880247, 0...","[0.00239414768, 0.0396352187, -0.0179609824, 0...",1,1


Filter & keep only required columns

In [None]:
# Keep the id, both embeddings and the category levels, first 10 rows only.
test_df = test_data.loc[
    :, ['id', 'text_embedding', 'image_embedding', 'L0', 'L1', 'L2', 'L3']
].head(10)

Upload this df to bq

In [None]:
from google.cloud import bigquery

def create_table(client, table_id, schema):
    """Create the BigQuery table ``table_id`` with ``schema`` and print its name.

    ``exists_ok=True`` is passed to ``client.create_table``, so the call does
    not fail when the table is already present.

    Args:
        client: BigQuery client used to issue the request.
        table_id: Identifier of the table to create.
        schema: Schema passed to ``bigquery.Table``.
    """
    created = client.create_table(  # Make an API request
        bigquery.Table(table_id, schema=schema),
        exists_ok=True,
    )
    print(f"Created table {created.project}.{created.dataset_id}.{created.table_id}")
def upload_df_into_bq(client, table_id, df):
    """Load ``df`` into the BigQuery table ``table_id``, replacing its contents.

    Args:
        client: BigQuery client used to submit the load job.
        table_id: Destination table identifier; echoed verbatim in the log line.
        df: DataFrame to upload.

    Note:
        The load schema is read from the module-level ``schema`` variable, so
        that variable must be defined before this function is called.
    """
    job_config = bigquery.LoadJobConfig(schema=schema)
    # WRITE_TRUNCATE replaces the table contents instead of appending to them.
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    # Rely on the explicit schema rather than schema auto-detection.
    job_config.autodetect = False
    job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
    job.result()  # Wait for the load job to complete.
    # Only table_id is reported. The previous message interpolated a `PROJECT`
    # global that no live cell defines, so it raised NameError after the upload
    # had already succeeded, and it repeated the project for qualified ids.
    print("Uploaded dataframe into table {}".format(table_id))

In [None]:
# Fully-qualified destination table for the sampled test rows.
table_id = 'solutions-2023-mar-107.flipkart.test_data_for_index_update'

# id is required, the category levels are nullable strings and both
# embeddings are repeated FLOAT fields.
schema = [bigquery.SchemaField('id', 'STRING', mode='REQUIRED')]
schema += [
    bigquery.SchemaField(level, 'STRING', mode='NULLABLE')
    for level in ('L0', 'L1', 'L2', 'L3')
]
schema += [
    bigquery.SchemaField(column, 'FLOAT', mode='REPEATED')
    for column in ('text_embedding', 'image_embedding')
]

client = bigquery.Client(project=PROJECT_ID)

create_table(client, table_id, schema)
upload_df_into_bq(client, table_id, test_df)


Created table solutions-2023-mar-107.flipkart.test_data_for_index_update
Uploaded dataframe into table solutions-2023-mar-107.solutions-2023-mar-107.flipkart.test_data_for_index_update


**Query to be run**

SELECT CONCAT(id,'_T') as id, text_embedding as embedding, L0, L1, L2, L3

FROM `<PROJECT_ID>.<DATASET_ID>.<TABLE_ID>`

UNION ALL

SELECT CONCAT(id,'_I') as id, image_embedding as embedding, L0, L1, L2, L3

FROM `<PROJECT_ID>.<DATASET_ID>.<TABLE_ID>`;

In [None]:
from google.cloud import bigquery
client = bigquery.Client(PROJECT_ID)
# Stack the text ('_T' suffix) and image ('_I' suffix) embeddings into a single
# id / embedding layout. Both branches alias their columns the same way; the
# second branch previously aliased the id expression as `embedding` and left
# image_embedding unaliased.
query_job = client.query("""
  SELECT CONCAT(id,'_T') as id, text_embedding as embedding, L0, L1, L2, L3

  FROM `solutions-2023-mar-107.flipkart.test_data_for_index_update`

  UNION ALL

  SELECT CONCAT(id,'_I') as id, image_embedding as embedding, L0, L1, L2, L3

  FROM `solutions-2023-mar-107.flipkart.test_data_for_index_update`;
   """)

res = query_job.result() # Wait for the job to complete.

In [None]:
# Convert the query result to a DataFrame.
# NOTE(review): `res` is rebound from the query result to the frame, so running
# this cell a second time would call to_dataframe() on a DataFrame.
res = res.to_dataframe()

In [None]:
# Display the combined text/image embedding rows.
res

Unnamed: 0,id,embedding,L0,L1,L2,L3
0,dbdac18a8ee5a8a48238b9685c96e90a_T,"[0.0173654296, -0.0533204265, -0.0123991454, 0...",Watches,Wrist Watches,Timewel Wrist Watches,
1,8a771d8dfa97d06278038945dfe6b936_T,"[0.0395723879, -0.046936553, -0.0225308761, 0....",Watches,Wrist Watches,Chappin & Nellson Wrist Watches,
2,894904e26516d491bf1c7711fe800e78_T,"[0.0234976951, -0.0287721325, -0.00390096451, ...",Watches,Wrist Watches,Only Kidz Wrist Watches,
3,138f8455457c6cf87a0b94e132c485a8_T,"[0.0199023429, -0.0493029393, -0.0240550581, 0...",Watches,Wrist Watches,Gift Island Wrist Watches,
4,7c973b8fb2069b2142aea3473b70c213_T,"[0.00237481017, -0.0487776175, 0.0100880247, 0...",Watches,Watch Accessories,Wrist Bands,Sakhi Styles Wrist Bands
5,81d73f4a7add96d46146ac4e192aad92_T,"[0.00450034346, -0.0472053625, 0.0043839491, -...",Clothing,Kids' Clothing,Girls Wear,Innerwear & Sleepwear
6,140225e6d36138c0c79f4b97d42456bd_T,"[-0.0411048084, -0.0358454958, 0.0511916205, 0...",Clothing,Men's Clothing,T-Shirts,Ocean Race T-Shirts
7,9ac56e95bf79b7a4268387b4c8efdd52_T,"[0.00935253594, -0.0527491495, 0.0230338033, 0...",Clothing,Men's Clothing,T-Shirts,Nimya T-Shirts
8,35c289ac8c50c49fae6d06e37ce34d42_T,"[-0.0280462336, -0.0645570904, 0.0218884815, 0...",Clothing,Men's Clothing,Shirts,Casual & Party Wear Shirts
9,fef6a5aa8c590c8029bbc11903cbd554_T,"[0.0218668692, -0.0588314161, 0.00145428523, 0...",Clothing,Men's Clothing,T-Shirts,Nimya T-Shirts


In [None]:
# Build the `restricts` field from L0-L3 (the same step applied to the
# index-creation file) and keep only id, embedding and restricts, matching the
# sample format described in this notebook. `lines` takes a boolean.
res.apply(adding_filters, axis=1)[['id', 'embedding', 'restricts']].to_json(
    'test_data_for_index_update_10datapoints.json',
    orient='records',
    lines=True,
)

In [None]:
# GCS location that receives the index-update file.
INPUT_DIR = 'gs://vector_search_regional/flipkart_batch_update'

In [None]:
!gsutil cp test_data_for_index_update_10datapoints.json $INPUT_DIR

Copying file://test_data_for_index_update_10datapoints.json [Content-Type=application/json]...
/ [1 files][368.6 KiB/368.6 KiB]                                                
Operation completed over 1 objects/368.6 KiB.                                    
