<td>
   <a target="_blank" href="https://labelbox.com" ><img src="https://labelbox.com/blog/content/images/2021/02/logo-v4.svg" width=256/></a>
</td>

<td>
<a href="https://colab.research.google.com/github/Labelbox/labelbox-python/blob/master/examples/basics/custom_embeddings.ipynb" target="_blank"><img
src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
</td>

<td>
<a href="https://github.com/Labelbox/labelbox-python/tree/master/examples/basics/custom_embeddings.ipynb" target="_blank"><img
src="https://img.shields.io/badge/GitHub-100000?logo=github&logoColor=white" alt="GitHub"></a>
</td>

# Documentation
Please read this document before getting started.
https://docs.google.com/document/d/1C_zZFGNjXq10P1MvEX6MM0TC7HHrkFOp9BB0P_S_2MQ

# Imports

In [None]:
!pip3 install -q "labelbox[data]"

In [None]:
import labelbox as lb
import numpy as np
import json

# Install the wheel from GitHub

In [None]:
!pip3 install -q 'git+https://github.com/Labelbox/advlib.git'

# Labelbox Credentials

In [None]:
# Paste your Labelbox API key here. Never commit or share a notebook that
# contains a real key.
API_KEY = "<ADD YOUR LABELBOX CREDENTIALS>"
client = lb.Client(API_KEY)

# Export the key as LABELBOX_API_KEY in the kernel's environment; the `!` shell
# commands run later in this notebook inherit it.
# Assigning to os.environ (instead of `%env ...` followed by `!echo ...`) keeps
# the secret out of the saved cell output.
# NOTE(review): advtool presumably reads LABELBOX_API_KEY to authenticate — confirm.
import os
os.environ["LABELBOX_API_KEY"] = API_KEY
# sanity check it worked, without printing the key itself
print("LABELBOX_API_KEY is set:", os.environ.get("LABELBOX_API_KEY") == API_KEY)

# Select data rows in Labelbox for custom embeddings

In [None]:
# Fetch the data rows of a Labelbox dataset.
# Our systems start to process data after 1000 embeddings of each type, so for
# this demo make sure your dataset has over 1000 data rows.
dataset = client.get_dataset("<ADD YOUR DATASET ID>")
drs = list(dataset.export_data_rows(timeout_seconds=9999))
# Only the first 1000 data rows are kept for the sake of this demo.
data_row_ids = [dr.uid for dr in drs[:1000]]

# Create the payload for custom embeddings
It should be a .ndjson file.   
Each line is a JSON object terminated by a \n character.  
It does not have to be created with Python.  

In [None]:
# Dimension of every embedding vector generated below.
# Labelbox supports custom embedding vectors of dimension up to 2048.
EMBEDDING_DIM = 2048

nb_data_rows = len(data_row_ids)
print("Number of data rows: ", nb_data_rows)

# Generate one random vector per data row. The generator is seeded so that
# Restart & Run All reproduces the same vectors.
rng = np.random.default_rng(0)
# A single (nb_data_rows, EMBEDDING_DIM) draw replaces the per-row loop, and
# .tolist() yields nested lists of plain Python floats (not NumPy scalars).
custom_embeddings = rng.random((nb_data_rows, EMBEDDING_DIM)).tolist()

In [None]:
# Build the payload: one {"id": ..., "vector": ...} record per data row,
# pairing each data row id with the embedding at the same position.
payload = [
  {"id": row_id, "vector": vector}
  for row_id, vector in zip(data_row_ids, custom_embeddings)
]

# show the record count and the first record as a quick visual check
print('payload', len(payload), payload[:1])

In [None]:
# start from a clean slate: remove a payload file left over from an earlier run
import os
if os.path.exists("payload.ndjson"):
  os.remove("payload.ndjson")

# serialise the payload as NDJSON: one JSON document per line, each ending in "\n"
with open('payload.ndjson', 'w') as f:
  f.writelines(json.dumps(record) + "\n" for record in payload)

In [None]:
# Sanity check: read the file back and parse every line as JSON, to confirm
# it loads and holds the expected number of records.
with open('payload.ndjson') as f:
    sanity_check_payload = [json.loads(line) for line in f]
print("Nb of custom embedding vectors in sanity_check_payload: ", len(sanity_check_payload))

In [None]:
# See all custom embeddings available in your Labelbox workspace.
# NOTE(review): advtool presumably authenticates with the LABELBOX_API_KEY
# environment variable set in the credentials cell — confirm.
!advtool embeddings list

In [None]:
# Create a new custom embedding, unless you want to re-use one.
# NOTE(review): the arguments are presumably the embedding name followed by its
# dimension; 2048 matches the vectors generated earlier in this notebook — confirm.
!advtool embeddings create my_custom_embedding_2048_dimensions 2048
# this command will return the ID of the newly created embedding, e.g. ciqtgd94607290000ljx4dvh2

In [None]:
# Delete a custom embedding (uncomment the line below and replace the ID)
# !advtool embeddings delete ciqtgd94607290000ljx4dvh2

# Upload the payload to Labelbox

In [None]:
# Replace the ID below with the newly generated ID from the previous step, or with any existing custom embedding ID.
# ./payload.ndjson is the file written earlier in this notebook.
!advtool embeddings import c933bviqn0756000elk07et77 ./payload.ndjson

# Pick an existing custom embedding, or create a custom embedding

In [None]:
# Count how many data rows have a specific custom embedding (this can take a couple of minutes).
# Use the same embedding ID as in the import command above.
!advtool embeddings count c933bviqn0756000elk07et77

In [None]:
# Number of records in the local payload.
# NOTE(review): presumably meant to be compared with the count reported by advtool above — confirm.
print(len(payload))