<td>
   <a target="_blank" href="https://labelbox.com" ><img src="https://labelbox.com/blog/content/images/2021/02/logo-v4.svg" width=256/></a>
</td>

<td>
<a href="https://colab.research.google.com/github/Labelbox/labelbox-python/blob/master/examples/integrations/huggingface/huggingface.ipynb" target="_blank"><img
src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
</td>

<td>
<a href="https://github.com/Labelbox/labelbox-python/tree/master/examples/integrations/huggingface/huggingface.ipynb" target="_blank"><img
src="https://img.shields.io/badge/GitHub-100000?logo=github&logoColor=white" alt="GitHub"></a>
</td>

# Imports

In [None]:
# for labelbox
!pip3 install -q labelbox[data]
import labelbox as lb
# for custom embeddings in Labelbox
!pip3 install -q 'git+https://github.com/Labelbox/advlib.git'
#ndjson
!pip3 install -q ndjson
import ndjson
import time
import json

# Labelbox Credentials

In [None]:
import os
from getpass import getpass

# Never paste the key into the notebook source: take it from the environment
# when it is already set there, otherwise prompt for it without echoing.
API_KEY = os.environ.get("LABELBOX_API_KEY") or getpass("Labelbox API key: ")
client = lb.Client(API_KEY)

# Export the key to the kernel's environment; `!` shell commands run later in
# the notebook inherit it (equivalent to the `%env` magic).
os.environ["LABELBOX_API_KEY"] = API_KEY
# sanity check it worked -- report only that the key is set, never its value,
# so the secret does not end up in the saved cell output
print("LABELBOX_API_KEY is set:", bool(os.environ.get("LABELBOX_API_KEY")))

# Select data rows in Labelbox for custom embeddings

In [None]:
# Turn on the client's experimental features.
client.enable_experimental = True

# The images come from an existing Labelbox dataset.
# Labelbox starts processing embeddings once there are 1000 of a given type,
# so for this demo pick a dataset with more than 1000 data rows.
dataset_id = "<Data Set ID>"
dataset = client.get_dataset(dataset_id)

# Start the export, then wait for it to finish.
export_task = dataset.export()
export_task.wait_till_done()

In [None]:
# Parsed export records are collected here, one entry per streamed item.
data_rows = []


def json_stream_handler(output: lb.JsonConverterOutput):
  """Parse the JSON string of one streamed export item and append it to `data_rows`."""
  data_rows.append(json.loads(output.json_str))


def print_export_error(error):
  """Print one item received from the export's error stream."""
  print(error)


# Report whatever the export task flagged as errors.
if export_task.has_errors():
  error_stream = export_task.get_stream(
    converter=lb.JsonConverter(),
    stream_type=lb.StreamType.ERRORS,
  )
  error_stream.start(stream_handler=print_export_error)

# Feed every result item through `json_stream_handler`.
if export_task.has_result():
  result_stream = export_task.get_stream(
    converter=lb.JsonConverter(),
    stream_type=lb.StreamType.RESULT,
  )
  export_json = result_stream.start(stream_handler=json_stream_handler)

In [None]:
# Fail early with an actionable message: with an empty export the sanity check
# below would otherwise die with an opaque IndexError on `[0]`.
assert data_rows, "export returned no data rows - check the dataset ID and any export errors"

# Each exported record carries its ID and its asset location under the "data_row" key.
data_row_ids = [dr["data_row"]["id"] for dr in data_rows]
data_row_urls = [dr["data_row"]["row_data"] for dr in data_rows]

# sanity check
print('number of data rows:',len(data_row_ids))
print('data_row_ids[0]',data_row_ids[0])
print('data_row_urls[0]',data_row_urls[0])

# Get a HuggingFace Model to generate custom embeddings

In [None]:
# import HuggingFace
!pip3 install -q transformers
!pip3 install -q timm

# load a neural network from HuggingFace
import transformers
# only let CRITICAL messages (numeric level 50) through from the transformers logger
transformers.logging.set_verbosity(transformers.logging.CRITICAL)
import torch
import torch.nn.functional as F
# `import PIL` alone does not load the `PIL.Image` submodule used later in the
# notebook, so import it explicitly (this still binds the name `PIL`).
import PIL.Image
import requests
from tqdm import tqdm

# get ResNet-50 -- the image processor and the model are loaded from the same checkpoint
RESNET_CHECKPOINT = "microsoft/resnet-50"
image_processor = transformers.AutoImageProcessor.from_pretrained(RESNET_CHECKPOINT)
model = transformers.ResNetModel.from_pretrained(RESNET_CHECKPOINT)

# Pick an existing custom embedding in Labelbox, or create a custom embedding

In [None]:
# See all custom embeddings available to your org
!advtool embeddings list

# Create a new custom embedding if needed
# The two arguments are presumably the embedding's name followed by its number
# of dimensions -- confirm against the advtool documentation.
# NOTE(review): this cell runs unconditionally, so "Run All" presumably creates
# another embedding each time -- skip it when reusing an existing embedding.
!advtool embeddings create ResNet50_2048_dimensions 2048 # anything between 8 and 2048 dimensions are supported
# This returns the ID of the newly created embedding, e.g. 0ddc5d5c-0963-41ad-9c5d-5c0963a1ad98

# Generate and upload custom embeddings
We generate custom embeddings in batches of 512 images and upload each batch to Labelbox.


In [None]:
custom_embeddings = {}
print_debug = True
batch_size = 512

# iterate over images in batches of size 512
for i in range(0, len(data_row_urls), batch_size):

  try:
      print('iteration: ',i)
      start = time.time()

      # chunk of images in the batch
      data_row_urls_chunk = data_row_urls[i:i+batch_size]
      data_row_ids_chunk = data_row_ids[i:i+batch_size]
      # download images
      imgs = [PIL.Image.open(requests.get(data_row_url, stream=True).raw).convert('RGB').resize((224, 224)) for data_row_url in data_row_urls_chunk]
      # process images
      img_hf = image_processor(imgs, return_tensors="pt")
      # generate resnet embeddings, thanks to inference
      with torch.no_grad():
        last_layer = model(**img_hf, output_hidden_states=True).last_hidden_state
      # max pool to reduce dimensionality
      resnet_embeddings = F.adaptive_avg_pool2d(last_layer, (1, 1))
      resnet_embeddings = torch.flatten(resnet_embeddings,start_dim=1,end_dim=3) # flatten custom embedding

      # convert resnet embeddings, from pytorch to python lists
      resnet_embeddings = resnet_embeddings.tolist()

      # Store resnet embeddings in NDJson file
      payload = []
      for (data_row_id,resnet_embedding) in zip(data_row_ids_chunk,resnet_embeddings):
        payload.append({"id": data_row_id, "vector": resnet_embedding})
      # store ndjson on disk. it takes take too much memory to keep them all in memory...
      # the NDJson file will be the payload for custom embeddings
      with open('payload.ndjson', 'w') as f:
          ndjson.dump(payload, f)

      # Upload the NDJson file to Labelbox
      !advtool embeddings import a03948c1-151a-4a1a-b948-c1151a6a1a1d ./payload.ndjson

      end = time.time()
      print('time taken for iteration: ',end-start)

  except Exception:
      print('error: ', i)
      continue  # or you could use 'pass'


# Check the upload went well

In [None]:
# count how many data rows have a specific custom embedding
# NOTE: the embedding ID is hard-coded; it has to be the same ID that the
# vectors were imported into -- replace it with the ID of your own embedding.
!advtool embeddings count a03948c1-151a-4a1a-b948-c1151a6a1a1d