From 3c6bfd4f0769cf4a54351f938a6b2587b77553b2 Mon Sep 17 00:00:00 2001
From: Andrea Ovalle <74880762+ovalle15@users.noreply.github.com>
Date: Tue, 28 Feb 2023 09:15:37 -0500
Subject: [PATCH] Temp custom embeddings/metadata
---
examples/basics/custom_embeddings.ipynb | 414 ++++++++++++++++++++++++
1 file changed, 414 insertions(+)
create mode 100644 examples/basics/custom_embeddings.ipynb
diff --git a/examples/basics/custom_embeddings.ipynb b/examples/basics/custom_embeddings.ipynb
new file mode 100644
index 000000000..8caffd10d
--- /dev/null
+++ b/examples/basics/custom_embeddings.ipynb
@@ -0,0 +1,414 @@
+{
+ "cells": [
+ {
+ "metadata": {
+ "id": "QO7ZPwqmHS4E"
+ },
+ "source": [
+ "\n",
+ " \n",
+ " | "
+ ],
+ "cell_type": "markdown"
+ },
+ {
+ "metadata": {
+ "id": "1vZLw51VHS4F"
+ },
+ "source": [
+ "\n",
+ " \n",
+ " | \n",
+ "\n",
+ "\n",
+ " \n",
+ " | "
+ ],
+ "cell_type": "markdown"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "LLXAULV7YX4e"
+ },
+ "source": [
+ "# Documentation\n",
+ "Please read this document before getting started. \n",
+ "https://docs.google.com/document/d/1C_zZFGNjXq10P1MvEX6MM0TC7HHrkFOp9BB0P_S_2MQ"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fE4tguJNY-N_"
+ },
+ "source": [
+ "# Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "wRIdzkYf7T18"
+ },
+ "outputs": [],
+ "source": [
+ "# labelbox\n",
+ "!pip3 install -q labelbox[data]\n",
+ "import labelbox as lb\n",
+ "#ndjson\n",
+ "!pip3 install -q ndjson\n",
+ "import ndjson"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "iYA58iij8CRY"
+ },
+ "source": [
+ "# Install the advlib package from GitHub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "9k82ueIu8Dy1",
+ "outputId": "cc728790-fc62-4d8d-a3e7-94739ffec809"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"
+ ]
+ }
+ ],
+ "source": [
+ "# for custom embeddings\n",
+ "!pip3 install -q 'git+https://github.com/Labelbox/advlib.git'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "h9b_d8Mc6_Ge"
+ },
+ "source": [
+ "# Labelbox Credentials"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BQsGpzXK65tc"
+ },
+ "outputs": [],
+ "source": [
+ "API_KEY = \"\"\n",
+ "client = lb.Client(API_KEY)\n",
+ "\n",
+ "# expose the key as the LABELBOX_API_KEY environment variable for the shell commands below\n",
+ "%env LABELBOX_API_KEY=$API_KEY\n",
+ "# sanity check it worked (note: this prints the API key into the cell output; clear the output before sharing)\n",
+ "!echo $LABELBOX_API_KEY"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "YFBI5rSM7P5I"
+ },
+ "source": [
+ "# Select data rows in Labelbox for custom embeddings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "tOIyo5pC7PTz"
+ },
+ "outputs": [],
+ "source": [
+ "# export the data rows of a Labelbox dataset and collect their IDs\n",
+ "dataset = client.get_dataset(\"\")\n",
+ "drs = list(dataset.export_data_rows(timeout_seconds=9999))\n",
+ "data_row_ids = [dr.uid for dr in drs]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Gwufn4oh9e9i"
+ },
+ "source": [
+ "# Create the payload for custom embeddings\n",
+ "The payload should be an .ndjson file. \n",
+ "It does not have to be created through Python."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "iJFGf0w7swnW"
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "nb_data_rows = len(data_row_ids)\n",
+ "# generate one random custom embedding vector of dimension 2048 per data row\n",
+ "# Labelbox supports custom embeddings of dimension up to 2048\n",
+ "custom_embeddings = [list(np.random.random(2048)) for _ in range(nb_data_rows)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "j8gNNDmG4E8O"
+ },
+ "outputs": [],
+ "source": [
+ "# create the ndjson payload for custom embeddings\n",
+ "payload = []\n",
+ "for data_row_id,custom_embedding in zip(data_row_ids,custom_embeddings):\n",
+ " payload.append({\"id\": data_row_id, \"vector\": custom_embedding})\n",
+ "\n",
+ "print('payload', len(payload),payload[:1])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "u0ZgybLK67n0",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "d644f81a-014e-4de9-913a-74211972e9b2"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Nb of custom embedding vectors in sanity_check_payload: 1000\n",
+ "sanity_check_payload: "
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "IOPub data rate exceeded.\n",
+ "The notebook server will temporarily stop sending output\n",
+ "to the client in order to avoid crashing it.\n",
+ "To change this limit, set the config variable\n",
+ "`--NotebookApp.iopub_data_rate_limit`.\n",
+ "\n",
+ "Current values:\n",
+ "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n",
+ "NotebookApp.rate_limit_window=3.0 (secs)\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# convert payload to ndjson file\n",
+ "with open('payload.ndjson', 'w') as f:\n",
+ " ndjson.dump(payload, f)\n",
+ "\n",
+ "# sanity check that you can read/load the file and the payload is correct\n",
+ "with open('payload.ndjson') as f:\n",
+ " sanity_check_payload = ndjson.load(f)\n",
+ "\n",
+ "print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))\n",
+ "# print(\"sanity_check_payload: \", sanity_check_payload)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ygF0PYg19ibw"
+ },
+ "source": [
+ "# Pick an existing custom embedding, or create a custom embedding"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "YQeCS_U98BD2",
+ "outputId": "178dc3be-6e89-4df8-ec3d-2fa6dacc0be0"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "00000000-0000-0000-0000-000000000000 - Image Embedding (CLIP ViT-B/32) - dims: 512 \n",
+ "00000000-0000-0000-0000-000000000001 - Text embedding (All-MPNet-base-v2) - dims: 768 \n",
+ "521eadfe-f8e9-4135-9ead-fef8e9713546 - my_custom_embedding_2048_dimensions - dims: 2048 \n",
+ "a03948c1-151a-4a1a-b948-c1151a6a1a1d - ResNet50_2048_dimensions - dims: 2048 \n",
+ "baf8856a-e5f7-4781-b885-6ae5f7b78192 - my_custom_embedding - dims: 8 \n"
+ ]
+ }
+ ],
+ "source": [
+ "# List all available embeddings with their IDs, names and dimensions\n",
+ "!advtool embeddings list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "spyHzkLP67dI",
+ "outputId": "21b6fda1-7a38-4bd5-d244-dfc90b8a5090"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Embedding type created id=521eadfe-f8e9-4135-9ead-fef8e9713546\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create a new custom embedding\n",
+ "!advtool embeddings create my_custom_embedding_2048_dimensions 2048\n",
+ "# will return the ID of the newly created embedding, e.g. 0ddc5d5c-0963-41ad-9c5d-5c0963a1ad98"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Delete a custom embedding (uncomment the line below to run it)\n",
+ "# !advtool embeddings delete 521eadfe-f8e9-4135-9ead-fef8e9713546"
+ ],
+ "metadata": {
+ "id": "MafxKyncxyvR"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "CdVc42el9p74"
+ },
+ "source": [
+ "# Upload the payload to Labelbox"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "twDd5XNM67Zo",
+ "outputId": "a7715fe7-3fc3-43d0-8316-bbc45a7dee60"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Uploading file: ./payload.ndjson \n",
+ "Progress: 100.0%\n",
+ "Check 'advtool embeddings count ' for total searchable embeddings\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Upload the payload to Labelbox (replace the ID below with the ID of your own custom embedding)\n",
+ "!advtool embeddings import 521eadfe-f8e9-4135-9ead-fef8e9713546 ./payload.ndjson"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "wC0eeEPM9aAM",
+ "outputId": "5889b2f8-1a07-4748-b3bf-efab545f1417"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# count how many data rows have a specific custom embedding\n",
+ "!advtool embeddings count 521eadfe-f8e9-4135-9ead-fef8e9713546"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5AKDMJfO9Z51",
+ "outputId": "b3b6e7ca-1e99-4563-d8fe-038375008b69"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "1000\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(payload))"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}