From 3c6bfd4f0769cf4a54351f938a6b2587b77553b2 Mon Sep 17 00:00:00 2001 From: Andrea Ovalle <74880762+ovalle15@users.noreply.github.com> Date: Tue, 28 Feb 2023 09:15:37 -0500 Subject: [PATCH] Temp custom embeddings/metadata --- examples/basics/custom_embeddings.ipynb | 414 ++++++++++++++++++++++++ 1 file changed, 414 insertions(+) create mode 100644 examples/basics/custom_embeddings.ipynb diff --git a/examples/basics/custom_embeddings.ipynb b/examples/basics/custom_embeddings.ipynb new file mode 100644 index 000000000..8caffd10d --- /dev/null +++ b/examples/basics/custom_embeddings.ipynb @@ -0,0 +1,414 @@ +{ + "cells": [ + { + "metadata": { + "id": "QO7ZPwqmHS4E" + }, + "source": [ + "\n", + " \n", + "" + ], + "cell_type": "markdown" + }, + { + "metadata": { + "id": "1vZLw51VHS4F" + }, + "source": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "" + ], + "cell_type": "markdown" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLXAULV7YX4e" + }, + "source": [ + "# Documentation\n", + "Please read this document before getting started. \n", + "https://docs.google.com/document/d/1C_zZFGNjXq10P1MvEX6MM0TC7HHrkFOp9BB0P_S_2MQ" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fE4tguJNY-N_" + }, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRIdzkYf7T18" + }, + "outputs": [], + "source": [ + "# labelbox\n", + "!pip3 install -q labelbox[data]\n", + "import labelbox as lb\n", + "#ndjson\n", + "!pip3 install -q ndjson\n", + "import ndjson" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iYA58iij8CRY" + }, + "source": [ + "# Install the wheel from Github" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9k82ueIu8Dy1", + "outputId": "cc728790-fc62-4d8d-a3e7-94739ffec809" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "# for custom embeddings\n", + "!pip3 install -q 'git+https://github.com/Labelbox/advlib.git'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "h9b_d8Mc6_Ge" + }, + "source": [ + "# Labelbox Credentials" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BQsGpzXK65tc" + }, + "outputs": [], + "source": [ + "API_KEY = \"\"\n", + "client = lb.Client(API_KEY)\n", + "\n", + "# set LABELBOX_API_KEY in bash\n", + "%env LABELBOX_API_KEY=$API_KEY\n", + "# sanity check it worked\n", + "!echo $LABELBOX_API_KEY" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YFBI5rSM7P5I" + }, + "source": [ + "# Select data rows in Labelbox for custom embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tOIyo5pC7PTz" + }, + "outputs": [], + "source": [ + "# get images from a Labelbox dataset\n", + "dataset = client.get_dataset(\"\")\n", + "drs = list(dataset.export_data_rows(timeout_seconds=9999))\n", + "data_row_ids = [dr.uid for dr in drs]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Gwufn4oh9e9i" + }, + "source": [ + "# Create the payload for custom embeddings\n", + "It should be a .ndjson file\n", + "It does not have to be created through python." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iJFGf0w7swnW" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "nb_data_rows = len(data_row_ids)\n", + "# generate 1000 custom embedding vectors, of dimension 2048 each\n", + "# Labelbox supports custom embeddings of dimension up to 2048\n", + "custom_embeddings = [list(np.random.random(2048)) for _ in range(nb_data_rows)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "j8gNNDmG4E8O" + }, + "outputs": [], + "source": [ + "# create the ndjson payload for custom embeddings\n", + "payload = []\n", + "for data_row_id,custom_embedding in zip(data_row_ids,custom_embeddings):\n", + " payload.append({\"id\": data_row_id, \"vector\": custom_embedding})\n", + "\n", + "print('payload', len(payload),payload[:1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "u0ZgybLK67n0", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d644f81a-014e-4de9-913a-74211972e9b2" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Nb of custom embedding vectors in sanity_check_payload: 1000\n", + "sanity_check_payload: " + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "IOPub data rate exceeded.\n", + "The notebook server will temporarily stop sending output\n", + "to the client in order to avoid crashing it.\n", + "To change this limit, set the config variable\n", + "`--NotebookApp.iopub_data_rate_limit`.\n", + "\n", + "Current values:\n", + "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n", + "NotebookApp.rate_limit_window=3.0 (secs)\n", + "\n" + ] + } + ], + "source": [ + "# convert payload to ndjson file\n", + "with open('payload.ndjson', 'w') as f:\n", + " ndjson.dump(payload, f)\n", + "\n", + "# sanity check that you can read/load the file and the payload is correct\n", + "with open('payload.ndjson') as f:\n", + " sanity_check_payload = ndjson.load(f)\n", + "\n", + "print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))\n", + "# print(\"sanity_check_payload: \", sanity_check_payload)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ygF0PYg19ibw" + }, + "source": [ + "# Pick an existing custom embedding, or create a custom embedding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YQeCS_U98BD2", + "outputId": "178dc3be-6e89-4df8-ec3d-2fa6dacc0be0" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "00000000-0000-0000-0000-000000000000 - Image Embedding (CLIP ViT-B/32) - dims: 512 \n", + "00000000-0000-0000-0000-000000000001 - Text embedding (All-MPNet-base-v2) - dims: 768 \n", + "521eadfe-f8e9-4135-9ead-fef8e9713546 - my_custom_embedding_2048_dimensions - dims: 2048 \n", + "a03948c1-151a-4a1a-b948-c1151a6a1a1d - ResNet50_2048_dimensions - dims: 2048 \n", + "baf8856a-e5f7-4781-b885-6ae5f7b78192 - my_custom_embedding - dims: 8 \n" + ] + } + ], + "source": [ + "# See all custom embeddings available\n", + "!advtool embeddings list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "spyHzkLP67dI", + "outputId": "21b6fda1-7a38-4bd5-d244-dfc90b8a5090" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Embedding type created id=521eadfe-f8e9-4135-9ead-fef8e9713546\n" + ] + } + ], + "source": [ + "# # Create a new custom embedding\n", + "!advtool embeddings create my_custom_embedding_2048_dimensions 2048\n", + "# will return the ID of the newly created embedding, e.g. 0ddc5d5c-0963-41ad-9c5d-5c0963a1ad98" + ] + }, + { + "cell_type": "code", + "source": [ + "# # Delete a custom embedding\n", + "# !advtool embeddings delete 521eadfe-f8e9-4135-9ead-fef8e9713546" + ], + "metadata": { + "id": "MafxKyncxyvR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CdVc42el9p74" + }, + "source": [ + "# Upload the payload to Labelbox" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "twDd5XNM67Zo", + "outputId": "a7715fe7-3fc3-43d0-8316-bbc45a7dee60" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Uploading file: ./payload.ndjson \n", + "Progress: 100.0%\n", + "Check 'advtool embeddings count ' for total searchable embeddings\n" + ] + } + ], + "source": [ + "# Upload the payload to Labelbox\n", + "!advtool embeddings import 521eadfe-f8e9-4135-9ead-fef8e9713546 ./payload.ndjson" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wC0eeEPM9aAM", + "outputId": "5889b2f8-1a07-4748-b3bf-efab545f1417" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0\n" + ] + } + ], + "source": [ + "# count how many data rows have a specific custom embedding\n", + "!advtool embeddings count 521eadfe-f8e9-4135-9ead-fef8e9713546" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5AKDMJfO9Z51", + "outputId": "b3b6e7ca-1e99-4563-d8fe-038375008b69" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1000\n" + ] + } + ], + "source": [ + "print(len(payload))" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}