Labelbox · kkim-labelbox · Jul 17, 2023 · Jul 12, 2023 · Jul 14, 2023
diff --git a/examples/basics/custom_embeddings.ipynb b/examples/basics/custom_embeddings.ipynb
@@ -31,7 +31,7 @@
       "metadata": {},
       "source": [
         "# Documentation\n",
-        "Please read this document before getting started. \n",
+        "Please read this document before getting started.\n",
         "https://docs.google.com/document/d/1C_zZFGNjXq10P1MvEX6MM0TC7HHrkFOp9BB0P_S_2MQ"
       ],
       "cell_type": "markdown"
@@ -46,7 +46,6 @@
     {
       "metadata": {},
       "source": [
-        "# labelbox\n",
         "!pip3 install -q \"labelbox[data]\""
       ],
       "cell_type": "code",
@@ -56,7 +55,7 @@
     {
       "metadata": {},
       "source": [
-        "import labelbox as lb \n",
+        "import labelbox as lb\n",
         "import numpy as np\n",
         "import json"
       ],
@@ -74,7 +73,6 @@
     {
       "metadata": {},
       "source": [
-        "# for custom embeddings\n",
         "!pip3 install -q 'git+https://github.com/Labelbox/advlib.git'"
       ],
       "cell_type": "code",
@@ -115,9 +113,10 @@
       "source": [
         "# get images from a Labelbox dataset\n",
         "# Our systems start to process data after 1000 embeddings of each type, for this demo make sure your dataset is over 1000 data rows\n",
-        "dataset = client.get_dataset(\"<ADD YOUR DATASET ID>\")\n",
+        "dataset = client.get_dataset(\"<ADD YOUR DATASET ID>\") \n",
         "drs = list(dataset.export_data_rows(timeout_seconds=9999))\n",
-        "data_row_ids = [dr.uid for dr in drs]"
+        "data_row_ids = [dr.uid for dr in drs]\n",
+        "data_row_ids = data_row_ids[:1000] # keep the first 1000 examples for the sake of this demo"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -127,19 +126,19 @@
       "metadata": {},
       "source": [
         "# Create the payload for custom embeddings\n",
-        "It should be a .ndjson file\n",
-        "It does not have to be created through python."
+        "It should be a .ndjson file.   \n",
+        "Every line is a single JSON object terminated by a \\n character.  \n",
+        "It does not have to be created through python.  "
       ],
       "cell_type": "markdown"
     },
     {
       "metadata": {},
       "source": [
-        "\n",
-        "\n",
         "nb_data_rows = len(data_row_ids)\n",
-        "# generate 1000 custom embedding vectors, of dimension 2048 each\n",
-        "# Labelbox supports custom embeddings of dimension up to 2048\n",
+        "print(\"Number of data rows: \", nb_data_rows)\n",
+        "# generate random vectors, of dimension 2048 each\n",
+        "# Labelbox supports custom embedding vectors of dimension up to 2048\n",
         "custom_embeddings = [list(np.random.random(2048)) for _ in range(nb_data_rows)]"
       ],
       "cell_type": "code",
@@ -149,7 +148,7 @@
     {
       "metadata": {},
       "source": [
-        "# create the ndjson payload for custom embeddings\n",
+        "# create the payload for custom embeddings\n",
         "payload = []\n",
         "for data_row_id,custom_embedding in zip(data_row_ids,custom_embeddings):\n",
         "  payload.append({\"id\": data_row_id, \"vector\": custom_embedding})\n",
@@ -163,19 +162,28 @@
     {
       "metadata": {},
       "source": [
-        "# convert payload to ndjson file\n",
+        "# delete any pre-existing file\n",
+        "import os\n",
+        "if os.path.exists(\"payload.ndjson\"):\n",
+        "  os.remove(\"payload.ndjson\")\n",
         "\n",
+        "# write the payload as an ndjson file: one JSON object per line\n",
         "with open('payload.ndjson', 'w') as f:\n",
-        "    sanity_check_payload = json.dump(payload, f)\n",
-        "\n",
-        "\n",
+        "  for p in payload:\n",
+        "    f.write(json.dumps(p) + \"\\n\")\n",
+        "    # sanity_check_payload = json.dump(payload, f)"
+      ],
+      "cell_type": "code",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "metadata": {},
+      "source": [
         "# sanity check that you can read/load the file and the payload is correct\n",
         "with open('payload.ndjson') as f:\n",
-        "    sanity_check_payload = json.load(f)\n",
-        "    \n",
-        "\n",
-        "print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))\n",
-        "# print(\"sanity_check_payload: \", sanity_check_payload)"
+        "    sanity_check_payload = [json.loads(l) for l in f.readlines()]\n",
+        "print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -184,7 +192,7 @@
     {
       "metadata": {},
       "source": [
-        "# See all custom embeddings available\n",
+        "# See all custom embeddings available in your Labelbox workspace\n",
         "!advtool embeddings list"
       ],
       "cell_type": "code",
@@ -194,9 +202,9 @@
     {
       "metadata": {},
       "source": [
-        "# # Create a new custom embedding\n",
+        "# Create a new custom embedding (skip this step if you want to re-use an existing one)\n",
         "!advtool embeddings create my_custom_embedding_2048_dimensions 2048\n",
-        "# will return the ID of the newly created embedding, e.g. cgbjjt5ra07710005liytdf19"
+        "# this command will return the ID of the newly created embedding, e.g. ciqtgd94607290000ljx4dvh2"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -206,7 +214,7 @@
       "metadata": {},
       "source": [
         "# # Delete a custom embedding\n",
-        "# !advtool embeddings delete cj7j0ukre0771000blj4qnxgn"
+        "# !advtool embeddings delete ciqtgd94607290000ljx4dvh2"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -222,8 +230,8 @@
     {
       "metadata": {},
       "source": [
-        "# Upload the payload to Labelbox \n",
-        "!advtool embeddings import cj7j0ukre0771000blj4qnxgn ./payload.ndjson"
+        "# Upload the payload to Labelbox. Replace the id below with the id returned by the create command above, or with any existing custom embedding id.\n",
+        "!advtool embeddings import c933bviqn0756000elk07et77 ./payload.ndjson"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -240,7 +248,7 @@
       "metadata": {},
       "source": [
         "# count how many data rows have a specific custom embedding (This can take a couple of minutes)\n",
-        "!advtool embeddings count cj7j0ukre0771000blj4qnxgn"
+        "!advtool embeddings count c933bviqn0756000elk07et77"
       ],
       "cell_type": "code",
       "outputs": [],