Skip to content
Merged

SN-69 #1172

Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 37 additions & 29 deletions examples/basics/custom_embeddings.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"metadata": {},
"source": [
"# Documentation\n",
"Please read this document before getting started. \n",
"Please read this document before getting started.\n",
"https://docs.google.com/document/d/1C_zZFGNjXq10P1MvEX6MM0TC7HHrkFOp9BB0P_S_2MQ"
],
"cell_type": "markdown"
Expand All @@ -46,7 +46,6 @@
{
"metadata": {},
"source": [
"# labelbox\n",
"!pip3 install -q \"labelbox[data]\""
],
"cell_type": "code",
Expand All @@ -56,7 +55,7 @@
{
"metadata": {},
"source": [
"import labelbox as lb \n",
"import labelbox as lb\n",
"import numpy as np\n",
"import json"
],
Expand All @@ -74,7 +73,6 @@
{
"metadata": {},
"source": [
"# for custom embeddings\n",
"!pip3 install -q 'git+https://github.com/Labelbox/advlib.git'"
],
"cell_type": "code",
Expand Down Expand Up @@ -115,9 +113,10 @@
"source": [
"# get images from a Labelbox dataset\n",
"# Our systems start to process data after 1000 embeddings of each type, for this demo make sure your dataset is over 1000 data rows\n",
"dataset = client.get_dataset(\"<ADD YOUR DATASET ID>\")\n",
"dataset = client.get_dataset(\"<ADD YOUR DATASET ID>\") \n",
"drs = list(dataset.export_data_rows(timeout_seconds=9999))\n",
"data_row_ids = [dr.uid for dr in drs]"
"data_row_ids = [dr.uid for dr in drs]\n",
"data_row_ids = data_row_ids[:1000] # keep the first 1000 examples for the sake of this demo"
],
"cell_type": "code",
"outputs": [],
Expand All @@ -127,19 +126,19 @@
"metadata": {},
"source": [
"# Create the payload for custom embeddings\n",
"It should be a .ndjson file\n",
"It does not have to be created through python."
"It should be a .ndjson file. \n",
"Every line is a json file that finishes with a \\n character. \n",
"It does not have to be created through python. "
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"\n",
"\n",
"nb_data_rows = len(data_row_ids)\n",
"# generate 1000 custom embedding vectors, of dimension 2048 each\n",
"# Labelbox supports custom embeddings of dimension up to 2048\n",
"print(\"Number of data rows: \", nb_data_rows)\n",
"# generate random vectors, of dimension 2048 each\n",
"# Labelbox supports custom embedding vectors of dimension up to 2048\n",
"custom_embeddings = [list(np.random.random(2048)) for _ in range(nb_data_rows)]"
],
"cell_type": "code",
Expand All @@ -149,7 +148,7 @@
{
"metadata": {},
"source": [
"# create the ndjson payload for custom embeddings\n",
"# create the payload for custom embeddings\n",
"payload = []\n",
"for data_row_id,custom_embedding in zip(data_row_ids,custom_embeddings):\n",
" payload.append({\"id\": data_row_id, \"vector\": custom_embedding})\n",
Expand All @@ -163,19 +162,28 @@
{
"metadata": {},
"source": [
"# convert payload to ndjson file\n",
"# delete any pre-existing file\n",
"import os\n",
"if os.path.exists(\"payload.ndjson\"):\n",
" os.remove(\"payload.ndjson\")\n",
"\n",
"# convert the payload to json file\n",
"with open('payload.ndjson', 'w') as f:\n",
" sanity_check_payload = json.dump(payload, f)\n",
"\n",
"\n",
" for p in payload:\n",
" f.write(json.dumps(p) + \"\\n\")\n",
" # sanity_check_payload = json.dump(payload, f)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# sanity check that you can read/load the file and the payload is correct\n",
"with open('payload.ndjson') as f:\n",
" sanity_check_payload = json.load(f)\n",
" \n",
"\n",
"print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))\n",
"# print(\"sanity_check_payload: \", sanity_check_payload)"
" sanity_check_payload = [json.loads(l) for l in f.readlines()]\n",
"print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))"
],
"cell_type": "code",
"outputs": [],
Expand All @@ -184,7 +192,7 @@
{
"metadata": {},
"source": [
"# See all custom embeddings available\n",
"# See all custom embeddings available in your Labelbox workspace\n",
"!advtool embeddings list"
],
"cell_type": "code",
Expand All @@ -194,9 +202,9 @@
{
"metadata": {},
"source": [
"# # Create a new custom embedding\n",
"# # Create a new custom embedding, unless you want to re-use one\n",
"!advtool embeddings create my_custom_embedding_2048_dimensions 2048\n",
"# will return the ID of the newly created embedding, e.g. cgbjjt5ra07710005liytdf19"
"# this command will return the ID of the newly created embedding, e.g. ciqtgd94607290000ljx4dvh2"
],
"cell_type": "code",
"outputs": [],
Expand All @@ -206,7 +214,7 @@
"metadata": {},
"source": [
"# # Delete a custom embedding\n",
"# !advtool embeddings delete cj7j0ukre0771000blj4qnxgn"
"# !advtool embeddings delete ciqtgd94607290000ljx4dvh2"
],
"cell_type": "code",
"outputs": [],
Expand All @@ -222,8 +230,8 @@
{
"metadata": {},
"source": [
"# Upload the payload to Labelbox \n",
"!advtool embeddings import cj7j0ukre0771000blj4qnxgn ./payload.ndjson"
"# Replace the current id with the newly generated id from the previous step, or any existing custom embedding id.\n",
"!advtool embeddings import c933bviqn0756000elk07et77 ./payload.ndjson"
],
"cell_type": "code",
"outputs": [],
Expand All @@ -240,7 +248,7 @@
"metadata": {},
"source": [
"# count how many data rows have a specific custom embedding (This can take a couple of minutes)\n",
"!advtool embeddings count cj7j0ukre0771000blj4qnxgn"
"!advtool embeddings count c933bviqn0756000elk07et77"
],
"cell_type": "code",
"outputs": [],
Expand Down