From 8712feb0da2d8b685441d8e0e3c618346150c7a3 Mon Sep 17 00:00:00 2001 From: Andrea Ovalle <74880762+ovalle15@users.noreply.github.com> Date: Thu, 1 Dec 2022 13:08:44 -0500 Subject: [PATCH] Params' updates Notebook used to create this pr : https://colab.research.google.com/drive/1qdxNUj7kyOLDaFaygI5-q5dYFP37louu?usp=sharing#scrollTo=6ffec272 --- examples/annotation_import/pdf.ipynb | 792 +++++++++++---------------- 1 file changed, 332 insertions(+), 460 deletions(-) diff --git a/examples/annotation_import/pdf.ipynb b/examples/annotation_import/pdf.ipynb index cb04c273a..afd5b29c5 100644 --- a/examples/annotation_import/pdf.ipynb +++ b/examples/annotation_import/pdf.ipynb @@ -1,473 +1,345 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "7ede57bf", - "metadata": {}, - "source": [ - "\n", - " \n", - "" - ] - }, - { - "cell_type": "markdown", - "id": "e2c69e42", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "id": "cef9c510", - "metadata": {}, - "source": [ - "# PDF Annotation Import\n", - "* This notebook will provide examples of each supported annotation type for PDF assets. It will cover the following:\n", - " * Model-Assisted Labeling (MAL) - used to provide pre-annotated data for your labelers. This will enable a reduction in the total amount of time to properly label your assets. Model-assisted labeling does not submit the labels automatically, and will need to be reviewed by a labeler for submission." - ] - }, - { - "cell_type": "markdown", - "id": "8c877b9c", - "metadata": {}, - "source": [ - "* For information on what types of annotations are supported per data type, refer to this documentation:\n", - " * https://docs.labelbox.com/docs/model-assisted-labeling#option-1-import-via-python-annotation-types-recommended" - ] - }, - { - "cell_type": "markdown", - "id": "e7e5d296", - "metadata": {}, - "source": [ - "* Notes:\n", - " * Wait until the import job is complete before opening the Editor to make sure all annotations are imported properly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d8d554f", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -q 'labelbox[data]'" - ] - }, - { - "cell_type": "markdown", - "id": "573525c5", - "metadata": {}, - "source": [ - "# Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "e3522d4b", - "metadata": {}, - "outputs": [], - "source": [ - "from labelbox.schema.ontology import OntologyBuilder, Tool, Classification, Option\n", - "from labelbox import Client, LabelingFrontend, MALPredictionImport\n", - "from labelbox.data.annotation_types import (\n", - " Label, ImageData, ObjectAnnotation, \n", - " Rectangle, Point,\n", - " Radio, Checklist, Text,\n", - " ClassificationAnnotation, ClassificationAnswer\n", - ")\n", - "from labelbox.data.serialization import NDJsonConverter\n", - "from labelbox.schema.media_type import MediaType\n", - "import uuid\n", - "import json" - ] - }, - { - "cell_type": "markdown", - "id": "e3036435", - "metadata": {}, - "source": [ - "# API Key and Client\n", - "Provide a valid api key below in order to properly connect to the Labelbox Client." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "6ffec272", - "metadata": {}, - "outputs": [], - "source": [ - "# Add your api key\n", - "API_KEY = \"YOUR API KEY\"\n", - "client = Client(api_key=API_KEY)" - ] - }, - { - "cell_type": "markdown", - "id": "70c4d820", - "metadata": {}, - "source": [ - "---- \n", - "### Steps\n", - "1. Make sure project is setup\n", - "2. Collect annotations\n", - "3. Upload" - ] - }, - { - "cell_type": "markdown", - "id": "5d7fc082", - "metadata": {}, - "source": [ - "### Project setup" - ] - }, - { - "cell_type": "markdown", - "id": "d0bc3c39", - "metadata": {}, - "source": [ - "First, we create an ontology with all the possible tools and classifications supported for PDF. The official list of supported annotations to import can be found here:\n", - "- [Model-Assisted Labeling](https://docs.labelbox.com/docs/model-assisted-labeling) (annotations/labels are not submitted)\n", - "- [PDF Annotations](https://docs.labelbox.com/docs/document-annotations)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f93aebd8", - "metadata": {}, - "outputs": [], - "source": [ - "ontology_builder = OntologyBuilder(\n", - " tools=[ \n", - " Tool( # Bounding Box tool given the name \"box\"\n", - " tool=Tool.Type.BBOX, \n", - " name=\"box\")], \n", - " classifications=[ \n", - " Classification( # Text classification given the name \"text\"\n", - " class_type=Classification.Type.TEXT,\n", - " instructions=\"text\"), \n", - " Classification( # Checklist classification given the name \"text\" with two options: \"first_checklist_answer\" and \"second_checklist_answer\"\n", - " class_type=Classification.Type.CHECKLIST, \n", - " instructions=\"checklist\", \n", - " options=[\n", - " Option(value=\"first_checklist_answer\"),\n", - " Option(value=\"second_checklist_answer\") \n", - " ]\n", - " ), \n", - " Classification( # Radio classification given the name \"text\" with two options: \"first_radio_answer\" and \"second_radio_answer\"\n", - " class_type=Classification.Type.RADIO, \n", - " instructions=\"radio\", \n", - " options=[\n", - " Option(value=\"first_radio_answer\"),\n", - " Option(value=\"second_radio_answer\")\n", - " ]\n", - " )\n", - " ]\n", - ")" - ] + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } }, - { - "cell_type": "code", - "execution_count": 5, - "id": "e290fd6e", - "metadata": {}, - "outputs": [ + "cells": [ { - "data": { - "text/plain": [ - "OntologyBuilder(tools=[Tool(tool=, name='box', required=False, color=None, classifications=[], schema_id=None, feature_schema_id=None)], classifications=[Classification(class_type=, instructions='text', required=False, options=[], schema_id=None, feature_schema_id=None, scope=None), Classification(class_type=, instructions='checklist', required=False, options=[Option(value='first_checklist_answer', label='first_checklist_answer', schema_id=None, feature_schema_id=None, options=[]), Option(value='second_checklist_answer', label='second_checklist_answer', schema_id=None, feature_schema_id=None, options=[])], schema_id=None, feature_schema_id=None, scope=None), Classification(class_type=, instructions='radio', required=False, options=[Option(value='first_radio_answer', label='first_radio_answer', schema_id=None, feature_schema_id=None, options=[]), Option(value='second_radio_answer', label='second_radio_answer', schema_id=None, feature_schema_id=None, options=[])], schema_id=None, feature_schema_id=None, scope=None)])" + "cell_type": "markdown", + "metadata": { + "id": "7ede57bf" + }, + "source": [ + "\n", + " \n", + "" ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ontology_builder" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "27961645", - "metadata": {}, - "outputs": [], - "source": [ - "# Create two Labelbox projects\n", - "mal_project = client.create_project(name=\"pdf_mal_project\", media_type=MediaType.Document)\n", - "\n", - "# Create one Labelbox dataset\n", - "dataset = client.create_dataset(name=\"pdf_annotation_import_demo_dataset\")\n", - "\n", - "# Grab an example image and create a Labelbox data row\n", - "test_pdf_url = \"https://www.buds.com.ua/images/Lorem_ipsum.pdf\"\n", - "data_row = dataset.create_data_row(row_data=test_pdf_url)\n", - "\n", - "# Setup your ontology / labeling editor\n", - "editor = next(client.get_labeling_frontends(where=LabelingFrontend.name == \"Editor\")) # Unless using a custom editor, do not modify this\n", - "\n", - "mal_project.setup(editor, ontology_builder.asdict()) # Connect your ontology and editor to your MAL project\n", - "mal_project.datasets.connect(dataset) # Connect your dataset to your MAL project" - ] - }, - { - "cell_type": "markdown", - "id": "db26d55d", - "metadata": {}, - "source": [ - "### Create Label using Annotation Type Objects\n", - "* It is recommended to use the Python SDK's annotation types for importing into Labelbox." - ] - }, - { - "cell_type": "markdown", - "id": "f409d3cc", - "metadata": {}, - "source": [ - "### Object Annotations" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "5cbe5b5a", - "metadata": {}, - "outputs": [], - "source": [ - "box_annotation = ObjectAnnotation(\n", - " name='box', \n", - " extra={'unit': 'POINTS','page': 0}, #pages are 0-indexed, 0 indicates page 1 \n", - " value=Rectangle(\n", - " extra={}, \n", - " start=Point(x=557,y=898),\n", - " end=Point(x=852,y=1140)\n", - " ))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4896ea42", - "metadata": {}, - "outputs": [], - "source": [ - "text_annotation=ClassificationAnnotation(\n", - " value=Text( # String value for the text annotation\n", - " answer=\"the answer to the text question\" \n", - " ), \n", - " name=\"text\" # Name of the classification in your ontology\n", - ")\n", - "\n", - "checklist_annotation=ClassificationAnnotation(\n", - " value=Checklist(\n", - " answer=[ # List of the checklist answers in your ontology\n", - " ClassificationAnswer(name=\"first_checklist_answer\"),\n", - " ClassificationAnswer(name=\"second_checklist_answer\")\n", - " ]\n", - " ), \n", - " name=\"checklist\" # Name of the classification in your ontology\n", - ")\n", - "\n", - "radio_annotation=ClassificationAnnotation(\n", - " value=Radio(\n", - " answer=ClassificationAnswer(\n", - " name=\"second_radio_answer\" # Name of the radio answer in your ontology\n", - " )\n", - " ), \n", - " name=\"radio\" # Name of the classification in your ontology\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "0f79ecc9", - "metadata": {}, - "source": [ - "### Create a Label object with all of our annotations" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "64ac1f74", - "metadata": {}, - "outputs": [ + }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonathantso/opt/anaconda3/lib/python3.8/site-packages/labelbox/data/annotation_types/classification/classification.py:85: UserWarning: Dropdown classification is deprecated and will be removed in a future release\n", - " warnings.warn(\"Dropdown classification is deprecated and will be \"\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "e2c69e42" + }, + "source": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "" + ] }, { - "data": { - "text/plain": [ - "{'uid': None,\n", - " 'data': ImageData(im_bytes=None,file_path=None,url=None,arr=None),\n", - " 'annotations': [ObjectAnnotation(name='box', feature_schema_id=None, extra={'unit': 'POINTS', 'page': 0}, value=Rectangle(extra={}, start=Point(extra={}, x=557.0, y=898.0), end=Point(extra={}, x=852.0, y=1140.0)), classifications=[]),\n", - " ClassificationAnnotation(name='text', feature_schema_id=None, extra={}, value=Text(answer='the answer to the text question')),\n", - " ClassificationAnnotation(name='checklist', feature_schema_id=None, extra={}, value=Checklist(name='checklist', answer=[ClassificationAnswer(name='first_checklist_answer', feature_schema_id=None, extra={}, keyframe=None), ClassificationAnswer(name='second_checklist_answer', feature_schema_id=None, extra={}, keyframe=None)])),\n", - " ClassificationAnnotation(name='radio', feature_schema_id=None, extra={}, value=Radio(answer=ClassificationAnswer(name='second_radio_answer', feature_schema_id=None, extra={}, keyframe=None)))],\n", - " 'extra': {}}" + "cell_type": "markdown", + "metadata": { + "id": "cef9c510" + }, + "source": [ + "# PDF Annotation Import" ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create a Label object by identifying the applicable data row in Labelbox and providing a list of annotations\n", - "label = Label(\n", - " data=ImageData(\n", - " uid=data_row.uid),\n", - " annotations = [\n", - " box_annotation, \n", - " text_annotation, checklist_annotation, radio_annotation\n", - " ]\n", - ")\n", - "\n", - "\n", - "label.__dict__" - ] - }, - { - "cell_type": "markdown", - "id": "c7988155", - "metadata": {}, - "source": [ - "### Model Assisted Labeling " - ] - }, - { - "cell_type": "markdown", - "id": "b33ef622", - "metadata": {}, - "source": [ - "To do model-assisted labeling, we need to convert a Label object into an NDJSON. \n", - "\n", - "This is easily done with using the NDJSONConverter class\n", - "\n", - "Notes:\n", - "* the NDJsonConverter takes in a list of labels" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "8794d8aa", - "metadata": { - "scrolled": false - }, - "outputs": [ + }, { - "data": { - "text/plain": [ - "[{'uuid': '4a1ca900-85d7-4643-883e-277170000f9c',\n", - " 'dataRow': {'id': 'cl7hrmmi90m5c0y6h5nk00u02'},\n", - " 'name': 'box',\n", - " 'page': 0,\n", - " 'unit': 'POINTS',\n", - " 'classifications': [],\n", - " 'bbox': {'top': 898.0, 'left': 557.0, 'height': 242.0, 'width': 295.0}},\n", - " {'name': 'text',\n", - " 'answer': 'the answer to the text question',\n", - " 'uuid': '35588dd3-9050-4471-ac13-a8ed6ac1fd4a',\n", - " 'dataRow': {'id': 'cl7hrmmi90m5c0y6h5nk00u02'}},\n", - " {'name': 'checklist',\n", - " 'uuid': '38bc716c-d199-4e6e-bd31-0d2128716444',\n", - " 'dataRow': {'id': 'cl7hrmmi90m5c0y6h5nk00u02'},\n", - " 'answer': [{'name': 'first_checklist_answer'},\n", - " {'name': 'second_checklist_answer'}]},\n", - " {'name': 'radio',\n", - " 'answer': {'name': 'second_radio_answer'},\n", - " 'uuid': '8a21a248-9669-4465-b0c0-5a8d75903c8f',\n", - " 'dataRow': {'id': 'cl7hrmmi90m5c0y6h5nk00u02'}}]" + "cell_type": "markdown", + "metadata": { + "id": "e7e5d296" + }, + "source": [ + "* Notes:\n", + " * Wait until the import job is complete before opening the Editor to make sure all annotations are imported properly." ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Convert our label from a Labelbox class object to the underlying NDJSON format required for upload - uploads can be directly built in this syntax as well\n", - "mal_ndjson = list(NDJsonConverter.serialize([label]))\n", - "mal_ndjson" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "787524ac", - "metadata": {}, - "outputs": [], - "source": [ - "# Upload our label using Model-Assisted Labeling\n", - "upload_job = MALPredictionImport.create_from_objects(\n", - " client = client, \n", - " project_id = mal_project.uid, \n", - " name=\"mal_job\", \n", - " predictions=mal_ndjson)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "eb5116f9", - "metadata": {}, - "outputs": [ + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8d8d554f" + }, + "outputs": [], + "source": [ + "!pip install -q 'labelbox[data]'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "573525c5" + }, + "source": [ + "# Imports" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Errors: []\n" - ] + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e3522d4b" + }, + "outputs": [], + "source": [ + "from labelbox.schema.ontology import OntologyBuilder, Tool, Classification, Option\n", + "from labelbox import Client, LabelingFrontend, MALPredictionImport\n", + "from labelbox.data.annotation_types import (\n", + " Label, ImageData, ObjectAnnotation, \n", + " Rectangle, Point,\n", + " Radio, Checklist, Text,\n", + " ClassificationAnnotation, ClassificationAnswer\n", + ")\n", + "from labelbox.data.serialization import NDJsonConverter\n", + "from labelbox.schema.media_type import MediaType\n", + "import uuid\n", + "from uuid import uuid4\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e3036435" + }, + "source": [ + "# API Key and Client\n", + "Provide a valid api key below in order to properly connect to the Labelbox Client." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6ffec272" + }, + "outputs": [], + "source": [ + "# Add your api key\n", + "API_KEY = \"\"\n", + "client = Client(api_key=API_KEY)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "70c4d820" + }, + "source": [ + "---- \n", + "### Steps\n", + "1. Make sure project is setup\n", + "2. Collect annotations\n", + "3. Upload" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5d7fc082" + }, + "source": [ + "### Step 1: Project setup" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d0bc3c39" + }, + "source": [ + "First, we create an ontology with all the possible tools and classifications supported for PDF. The official list of supported annotations to import can be found here:\n", + "- [Model-Assisted Labeling](https://docs.labelbox.com/docs/model-assisted-labeling) (annotations/labels are not submitted)\n", + "- [PDF Annotations](https://docs.labelbox.com/docs/document-annotations)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f93aebd8" + }, + "outputs": [], + "source": [ + "tool_name = \"super\"\n", + "ontology_builder = OntologyBuilder(\n", + " tools=[ \n", + " Tool( # Entity tool given the name \"NER\"\n", + " tool=Tool.Type.NER, \n", + " name= tool_name)]\n", + " )" + ] + }, + { + "cell_type": "code", + "source": [ + "ontology = client.create_ontology(\"pdf-entity-import-ontology\", ontology_builder.asdict())" + ], + "metadata": { + "id": "I99PTpgwXplA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Create Labelbox project\n", + "mal_project = client.create_project(name=\"pdf_entity_import\", media_type=MediaType.Document)\n", + "\n", + "# Create one Labelbox dataset\n", + "dataset = client.create_dataset(name=\"pdf_entity_import_dataset\")\n", + "\n", + "# Grab an example pdf and create a Labelbox data row\n", + "asset = [\n", + " {\n", + " \"row_data\": {\n", + " \"pdf_url\": \"https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483.pdf\",\n", + " \"text_layer_url\": \"https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json\"\n", + " }\n", + " }\n", + "]\n", + "\n", + "task = dataset.create_data_rows(asset)\n", + "task.wait_till_done()\n", + "print(task.errors)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jm1EijjWX2kd", + "outputId": "00579957-0864-4902-9f9d-7767df7e6680" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:labelbox.client:Default createProject behavior will soon be adjusted to prefer batch projects. Pass in `queue_mode` parameter explicitly to opt-out for the time being.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "None\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "data_row_id = next(dataset.data_rows())\n", + "# Connect your ontology and editor to your MAL project\n", + "mal_project.setup_editor(ontology) \n", + "# Connect your dataset to your MAL project\n", + "batch = mal_project.create_batch('test-batch_' + str(uuid4()), [data_row_id] , 5)" + ], + "metadata": { + "id": "mAilP1_bZp39" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Step 2: create entity annotation" + ], + "metadata": { + "id": "6uDOpm_1asBV" + } + }, + { + "cell_type": "code", + "source": [ + "ANNOTATION = {\n", + " \"uuid\": str(uuid.uuid4()),\n", + " \"name\" : tool_name,\n", + " \"dataRow\": {\"id\": data_row_id.uid},\n", + " \"textSelections\": [\n", + " {\n", + " \"tokenIds\": [\n", + " \"521f705e-b276-4ac7-8e5b-2e38e037f80f\", # superconductivity\n", + " ],\n", + " \"groupId\": \"ed53dd86-ef39-4634-9505-ee0eebedef44\",\n", + " \"page\": 1,\n", + " }\n", + " ],\n", + "}" + ], + "metadata": { + "id": "YJiZANnynGcX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Step 3 upload annotations" + ], + "metadata": { + "id": "JfDPhY2DriLn" + } + }, + { + "cell_type": "code", + "source": [ + "task = MALPredictionImport.create_from_objects(client, mal_project.uid, str(uuid.uuid4()), [ANNOTATION])" + ], + "metadata": { + "id": "T6btKgt7gF-m" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "task.wait_until_done()" + ], + "metadata": { + "id": "SaGlosSCktD7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(task.errors)\n", + "print(task.statuses)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kLux4NWrk2eT", + "outputId": "fdfcc68c-c155-4ace-90dc-4698a967edc9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[]\n", + "[{'uuid': 'dffe7d75-1c61-4dcd-a0d4-804438cf540b', 'dataRow': {'id': 'clb5cdf1c6u0a077d87kffy28'}, 'status': 'SUCCESS'}]\n" + ] + } + ] } - ], - "source": [ - "# Errors will appear for each annotation that failed.\n", - "# Empty list means that there were no errors\n", - "# This will provide information only after the upload_job is complete, so we do not need to worry about having to rerun\n", - "print(\"Errors:\", upload_job.errors)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + ] }